From 03138f41a067de0f0d17e59902731e337010af1c Mon Sep 17 00:00:00 2001 From: Keyu Chen <54015474+keyuchen21@users.noreply.github.com> Date: Mon, 18 Aug 2025 10:10:49 -0400 Subject: [PATCH] feat(text-splitters): add optional custom header pattern support (#31887) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description This PR adds support for custom header patterns in `MarkdownHeaderTextSplitter`, allowing users to define non-standard Markdown header formats (like `**Header**`) and specify their hierarchy levels. **Issue:** Fixes #22738 **Dependencies:** None - this change has no new dependencies **Key Changes:** - Added optional `custom_header_patterns` parameter to support non-standard header formats - Enable splitting on patterns like `**Header**` and `***Header***` - Maintain full backward compatibility with existing usage - Added comprehensive tests for custom and mixed header scenarios ## Example Usage ```python from langchain_text_splitters import MarkdownHeaderTextSplitter headers_to_split_on = [ ("**", "Chapter"), ("***", "Section"), ] custom_header_patterns = { "**": 1, # Level 1 headers "***": 2, # Level 2 headers } splitter = MarkdownHeaderTextSplitter( headers_to_split_on=headers_to_split_on, custom_header_patterns=custom_header_patterns, ) # Now **Chapter 1** is treated as a level 1 header # And ***Section 1.1*** is treated as a level 2 header ``` ## Testing - ✅ Added unit tests for custom header patterns - ✅ Added tests for mixed standard and custom headers - ✅ All existing tests pass (backward compatibility maintained) - ✅ Linting and formatting checks pass --- The implementation provides a flexible solution while maintaining the simplicity of the existing API. Users can continue using the splitter exactly as before, with the new functionality being entirely opt-in through the `custom_header_patterns` parameter. --------- Co-authored-by: Mason Daugherty Co-authored-by: Claude --- .../langchain_text_splitters/markdown.py | 67 +++++++++- .../tests/unit_tests/test_text_splitters.py | 121 ++++++++++++++++++ 2 files changed, 181 insertions(+), 7 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/markdown.py b/libs/text-splitters/langchain_text_splitters/markdown.py index e5ea4ff030a..18f8be72bd7 100644 --- a/libs/text-splitters/langchain_text_splitters/markdown.py +++ b/libs/text-splitters/langchain_text_splitters/markdown.py @@ -1,7 +1,7 @@ from __future__ import annotations import re -from typing import Any, TypedDict, Union +from typing import Any, Optional, TypedDict, Union from langchain_core.documents import Document @@ -26,6 +26,7 @@ class MarkdownHeaderTextSplitter: headers_to_split_on: list[tuple[str, str]], return_each_line: bool = False, # noqa: FBT001,FBT002 strip_headers: bool = True, # noqa: FBT001,FBT002 + custom_header_patterns: Optional[dict[str, int]] = None, ) -> None: """Create a new MarkdownHeaderTextSplitter. @@ -33,6 +34,9 @@ class MarkdownHeaderTextSplitter: headers_to_split_on: Headers we want to track return_each_line: Return each line w/ associated headers strip_headers: Strip split headers from the content of the chunk + custom_header_patterns: Optional dict mapping header patterns to their + levels. For example: {"**": 1, "***": 2} to treat **Header** as + level 1 and ***Header*** as level 2 headers. """ # Output line-by-line or aggregated into chunks w/ common headers self.return_each_line = return_each_line @@ -43,6 +47,39 @@ class MarkdownHeaderTextSplitter: ) # Strip headers split headers from the content of the chunk self.strip_headers = strip_headers + # Custom header patterns with their levels + self.custom_header_patterns = custom_header_patterns or {} + + def _is_custom_header(self, line: str, sep: str) -> bool: + """Check if line matches a custom header pattern. + + Args: + line: The line to check + sep: The separator pattern to match + + Returns: + True if the line matches the custom pattern format + """ + if sep not in self.custom_header_patterns: + return False + + # Escape special regex characters in the separator + escaped_sep = re.escape(sep) + # Create regex pattern to match exactly one separator at start and end + # with content in between + pattern = ( + f"^{escaped_sep}(?!{escaped_sep})(.+?)(? list[Document]: """Combine lines with common metadata into chunks. @@ -131,16 +168,22 @@ class MarkdownHeaderTextSplitter: # Check each line against each of the header types (e.g., #, ##) for sep, name in self.headers_to_split_on: - # Check if line starts with a header that we intend to split on - if stripped_line.startswith(sep) and ( + is_standard_header = stripped_line.startswith(sep) and ( # Header with no text OR header is followed by space # Both are valid conditions that sep is being used a header len(stripped_line) == len(sep) or stripped_line[len(sep)] == " " - ): + ) + is_custom_header = self._is_custom_header(stripped_line, sep) + + # Check if line matches either standard or custom header pattern + if is_standard_header or is_custom_header: # Ensure we are tracking the header as metadata if name is not None: # Get the current header level - current_header_level = sep.count("#") + if sep in self.custom_header_patterns: + current_header_level = self.custom_header_patterns[sep] + else: + current_header_level = sep.count("#") # Pop out headers of lower or same level from the stack while ( @@ -156,10 +199,20 @@ class MarkdownHeaderTextSplitter: initial_metadata.pop(popped_header["name"]) # Push the current header to the stack + # Extract header text based on header type + if is_custom_header: + # For custom headers like **Header**, extract text + # between patterns + header_text = stripped_line[len(sep) : -len(sep)].strip() + else: + # For standard headers like # Header, extract text + # after the separator + header_text = stripped_line[len(sep) :].strip() + header: HeaderType = { "level": current_header_level, "name": name, - "data": stripped_line[len(sep) :].strip(), + "data": header_text, } header_stack.append(header) # Update initial_metadata with the current header @@ -288,7 +341,7 @@ class ExperimentalMarkdownSyntaxTextSplitter: chunks based on specified headers and formatting preferences. Args: - headers_to_split_on (Union[List[Tuple[str, str]], None]): + headers_to_split_on (Union[list[tuple[str, str]], None]): A list of tuples, where each tuple contains a header tag (e.g., "h1") and its corresponding metadata key. If None, default headers are used. return_each_line (bool): diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 2ed35e3c13e..5ecca3bdf71 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -1465,6 +1465,127 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N assert output == expected_output +def test_md_header_text_splitter_with_custom_headers() -> None: + """Test markdown splitter with custom header patterns like **Header**.""" + markdown_document = """**Chapter 1** + +This is the content for chapter 1. + +***Section 1.1*** + +This is the content for section 1.1. + +**Chapter 2** + +This is the content for chapter 2. + +***Section 2.1*** + +This is the content for section 2.1. +""" + + headers_to_split_on = [ + ("**", "Bold Header"), + ("***", "Bold Italic Header"), + ] + + custom_header_patterns = { + "**": 1, # Level 1 headers + "***": 2, # Level 2 headers + } + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, + custom_header_patterns=custom_header_patterns, + ) + output = markdown_splitter.split_text(markdown_document) + + expected_output = [ + Document( + page_content="This is the content for chapter 1.", + metadata={"Bold Header": "Chapter 1"}, + ), + Document( + page_content="This is the content for section 1.1.", + metadata={"Bold Header": "Chapter 1", "Bold Italic Header": "Section 1.1"}, + ), + Document( + page_content="This is the content for chapter 2.", + metadata={"Bold Header": "Chapter 2"}, + ), + Document( + page_content="This is the content for section 2.1.", + metadata={"Bold Header": "Chapter 2", "Bold Italic Header": "Section 2.1"}, + ), + ] + + assert output == expected_output + + +def test_md_header_text_splitter_mixed_headers() -> None: + """Test markdown splitter with both standard and custom headers.""" + markdown_document = """# Standard Header 1 + +Content under standard header. + +**Custom Header 1** + +Content under custom header. + +## Standard Header 2 + +Content under standard header 2. + +***Custom Header 2*** + +Content under custom header 2. +""" + + headers_to_split_on = [ + ("#", "Header 1"), + ("##", "Header 2"), + ("**", "Bold Header"), + ("***", "Bold Italic Header"), + ] + + custom_header_patterns = { + "**": 1, # Same level as # + "***": 2, # Same level as ## + } + + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, + custom_header_patterns=custom_header_patterns, + ) + output = markdown_splitter.split_text(markdown_document) + + expected_output = [ + Document( + page_content="Content under standard header.", + metadata={"Header 1": "Standard Header 1"}, + ), + Document( + page_content="Content under custom header.", + metadata={"Bold Header": "Custom Header 1"}, + ), + Document( + page_content="Content under standard header 2.", + metadata={ + "Bold Header": "Custom Header 1", + "Header 2": "Standard Header 2", + }, + ), + Document( + page_content="Content under custom header 2.", + metadata={ + "Bold Header": "Custom Header 1", + "Bold Italic Header": "Custom Header 2", + }, + ), + ] + + assert output == expected_output + + EXPERIMENTAL_MARKDOWN_DOCUMENT = ( "# My Header 1\n" "Content for header 1\n"