feat(text-splitters): add optional custom header pattern support (#31887)

## Description

This PR adds support for custom header patterns in `MarkdownHeaderTextSplitter`, allowing users to define non-standard Markdown header formats (like `**Header**`) and specify their hierarchy levels.

**Issue:** Fixes #22738

**Dependencies:** None - this change has no new dependencies

**Key Changes:**

- Added optional `custom_header_patterns` parameter to support non-standard header formats
- Enable splitting on patterns like `**Header**` and `***Header***`
- Maintain full backward compatibility with existing usage
- Added comprehensive tests for custom and mixed header scenarios

## Example Usage

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("**", "Chapter"),
    ("***", "Section"),
]

custom_header_patterns = {
    "**": 1,  # Level 1 headers
    "***": 2,  # Level 2 headers
}

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    custom_header_patterns=custom_header_patterns,
)

# Now **Chapter 1** is treated as a level 1 header
# And ***Section 1.1*** is treated as a level 2 header
```

## Testing

- ✅ Added unit tests for custom header patterns
- ✅ Added tests for mixed standard and custom headers
- ✅ All existing tests pass (backward compatibility maintained)
- ✅ Linting and formatting checks pass

---

The implementation provides a flexible solution while maintaining the simplicity of the existing API. Users can continue using the splitter exactly as before, with the new functionality being entirely opt-in through the `custom_header_patterns` parameter.

---------

Co-authored-by: Mason Daugherty <mason@langchain.dev>
Co-authored-by: Claude <noreply@anthropic.com>
2025-08-24 20:12:11 +00:00 · 2025-08-18 10:10:49 -04:00 · 2025-08-18 10:10:49 -04:00 · 03138f41a0
commit 03138f41a0
parent fd891ee3d4
2 changed files with 181 additions and 7 deletions
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@ -1,7 +1,7 @@
 from __future__ import annotations
 import re
-from typing import Any, TypedDict, Union
+from typing import Any, Optional, TypedDict, Union
 from langchain_core.documents import Document
@ -26,6 +26,7 @@ class MarkdownHeaderTextSplitter:
        headers_to_split_on: list[tuple[str, str]],
        return_each_line: bool = False,  # noqa: FBT001,FBT002
        strip_headers: bool = True,  # noqa: FBT001,FBT002
        custom_header_patterns: Optional[dict[str, int]] = None,
    ) -> None:
        """Create a new MarkdownHeaderTextSplitter.
@ -33,6 +34,9 @@ class MarkdownHeaderTextSplitter:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
            strip_headers: Strip split headers from the content of the chunk
            custom_header_patterns: Optional dict mapping header patterns to their
                levels. For example: {"**": 1, "***": 2} to treat **Header** as
                level 1 and ***Header*** as level 2 headers.
        """
        # Output line-by-line or aggregated into chunks w/ common headers
        self.return_each_line = return_each_line
@ -43,6 +47,39 @@ class MarkdownHeaderTextSplitter:
        )
        # Strip split headers from the content of the chunk
        self.strip_headers = strip_headers
        # Custom header patterns with their levels
        self.custom_header_patterns = custom_header_patterns or {}
    def _is_custom_header(self, line: str, sep: str) -> bool:
        """Check if line matches a custom header pattern.
        Args:
            line: The line to check
            sep: The separator pattern to match
        Returns:
            True if the line matches the custom pattern format
        """
        if sep not in self.custom_header_patterns:
            return False
        # Escape special regex characters in the separator
        escaped_sep = re.escape(sep)
        # Create regex pattern to match exactly one separator at start and end
        # with content in between
        pattern = (
            f"^{escaped_sep}(?!{escaped_sep})(.+?)(?<!{escaped_sep}){escaped_sep}$"
        )
        match = re.match(pattern, line)
        if match:
            # Extract the content between the patterns
            content = match.group(1).strip()
            # Valid header if there's actual content (not just whitespace or separators)
            # Check that content doesn't consist only of separator characters
            if content and not all(c in sep for c in content.replace(" ", "")):
                return True
        return False
    def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
        """Combine lines with common metadata into chunks.
@ -131,16 +168,22 @@ class MarkdownHeaderTextSplitter:
            # Check each line against each of the header types (e.g., #, ##)
            for sep, name in self.headers_to_split_on:
-                # Check if line starts with a header that we intend to split on
+                is_standard_header = stripped_line.startswith(sep) and (
                if stripped_line.startswith(sep) and (
                    # Header with no text OR header is followed by space
                    # Both are valid conditions that sep is being used as a header
                    len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
-                ):
+                )
                is_custom_header = self._is_custom_header(stripped_line, sep)
                # Check if line matches either standard or custom header pattern
                if is_standard_header or is_custom_header:
                    # Ensure we are tracking the header as metadata
                    if name is not None:
                        # Get the current header level
-                        current_header_level = sep.count("#")
+                        if sep in self.custom_header_patterns:
                            current_header_level = self.custom_header_patterns[sep]
                        else:
                            current_header_level = sep.count("#")
                        # Pop out headers of lower or same level from the stack
                        while (
@ -156,10 +199,20 @@ class MarkdownHeaderTextSplitter:
                                initial_metadata.pop(popped_header["name"])
                        # Push the current header to the stack
                        # Extract header text based on header type
                        if is_custom_header:
                            # For custom headers like **Header**, extract text
                            # between patterns
                            header_text = stripped_line[len(sep) : -len(sep)].strip()
                        else:
                            # For standard headers like # Header, extract text
                            # after the separator
                            header_text = stripped_line[len(sep) :].strip()
                        header: HeaderType = {
                            "level": current_header_level,
                            "name": name,
-                            "data": stripped_line[len(sep) :].strip(),
+                            "data": header_text,
                        }
                        header_stack.append(header)
                        # Update initial_metadata with the current header
@ -288,7 +341,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
        chunks based on specified headers and formatting preferences.
        Args:
-            headers_to_split_on (Union[List[Tuple[str, str]], None]):
+            headers_to_split_on (Union[list[tuple[str, str]], None]):
                A list of tuples, where each tuple contains a header tag (e.g., "h1")
                and its corresponding metadata key. If None, default headers are used.
            return_each_line (bool):
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@ -1465,6 +1465,127 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N
    assert output == expected_output
 def test_md_header_text_splitter_with_custom_headers() -> None:
    """Check splitting on custom ``**Header**`` / ``***Header***`` patterns."""
    source_text = """**Chapter 1**
 This is the content for chapter 1.
 ***Section 1.1***
 This is the content for section 1.1.
 **Chapter 2**
 This is the content for chapter 2.
 ***Section 2.1***
 This is the content for section 2.1.
 """
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("**", "Bold Header"),
            ("***", "Bold Italic Header"),
        ],
        # "**" is registered as level 1 and "***" as level 2.
        custom_header_patterns={"**": 1, "***": 2},
    )
    # (page content, metadata) for each chunk, in document order.
    expected_chunks = [
        (
            "This is the content for chapter 1.",
            {"Bold Header": "Chapter 1"},
        ),
        (
            "This is the content for section 1.1.",
            {"Bold Header": "Chapter 1", "Bold Italic Header": "Section 1.1"},
        ),
        (
            "This is the content for chapter 2.",
            {"Bold Header": "Chapter 2"},
        ),
        (
            "This is the content for section 2.1.",
            {"Bold Header": "Chapter 2", "Bold Italic Header": "Section 2.1"},
        ),
    ]
    expected_output = [
        Document(page_content=text, metadata=meta) for text, meta in expected_chunks
    ]
    assert splitter.split_text(source_text) == expected_output
 def test_md_header_text_splitter_mixed_headers() -> None:
    """Check splitting of a document mixing ``#`` headers and custom headers."""
    source_text = """# Standard Header 1
 Content under standard header.
 **Custom Header 1**
 Content under custom header.
 ## Standard Header 2
 Content under standard header 2.
 ***Custom Header 2***
 Content under custom header 2.
 """
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("**", "Bold Header"),
            ("***", "Bold Italic Header"),
        ],
        # "**" is given the same level as "#", and "***" the same as "##".
        custom_header_patterns={"**": 1, "***": 2},
    )
    # (page content, metadata) for each chunk, in document order.
    expected_chunks = [
        (
            "Content under standard header.",
            {"Header 1": "Standard Header 1"},
        ),
        (
            "Content under custom header.",
            {"Bold Header": "Custom Header 1"},
        ),
        (
            "Content under standard header 2.",
            {
                "Bold Header": "Custom Header 1",
                "Header 2": "Standard Header 2",
            },
        ),
        (
            "Content under custom header 2.",
            {
                "Bold Header": "Custom Header 1",
                "Bold Italic Header": "Custom Header 2",
            },
        ),
    ]
    expected_output = [
        Document(page_content=text, metadata=meta) for text, meta in expected_chunks
    ]
    assert splitter.split_text(source_text) == expected_output
 EXPERIMENTAL_MARKDOWN_DOCUMENT = (
    "# My Header 1\n"
    "Content for header 1\n"