feat(text-splitters): add optional custom header pattern support (#31887)

## Description This PR adds support for custom header patterns in `MarkdownHeaderTextSplitter`, allowing users to define non-standard Markdown header formats (like `**Header**`) and specify their hierarchy levels. **Issue:** Fixes #22738 **Dependencies:** None - this change has no new dependencies **Key Changes:** - Added optional `custom_header_patterns` parameter to support non-standard header formats - Enable splitting on patterns like `**Header**` and `***Header***` - Maintain full backward compatibility with existing usage - Added comprehensive tests for custom and mixed header scenarios ## Example Usage ```python from langchain_text_splitters import MarkdownHeaderTextSplitter headers_to_split_on = [ ("**", "Chapter"), ("***", "Section"), ] custom_header_patterns = { "**": 1, # Level 1 headers "***": 2, # Level 2 headers } splitter = MarkdownHeaderTextSplitter( headers_to_split_on=headers_to_split_on, custom_header_patterns=custom_header_patterns, ) # Now **Chapter 1** is treated as a level 1 header # And ***Section 1.1*** is treated as a level 2 header ``` ## Testing - ✅ Added unit tests for custom header patterns - ✅ Added tests for mixed standard and custom headers - ✅ All existing tests pass (backward compatibility maintained) - ✅ Linting and formatting checks pass --- The implementation provides a flexible solution while maintaining the simplicity of the existing API. Users can continue using the splitter exactly as before, with the new functionality being entirely opt-in through the `custom_header_patterns` parameter. --------- Co-authored-by: Mason Daugherty <mason@langchain.dev> Co-authored-by: Claude <noreply@anthropic.com>
2025-08-22 10:59:22 +00:00 · 2025-08-18 10:10:49 -04:00 · 2025-08-18 10:10:49 -04:00 · 03138f41a0
commit 03138f41a0
parent fd891ee3d4
2 changed files with 181 additions and 7 deletions
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@ -1,7 +1,7 @@
 from __future__ import annotations

 import re
-from typing import Any, TypedDict, Union
+from typing import Any, Optional, TypedDict, Union

 from langchain_core.documents import Document

@ -26,6 +26,7 @@ class MarkdownHeaderTextSplitter:
        headers_to_split_on: list[tuple[str, str]],
        return_each_line: bool = False,  # noqa: FBT001,FBT002
        strip_headers: bool = True,  # noqa: FBT001,FBT002
+        custom_header_patterns: Optional[dict[str, int]] = None,
    ) -> None:
        """Create a new MarkdownHeaderTextSplitter.

@ -33,6 +34,9 @@ class MarkdownHeaderTextSplitter:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
            strip_headers: Strip split headers from the content of the chunk
+            custom_header_patterns: Optional dict mapping header patterns to their
+                levels. For example: {"**": 1, "***": 2} to treat **Header** as
+                level 1 and ***Header*** as level 2 headers.
        """
        # Output line-by-line or aggregated into chunks w/ common headers
        self.return_each_line = return_each_line
@ -43,6 +47,39 @@ class MarkdownHeaderTextSplitter:
        )
        # Strip headers split headers from the content of the chunk
        self.strip_headers = strip_headers
+        # Custom header patterns with their levels
+        self.custom_header_patterns = custom_header_patterns or {}
+
+    def _is_custom_header(self, line: str, sep: str) -> bool:
+        """Check if line matches a custom header pattern.
+
+        Args:
+            line: The line to check
+            sep: The separator pattern to match
+
+        Returns:
+            True if the line matches the custom pattern format
+        """
+        if sep not in self.custom_header_patterns:
+            return False
+
+        # Escape special regex characters in the separator
+        escaped_sep = re.escape(sep)
+        # Create regex pattern to match exactly one separator at start and end
+        # with content in between
+        pattern = (
+            f"^{escaped_sep}(?!{escaped_sep})(.+?)(?<!{escaped_sep}){escaped_sep}$"
+        )
+
+        match = re.match(pattern, line)
+        if match:
+            # Extract the content between the patterns
+            content = match.group(1).strip()
+            # Valid header if there's actual content (not just whitespace or separators)
+            # Check that content doesn't consist only of separator characters
+            if content and not all(c in sep for c in content.replace(" ", "")):
+                return True
+        return False

    def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
        """Combine lines with common metadata into chunks.
@ -131,16 +168,22 @@ class MarkdownHeaderTextSplitter:

            # Check each line against each of the header types (e.g., #, ##)
            for sep, name in self.headers_to_split_on:
-                # Check if line starts with a header that we intend to split on
-                if stripped_line.startswith(sep) and (
+                is_standard_header = stripped_line.startswith(sep) and (
                    # Header with no text OR header is followed by space
                    # Both are valid conditions that sep is being used a header
                    len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
-                ):
+                )
+                is_custom_header = self._is_custom_header(stripped_line, sep)
+
+                # Check if line matches either standard or custom header pattern
+                if is_standard_header or is_custom_header:
                    # Ensure we are tracking the header as metadata
                    if name is not None:
                        # Get the current header level
-                        current_header_level = sep.count("#")
+                        if sep in self.custom_header_patterns:
+                            current_header_level = self.custom_header_patterns[sep]
+                        else:
+                            current_header_level = sep.count("#")

                        # Pop out headers of lower or same level from the stack
                        while (
@ -156,10 +199,20 @@ class MarkdownHeaderTextSplitter:
                                initial_metadata.pop(popped_header["name"])

                        # Push the current header to the stack
+                        # Extract header text based on header type
+                        if is_custom_header:
+                            # For custom headers like **Header**, extract text
+                            # between patterns
+                            header_text = stripped_line[len(sep) : -len(sep)].strip()
+                        else:
+                            # For standard headers like # Header, extract text
+                            # after the separator
+                            header_text = stripped_line[len(sep) :].strip()
+
                        header: HeaderType = {
                            "level": current_header_level,
                            "name": name,
-                            "data": stripped_line[len(sep) :].strip(),
+                            "data": header_text,
                        }
                        header_stack.append(header)
                        # Update initial_metadata with the current header
@ -288,7 +341,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
        chunks based on specified headers and formatting preferences.

        Args:
-            headers_to_split_on (Union[List[Tuple[str, str]], None]):
+            headers_to_split_on (Union[list[tuple[str, str]], None]):
                A list of tuples, where each tuple contains a header tag (e.g., "h1")
                and its corresponding metadata key. If None, default headers are used.
            return_each_line (bool):
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@ -1465,6 +1465,127 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N
    assert output == expected_output


+def test_md_header_text_splitter_with_custom_headers() -> None:
+    """Test markdown splitter with custom header patterns like **Header**."""
+    markdown_document = """**Chapter 1**
+
+This is the content for chapter 1.
+
+***Section 1.1***
+
+This is the content for section 1.1.
+
+**Chapter 2**
+
+This is the content for chapter 2.
+
+***Section 2.1***
+
+This is the content for section 2.1.
+"""
+
+    headers_to_split_on = [
+        ("**", "Bold Header"),
+        ("***", "Bold Italic Header"),
+    ]
+
+    custom_header_patterns = {
+        "**": 1,  # Level 1 headers
+        "***": 2,  # Level 2 headers
+    }
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+        custom_header_patterns=custom_header_patterns,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="This is the content for chapter 1.",
+            metadata={"Bold Header": "Chapter 1"},
+        ),
+        Document(
+            page_content="This is the content for section 1.1.",
+            metadata={"Bold Header": "Chapter 1", "Bold Italic Header": "Section 1.1"},
+        ),
+        Document(
+            page_content="This is the content for chapter 2.",
+            metadata={"Bold Header": "Chapter 2"},
+        ),
+        Document(
+            page_content="This is the content for section 2.1.",
+            metadata={"Bold Header": "Chapter 2", "Bold Italic Header": "Section 2.1"},
+        ),
+    ]
+
+    assert output == expected_output
+
+
+def test_md_header_text_splitter_mixed_headers() -> None:
+    """Test markdown splitter with both standard and custom headers."""
+    markdown_document = """# Standard Header 1
+
+Content under standard header.
+
+**Custom Header 1**
+
+Content under custom header.
+
+## Standard Header 2
+
+Content under standard header 2.
+
+***Custom Header 2***
+
+Content under custom header 2.
+"""
+
+    headers_to_split_on = [
+        ("#", "Header 1"),
+        ("##", "Header 2"),
+        ("**", "Bold Header"),
+        ("***", "Bold Italic Header"),
+    ]
+
+    custom_header_patterns = {
+        "**": 1,  # Same level as #
+        "***": 2,  # Same level as ##
+    }
+
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+        custom_header_patterns=custom_header_patterns,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="Content under standard header.",
+            metadata={"Header 1": "Standard Header 1"},
+        ),
+        Document(
+            page_content="Content under custom header.",
+            metadata={"Bold Header": "Custom Header 1"},
+        ),
+        Document(
+            page_content="Content under standard header 2.",
+            metadata={
+                "Bold Header": "Custom Header 1",
+                "Header 2": "Standard Header 2",
+            },
+        ),
+        Document(
+            page_content="Content under custom header 2.",
+            metadata={
+                "Bold Header": "Custom Header 1",
+                "Bold Italic Header": "Custom Header 2",
+            },
+        ),
+    ]
+
+    assert output == expected_output
+
+
 EXPERIMENTAL_MARKDOWN_DOCUMENT = (
    "# My Header 1\n"
    "Content for header 1\n"