From 03138f41a067de0f0d17e59902731e337010af1c Mon Sep 17 00:00:00 2001
From: Keyu Chen <54015474+keyuchen21@users.noreply.github.com>
Date: Mon, 18 Aug 2025 10:10:49 -0400
Subject: [PATCH] feat(text-splitters): add optional custom header pattern
 support (#31887)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Description

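This PR adds support for custom header patterns in
`MarkdownHeaderTextSplitter`, allowing users to define non-standard
Markdown header formats (like `**Header**`) and specify their hierarchy
levels. A custom pattern is a delimiter that must wrap the header text on
both sides of the line (e.g. `**Header**`), and it only takes effect when
the same delimiter is also listed in `headers_to_split_on`.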

**Issue:** Fixes #22738

**Dependencies:** None - this change has no new dependencies

**Key Changes:**
- Added an optional `custom_header_patterns` parameter to support
  non-standard header formats
- Enabled splitting on patterns like `**Header**` and `***Header***`
- Maintained full backward compatibility with existing usage
- Added comprehensive tests for custom and mixed header scenarios

## Example Usage

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("**", "Chapter"),
    ("***", "Section"),
]

custom_header_patterns = {
    "**": 1,   # Level 1 headers
    "***": 2,  # Level 2 headers
}

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    custom_header_patterns=custom_header_patterns,
)

# Now **Chapter 1** is treated as a level 1 header
# And ***Section 1.1*** is treated as a level 2 header
```

## Testing

- ✅ Added unit tests for custom header patterns
- ✅ Added tests for mixed standard and custom headers
- ✅ All existing tests pass (backward compatibility maintained)
- ✅ Linting and formatting checks pass

* * *

The implementation provides a flexible solution while maintaining the
simplicity of the existing API. Users can continue using the splitter
exactly as before, with the new functionality being entirely opt-in
through the `custom_header_patterns` parameter.

---------

Co-authored-by: Mason Daugherty <mason@langchain.dev>
Co-authored-by: Claude <noreply@anthropic.com>
---
 .../langchain_text_splitters/markdown.py      |  67 +++++++++-
 .../tests/unit_tests/test_text_splitters.py   | 121 ++++++++++++++++++
 2 files changed, 181 insertions(+), 7 deletions(-)

diff --git a/libs/text-splitters/langchain_text_splitters/markdown.py b/libs/text-splitters/langchain_text_splitters/markdown.py
index e5ea4ff030a..18f8be72bd7 100644
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import re
-from typing import Any, TypedDict, Union
+from typing import Any, Optional, TypedDict, Union
 
 from langchain_core.documents import Document
 
@@ -26,6 +26,7 @@ class MarkdownHeaderTextSplitter:
         headers_to_split_on: list[tuple[str, str]],
         return_each_line: bool = False,  # noqa: FBT001,FBT002
         strip_headers: bool = True,  # noqa: FBT001,FBT002
+        custom_header_patterns: Optional[dict[str, int]] = None,
     ) -> None:
         """Create a new MarkdownHeaderTextSplitter.
 
@@ -33,6 +34,9 @@ class MarkdownHeaderTextSplitter:
             headers_to_split_on: Headers we want to track
             return_each_line: Return each line w/ associated headers
             strip_headers: Strip split headers from the content of the chunk
+            custom_header_patterns: Optional dict mapping header patterns to their
+                levels. For example: {"**": 1, "***": 2} to treat **Header** as
+                level 1 and ***Header*** as level 2 headers.
         """
         # Output line-by-line or aggregated into chunks w/ common headers
         self.return_each_line = return_each_line
@@ -43,6 +47,39 @@ class MarkdownHeaderTextSplitter:
         )
         # Strip headers split headers from the content of the chunk
         self.strip_headers = strip_headers
+        # Custom header patterns with their levels
+        self.custom_header_patterns = custom_header_patterns or {}
+
+    def _is_custom_header(self, line: str, sep: str) -> bool:
+        """Check if line matches a custom header pattern.
+
+        Args:
+            line: The line to check
+            sep: The separator pattern to match
+
+        Returns:
+            True if the line matches the custom pattern format
+        """
+        if sep not in self.custom_header_patterns:
+            return False
+
+        # Escape special regex characters in the separator
+        escaped_sep = re.escape(sep)
+        # Create regex pattern to match exactly one separator at start and end
+        # with content in between
+        pattern = (
+            f"^{escaped_sep}(?!{escaped_sep})(.+?)(?<!{escaped_sep}){escaped_sep}$"
+        )
+
+        match = re.match(pattern, line)
+        if match:
+            # Extract the content between the patterns
+            content = match.group(1).strip()
+            # Valid header if there's actual content (not just whitespace or separators)
+            # Check that content doesn't consist only of separator characters
+            if content and not all(c in sep for c in content.replace(" ", "")):
+                return True
+        return False
 
     def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
         """Combine lines with common metadata into chunks.
@@ -131,16 +168,22 @@ class MarkdownHeaderTextSplitter:
 
             # Check each line against each of the header types (e.g., #, ##)
             for sep, name in self.headers_to_split_on:
-                # Check if line starts with a header that we intend to split on
-                if stripped_line.startswith(sep) and (
+                is_standard_header = stripped_line.startswith(sep) and (
                     # Header with no text OR header is followed by space
                     # Both are valid conditions that sep is being used a header
                     len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
-                ):
+                )
+                is_custom_header = self._is_custom_header(stripped_line, sep)
+
+                # Check if line matches either standard or custom header pattern
+                if is_standard_header or is_custom_header:
                     # Ensure we are tracking the header as metadata
                     if name is not None:
                         # Get the current header level
-                        current_header_level = sep.count("#")
+                        if sep in self.custom_header_patterns:
+                            current_header_level = self.custom_header_patterns[sep]
+                        else:
+                            current_header_level = sep.count("#")
 
                         # Pop out headers of lower or same level from the stack
                         while (
@@ -156,10 +199,20 @@ class MarkdownHeaderTextSplitter:
                                 initial_metadata.pop(popped_header["name"])
 
                         # Push the current header to the stack
+                        # Extract header text based on header type
+                        if is_custom_header:
+                            # For custom headers like **Header**, extract text
+                            # between patterns
+                            header_text = stripped_line[len(sep) : -len(sep)].strip()
+                        else:
+                            # For standard headers like # Header, extract text
+                            # after the separator
+                            header_text = stripped_line[len(sep) :].strip()
+
                         header: HeaderType = {
                             "level": current_header_level,
                             "name": name,
-                            "data": stripped_line[len(sep) :].strip(),
+                            "data": header_text,
                         }
                         header_stack.append(header)
                         # Update initial_metadata with the current header
@@ -288,7 +341,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
         chunks based on specified headers and formatting preferences.
 
         Args:
-            headers_to_split_on (Union[List[Tuple[str, str]], None]):
+            headers_to_split_on (Union[list[tuple[str, str]], None]):
                 A list of tuples, where each tuple contains a header tag (e.g., "h1")
                 and its corresponding metadata key. If None, default headers are used.
             return_each_line (bool):
diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
index 2ed35e3c13e..5ecca3bdf71 100644
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -1465,6 +1465,127 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N
     assert output == expected_output
 
 
+def test_md_header_text_splitter_with_custom_headers() -> None:
+    """Test markdown splitter with custom header patterns like **Header**."""
+    markdown_document = """**Chapter 1**
+
+This is the content for chapter 1.
+
+***Section 1.1***
+
+This is the content for section 1.1.
+
+**Chapter 2**
+
+This is the content for chapter 2.
+
+***Section 2.1***
+
+This is the content for section 2.1.
+"""
+
+    headers_to_split_on = [
+        ("**", "Bold Header"),
+        ("***", "Bold Italic Header"),
+    ]
+
+    custom_header_patterns = {
+        "**": 1,  # Level 1 headers
+        "***": 2,  # Level 2 headers
+    }
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+        custom_header_patterns=custom_header_patterns,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="This is the content for chapter 1.",
+            metadata={"Bold Header": "Chapter 1"},
+        ),
+        Document(
+            page_content="This is the content for section 1.1.",
+            metadata={"Bold Header": "Chapter 1", "Bold Italic Header": "Section 1.1"},
+        ),
+        Document(
+            page_content="This is the content for chapter 2.",
+            metadata={"Bold Header": "Chapter 2"},
+        ),
+        Document(
+            page_content="This is the content for section 2.1.",
+            metadata={"Bold Header": "Chapter 2", "Bold Italic Header": "Section 2.1"},
+        ),
+    ]
+
+    assert output == expected_output
+
+
+def test_md_header_text_splitter_mixed_headers() -> None:
+    """Test markdown splitter with both standard and custom headers."""
+    markdown_document = """# Standard Header 1
+
+Content under standard header.
+
+**Custom Header 1**
+
+Content under custom header.
+
+## Standard Header 2
+
+Content under standard header 2.
+
+***Custom Header 2***
+
+Content under custom header 2.
+"""
+
+    headers_to_split_on = [
+        ("#", "Header 1"),
+        ("##", "Header 2"),
+        ("**", "Bold Header"),
+        ("***", "Bold Italic Header"),
+    ]
+
+    custom_header_patterns = {
+        "**": 1,  # Same level as #
+        "***": 2,  # Same level as ##
+    }
+
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+        custom_header_patterns=custom_header_patterns,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="Content under standard header.",
+            metadata={"Header 1": "Standard Header 1"},
+        ),
+        Document(
+            page_content="Content under custom header.",
+            metadata={"Bold Header": "Custom Header 1"},
+        ),
+        Document(
+            page_content="Content under standard header 2.",
+            metadata={
+                "Bold Header": "Custom Header 1",
+                "Header 2": "Standard Header 2",
+            },
+        ),
+        Document(
+            page_content="Content under custom header 2.",
+            metadata={
+                "Bold Header": "Custom Header 1",
+                "Bold Italic Header": "Custom Header 2",
+            },
+        ),
+    ]
+
+    assert output == expected_output
+
+
 EXPERIMENTAL_MARKDOWN_DOCUMENT = (
     "# My Header 1\n"
     "Content for header 1\n"