feat(text-splitters): add optional custom header pattern support (#31887)

## Description

This PR adds support for custom header patterns in
`MarkdownHeaderTextSplitter`, allowing users to define non-standard
Markdown header formats (like `**Header**`) and specify their hierarchy
levels.

**Issue:** Fixes #22738

**Dependencies:** None. This change introduces no new dependencies.

**Key Changes:**
- Added optional `custom_header_patterns` parameter to support
non-standard header formats
- Enable splitting on patterns like `**Header**` and `***Header***`
- Maintain full backward compatibility with existing usage
- Added comprehensive tests for custom and mixed header scenarios

## Example Usage

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("**", "Chapter"),
    ("***", "Section"),
]

custom_header_patterns = {
    "**": 1,   # Level 1 headers
    "***": 2,  # Level 2 headers
}

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    custom_header_patterns=custom_header_patterns,
)

# Now **Chapter 1** is treated as a level 1 header
# And ***Section 1.1*** is treated as a level 2 header
```

## Testing

- Added unit tests for custom header patterns
- Added tests for mixed standard and custom headers
- All existing tests pass (backward compatibility maintained)
- Linting and formatting checks pass

---

The implementation provides a flexible solution while maintaining the
simplicity of the existing API. Users can continue using the splitter
exactly as before, with the new functionality being entirely opt-in
through the `custom_header_patterns` parameter.

---------

Co-authored-by: Mason Daugherty <mason@langchain.dev>
Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Keyu Chen 2025-08-18 10:10:49 -04:00 committed by GitHub
parent fd891ee3d4
commit 03138f41a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 181 additions and 7 deletions

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import re
from typing import Any, TypedDict, Union
from typing import Any, Optional, TypedDict, Union
from langchain_core.documents import Document
@ -26,6 +26,7 @@ class MarkdownHeaderTextSplitter:
headers_to_split_on: list[tuple[str, str]],
return_each_line: bool = False, # noqa: FBT001,FBT002
strip_headers: bool = True, # noqa: FBT001,FBT002
custom_header_patterns: Optional[dict[str, int]] = None,
) -> None:
"""Create a new MarkdownHeaderTextSplitter.
@ -33,6 +34,9 @@ class MarkdownHeaderTextSplitter:
headers_to_split_on: Headers we want to track
return_each_line: Return each line w/ associated headers
strip_headers: Strip split headers from the content of the chunk
custom_header_patterns: Optional dict mapping header patterns to their
levels. For example: {"**": 1, "***": 2} to treat **Header** as
level 1 and ***Header*** as level 2 headers.
"""
# Output line-by-line or aggregated into chunks w/ common headers
self.return_each_line = return_each_line
@ -43,6 +47,39 @@ class MarkdownHeaderTextSplitter:
)
# Strip split headers from the content of the chunk
self.strip_headers = strip_headers
# Custom header patterns with their levels
self.custom_header_patterns = custom_header_patterns or {}
def _is_custom_header(self, line: str, sep: str) -> bool:
"""Check if line matches a custom header pattern.
Args:
line: The line to check
sep: The separator pattern to match
Returns:
True if the line matches the custom pattern format
"""
if sep not in self.custom_header_patterns:
return False
# Escape special regex characters in the separator
escaped_sep = re.escape(sep)
# Create regex pattern to match exactly one separator at start and end
# with content in between
pattern = (
f"^{escaped_sep}(?!{escaped_sep})(.+?)(?<!{escaped_sep}){escaped_sep}$"
)
match = re.match(pattern, line)
if match:
# Extract the content between the patterns
content = match.group(1).strip()
# Valid header if there's actual content (not just whitespace or separators)
# Check that content doesn't consist only of separator characters
if content and not all(c in sep for c in content.replace(" ", "")):
return True
return False
def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
"""Combine lines with common metadata into chunks.
@ -131,16 +168,22 @@ class MarkdownHeaderTextSplitter:
# Check each line against each of the header types (e.g., #, ##)
for sep, name in self.headers_to_split_on:
# Check if line starts with a header that we intend to split on
if stripped_line.startswith(sep) and (
is_standard_header = stripped_line.startswith(sep) and (
# Header with no text OR header is followed by space
# Both are valid conditions that sep is being used as a header
len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
):
)
is_custom_header = self._is_custom_header(stripped_line, sep)
# Check if line matches either standard or custom header pattern
if is_standard_header or is_custom_header:
# Ensure we are tracking the header as metadata
if name is not None:
# Get the current header level
current_header_level = sep.count("#")
if sep in self.custom_header_patterns:
current_header_level = self.custom_header_patterns[sep]
else:
current_header_level = sep.count("#")
# Pop out headers of lower or same level from the stack
while (
@ -156,10 +199,20 @@ class MarkdownHeaderTextSplitter:
initial_metadata.pop(popped_header["name"])
# Push the current header to the stack
# Extract header text based on header type
if is_custom_header:
# For custom headers like **Header**, extract text
# between patterns
header_text = stripped_line[len(sep) : -len(sep)].strip()
else:
# For standard headers like # Header, extract text
# after the separator
header_text = stripped_line[len(sep) :].strip()
header: HeaderType = {
"level": current_header_level,
"name": name,
"data": stripped_line[len(sep) :].strip(),
"data": header_text,
}
header_stack.append(header)
# Update initial_metadata with the current header
@ -288,7 +341,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
chunks based on specified headers and formatting preferences.
Args:
headers_to_split_on (Union[List[Tuple[str, str]], None]):
headers_to_split_on (Union[list[tuple[str, str]], None]):
A list of tuples, where each tuple contains a header tag (e.g., "h1")
and its corresponding metadata key. If None, default headers are used.
return_each_line (bool):

View File

@ -1465,6 +1465,127 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N
assert output == expected_output
def test_md_header_text_splitter_with_custom_headers() -> None:
    """Custom ``**``/``***`` patterns are split on and nested by configured level."""
    text = """**Chapter 1**
This is the content for chapter 1.
***Section 1.1***
This is the content for section 1.1.
**Chapter 2**
This is the content for chapter 2.
***Section 2.1***
This is the content for section 2.1.
"""
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("**", "Bold Header"),
            ("***", "Bold Italic Header"),
        ],
        # "**" is registered as level 1 and "***" as level 2.
        custom_header_patterns={"**": 1, "***": 2},
    )

    # (page_content, metadata) for each chunk, in document order. Each section
    # keeps its chapter's metadata; a new chapter drops the previous section.
    expected_chunks = [
        (
            "This is the content for chapter 1.",
            {"Bold Header": "Chapter 1"},
        ),
        (
            "This is the content for section 1.1.",
            {"Bold Header": "Chapter 1", "Bold Italic Header": "Section 1.1"},
        ),
        (
            "This is the content for chapter 2.",
            {"Bold Header": "Chapter 2"},
        ),
        (
            "This is the content for section 2.1.",
            {"Bold Header": "Chapter 2", "Bold Italic Header": "Section 2.1"},
        ),
    ]

    assert splitter.split_text(text) == [
        Document(page_content=content, metadata=meta)
        for content, meta in expected_chunks
    ]
def test_md_header_text_splitter_mixed_headers() -> None:
    """Standard ``#`` headers and custom ``**`` headers share one level hierarchy."""
    text = """# Standard Header 1
Content under standard header.
**Custom Header 1**
Content under custom header.
## Standard Header 2
Content under standard header 2.
***Custom Header 2***
Content under custom header 2.
"""
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("**", "Bold Header"),
            ("***", "Bold Italic Header"),
        ],
        # "**" shares level 1 with "#", and "***" shares level 2 with "##".
        custom_header_patterns={"**": 1, "***": 2},
    )

    # (page_content, metadata) for each chunk, in document order. A header
    # replaces any earlier header of the same level: "Custom Header 1"
    # displaces "Standard Header 1", and "Custom Header 2" displaces
    # "Standard Header 2".
    expected_chunks = [
        (
            "Content under standard header.",
            {"Header 1": "Standard Header 1"},
        ),
        (
            "Content under custom header.",
            {"Bold Header": "Custom Header 1"},
        ),
        (
            "Content under standard header 2.",
            {
                "Bold Header": "Custom Header 1",
                "Header 2": "Standard Header 2",
            },
        ),
        (
            "Content under custom header 2.",
            {
                "Bold Header": "Custom Header 1",
                "Bold Italic Header": "Custom Header 2",
            },
        ),
    ]

    assert splitter.split_text(text) == [
        Document(page_content=content, metadata=meta)
        for content, meta in expected_chunks
    ]
EXPERIMENTAL_MARKDOWN_DOCUMENT = (
"# My Header 1\n"
"Content for header 1\n"