mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-22 10:59:22 +00:00
feat(text-splitters): add optional custom header pattern support (#31887)
## Description This PR adds support for custom header patterns in `MarkdownHeaderTextSplitter`, allowing users to define non-standard Markdown header formats (like `**Header**`) and specify their hierarchy levels. **Issue:** Fixes #22738 **Dependencies:** None - this change has no new dependencies **Key Changes:** - Added optional `custom_header_patterns` parameter to support non-standard header formats - Enable splitting on patterns like `**Header**` and `***Header***` - Maintain full backward compatibility with existing usage - Added comprehensive tests for custom and mixed header scenarios ## Example Usage ```python from langchain_text_splitters import MarkdownHeaderTextSplitter headers_to_split_on = [ ("**", "Chapter"), ("***", "Section"), ] custom_header_patterns = { "**": 1, # Level 1 headers "***": 2, # Level 2 headers } splitter = MarkdownHeaderTextSplitter( headers_to_split_on=headers_to_split_on, custom_header_patterns=custom_header_patterns, ) # Now **Chapter 1** is treated as a level 1 header # And ***Section 1.1*** is treated as a level 2 header ``` ## Testing - ✅ Added unit tests for custom header patterns - ✅ Added tests for mixed standard and custom headers - ✅ All existing tests pass (backward compatibility maintained) - ✅ Linting and formatting checks pass --- The implementation provides a flexible solution while maintaining the simplicity of the existing API. Users can continue using the splitter exactly as before, with the new functionality being entirely opt-in through the `custom_header_patterns` parameter. --------- Co-authored-by: Mason Daugherty <mason@langchain.dev> Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
parent
fd891ee3d4
commit
03138f41a0
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any, TypedDict, Union
|
||||
from typing import Any, Optional, TypedDict, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -26,6 +26,7 @@ class MarkdownHeaderTextSplitter:
|
||||
headers_to_split_on: list[tuple[str, str]],
|
||||
return_each_line: bool = False, # noqa: FBT001,FBT002
|
||||
strip_headers: bool = True, # noqa: FBT001,FBT002
|
||||
custom_header_patterns: Optional[dict[str, int]] = None,
|
||||
) -> None:
|
||||
"""Create a new MarkdownHeaderTextSplitter.
|
||||
|
||||
@ -33,6 +34,9 @@ class MarkdownHeaderTextSplitter:
|
||||
headers_to_split_on: Headers we want to track
|
||||
return_each_line: Return each line w/ associated headers
|
||||
strip_headers: Strip split headers from the content of the chunk
|
||||
custom_header_patterns: Optional dict mapping header patterns to their
|
||||
levels. For example: {"**": 1, "***": 2} to treat **Header** as
|
||||
level 1 and ***Header*** as level 2 headers.
|
||||
"""
|
||||
# Output line-by-line or aggregated into chunks w/ common headers
|
||||
self.return_each_line = return_each_line
|
||||
@ -43,6 +47,39 @@ class MarkdownHeaderTextSplitter:
|
||||
)
|
||||
# Strip split headers from the content of the chunk
|
||||
self.strip_headers = strip_headers
|
||||
# Custom header patterns with their levels
|
||||
self.custom_header_patterns = custom_header_patterns or {}
|
||||
|
||||
def _is_custom_header(self, line: str, sep: str) -> bool:
    """Check whether ``line`` is a header wrapped in the custom pattern ``sep``.

    A custom header is a line of the form ``<sep>text<sep>`` (for example
    ``**Header**``) whose ``sep`` is registered in
    ``self.custom_header_patterns``.

    Args:
        line: The line to check. It is compared as-is, so surrounding
            whitespace should already be stripped.
        sep: The separator pattern to match.

    Returns:
        True if the line is exactly one ``sep``-delimited span with real
        content between the separators, False otherwise.
    """
    # An empty separator can never delimit a header; unregistered
    # separators are not custom patterns at all.
    if not sep or sep not in self.custom_header_patterns:
        return False

    sep_len = len(sep)
    # Need an opening separator, a closing separator and at least one
    # character between them (the two separators must not overlap).
    if (
        len(line) <= 2 * sep_len
        or not line.startswith(sep)
        or not line.endswith(sep)
    ):
        return False

    inner = line[sep_len:-sep_len]

    # A separator inside the text means the line holds several delimited
    # spans (``**a** and **b**``) or a doubled separator (``****a****``),
    # not one single header.
    if sep in inner:
        return False

    # For separators made of one repeated character, an adjacent copy of
    # that character means the line uses a longer run: ``***a***`` must not
    # be reported as a ``**`` header.
    if len(set(sep)) == 1 and sep[0] in (inner[0], inner[-1]):
        return False

    content = inner.strip()
    # Valid header only if there is actual content, i.e. not just whitespace
    # or leftover separator characters.
    return bool(content) and not all(c in sep for c in content.replace(" ", ""))
|
||||
|
||||
def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
|
||||
"""Combine lines with common metadata into chunks.
|
||||
@ -131,16 +168,22 @@ class MarkdownHeaderTextSplitter:
|
||||
|
||||
# Check each line against each of the header types (e.g., #, ##)
|
||||
for sep, name in self.headers_to_split_on:
|
||||
# Check if line starts with a header that we intend to split on
|
||||
if stripped_line.startswith(sep) and (
|
||||
is_standard_header = stripped_line.startswith(sep) and (
|
||||
# Header with no text OR header is followed by space
|
||||
# Both are valid conditions that sep is being used as a header
|
||||
len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
|
||||
):
|
||||
)
|
||||
is_custom_header = self._is_custom_header(stripped_line, sep)
|
||||
|
||||
# Check if line matches either standard or custom header pattern
|
||||
if is_standard_header or is_custom_header:
|
||||
# Ensure we are tracking the header as metadata
|
||||
if name is not None:
|
||||
# Get the current header level
|
||||
current_header_level = sep.count("#")
|
||||
if sep in self.custom_header_patterns:
|
||||
current_header_level = self.custom_header_patterns[sep]
|
||||
else:
|
||||
current_header_level = sep.count("#")
|
||||
|
||||
# Pop out headers of lower or same level from the stack
|
||||
while (
|
||||
@ -156,10 +199,20 @@ class MarkdownHeaderTextSplitter:
|
||||
initial_metadata.pop(popped_header["name"])
|
||||
|
||||
# Push the current header to the stack
|
||||
# Extract header text based on header type
|
||||
if is_custom_header:
|
||||
# For custom headers like **Header**, extract text
|
||||
# between patterns
|
||||
header_text = stripped_line[len(sep) : -len(sep)].strip()
|
||||
else:
|
||||
# For standard headers like # Header, extract text
|
||||
# after the separator
|
||||
header_text = stripped_line[len(sep) :].strip()
|
||||
|
||||
header: HeaderType = {
|
||||
"level": current_header_level,
|
||||
"name": name,
|
||||
"data": stripped_line[len(sep) :].strip(),
|
||||
"data": header_text,
|
||||
}
|
||||
header_stack.append(header)
|
||||
# Update initial_metadata with the current header
|
||||
@ -288,7 +341,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
|
||||
chunks based on specified headers and formatting preferences.
|
||||
|
||||
Args:
|
||||
headers_to_split_on (Union[List[Tuple[str, str]], None]):
|
||||
headers_to_split_on (Union[list[tuple[str, str]], None]):
|
||||
A list of tuples, where each tuple contains a header tag (e.g., "h1")
|
||||
and its corresponding metadata key. If None, default headers are used.
|
||||
return_each_line (bool):
|
||||
|
@ -1465,6 +1465,127 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
def test_md_header_text_splitter_with_custom_headers() -> None:
    """Split a document whose headers are written as **Header** / ***Header***."""
    source_md = """**Chapter 1**

This is the content for chapter 1.

***Section 1.1***

This is the content for section 1.1.

**Chapter 2**

This is the content for chapter 2.

***Section 2.1***

This is the content for section 2.1.
"""

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("**", "Bold Header"),
            ("***", "Bold Italic Header"),
        ],
        # "**" is registered as level 1 and "***" as level 2.
        custom_header_patterns={"**": 1, "***": 2},
    )

    # (page_content, metadata) pairs, in document order.
    expected_chunks = [
        (
            "This is the content for chapter 1.",
            {"Bold Header": "Chapter 1"},
        ),
        (
            "This is the content for section 1.1.",
            {"Bold Header": "Chapter 1", "Bold Italic Header": "Section 1.1"},
        ),
        (
            "This is the content for chapter 2.",
            {"Bold Header": "Chapter 2"},
        ),
        (
            "This is the content for section 2.1.",
            {"Bold Header": "Chapter 2", "Bold Italic Header": "Section 2.1"},
        ),
    ]
    expected = [
        Document(page_content=text, metadata=meta) for text, meta in expected_chunks
    ]

    assert splitter.split_text(source_md) == expected
|
||||
|
||||
|
||||
def test_md_header_text_splitter_mixed_headers() -> None:
    """Split a document that mixes standard ``#`` headers with custom ones."""
    source_md = """# Standard Header 1

Content under standard header.

**Custom Header 1**

Content under custom header.

## Standard Header 2

Content under standard header 2.

***Custom Header 2***

Content under custom header 2.
"""

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("**", "Bold Header"),
            ("***", "Bold Italic Header"),
        ],
        # "**" is given the same level as "#", and "***" the same as "##".
        custom_header_patterns={"**": 1, "***": 2},
    )

    # (page_content, metadata) pairs, in document order.
    expected_chunks = [
        (
            "Content under standard header.",
            {"Header 1": "Standard Header 1"},
        ),
        (
            "Content under custom header.",
            {"Bold Header": "Custom Header 1"},
        ),
        (
            "Content under standard header 2.",
            {
                "Bold Header": "Custom Header 1",
                "Header 2": "Standard Header 2",
            },
        ),
        (
            "Content under custom header 2.",
            {
                "Bold Header": "Custom Header 1",
                "Bold Italic Header": "Custom Header 2",
            },
        ),
    ]
    expected = [
        Document(page_content=text, metadata=meta) for text, meta in expected_chunks
    ]

    assert splitter.split_text(source_md) == expected
|
||||
|
||||
|
||||
EXPERIMENTAL_MARKDOWN_DOCUMENT = (
|
||||
"# My Header 1\n"
|
||||
"Content for header 1\n"
|
||||
|
Loading…
Reference in New Issue
Block a user