mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-24 20:12:11 +00:00
feat(text-splitters): add optional custom header pattern support (#31887)
## Description This PR adds support for custom header patterns in `MarkdownHeaderTextSplitter`, allowing users to define non-standard Markdown header formats (like `**Header**`) and specify their hierarchy levels. **Issue:** Fixes #22738 **Dependencies:** None - this change has no new dependencies **Key Changes:** - Added optional `custom_header_patterns` parameter to support non-standard header formats - Enable splitting on patterns like `**Header**` and `***Header***` - Maintain full backward compatibility with existing usage - Added comprehensive tests for custom and mixed header scenarios ## Example Usage ```python from langchain_text_splitters import MarkdownHeaderTextSplitter headers_to_split_on = [ ("**", "Chapter"), ("***", "Section"), ] custom_header_patterns = { "**": 1, # Level 1 headers "***": 2, # Level 2 headers } splitter = MarkdownHeaderTextSplitter( headers_to_split_on=headers_to_split_on, custom_header_patterns=custom_header_patterns, ) # Now **Chapter 1** is treated as a level 1 header # And ***Section 1.1*** is treated as a level 2 header ``` ## Testing - ✅ Added unit tests for custom header patterns - ✅ Added tests for mixed standard and custom headers - ✅ All existing tests pass (backward compatibility maintained) - ✅ Linting and formatting checks pass --- The implementation provides a flexible solution while maintaining the simplicity of the existing API. Users can continue using the splitter exactly as before, with the new functionality being entirely opt-in through the `custom_header_patterns` parameter. --------- Co-authored-by: Mason Daugherty <mason@langchain.dev> Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
parent
fd891ee3d4
commit
03138f41a0
@ -1,7 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from typing import Any, TypedDict, Union
|
from typing import Any, Optional, TypedDict, Union
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@ -26,6 +26,7 @@ class MarkdownHeaderTextSplitter:
|
|||||||
headers_to_split_on: list[tuple[str, str]],
|
headers_to_split_on: list[tuple[str, str]],
|
||||||
return_each_line: bool = False, # noqa: FBT001,FBT002
|
return_each_line: bool = False, # noqa: FBT001,FBT002
|
||||||
strip_headers: bool = True, # noqa: FBT001,FBT002
|
strip_headers: bool = True, # noqa: FBT001,FBT002
|
||||||
|
custom_header_patterns: Optional[dict[str, int]] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Create a new MarkdownHeaderTextSplitter.
|
"""Create a new MarkdownHeaderTextSplitter.
|
||||||
|
|
||||||
@ -33,6 +34,9 @@ class MarkdownHeaderTextSplitter:
|
|||||||
headers_to_split_on: Headers we want to track
|
headers_to_split_on: Headers we want to track
|
||||||
return_each_line: Return each line w/ associated headers
|
return_each_line: Return each line w/ associated headers
|
||||||
strip_headers: Strip split headers from the content of the chunk
|
strip_headers: Strip split headers from the content of the chunk
|
||||||
|
custom_header_patterns: Optional dict mapping header patterns to their
|
||||||
|
levels. For example: {"**": 1, "***": 2} to treat **Header** as
|
||||||
|
level 1 and ***Header*** as level 2 headers.
|
||||||
"""
|
"""
|
||||||
# Output line-by-line or aggregated into chunks w/ common headers
|
# Output line-by-line or aggregated into chunks w/ common headers
|
||||||
self.return_each_line = return_each_line
|
self.return_each_line = return_each_line
|
||||||
@ -43,6 +47,39 @@ class MarkdownHeaderTextSplitter:
|
|||||||
)
|
)
|
||||||
# Strip headers split headers from the content of the chunk
|
# Strip headers split headers from the content of the chunk
|
||||||
self.strip_headers = strip_headers
|
self.strip_headers = strip_headers
|
||||||
|
# Custom header patterns with their levels
|
||||||
|
self.custom_header_patterns = custom_header_patterns or {}
|
||||||
|
|
||||||
|
def _is_custom_header(self, line: str, sep: str) -> bool:
    """Check whether ``line`` is a custom header delimited by ``sep``.

    A custom header is a line made of ``sep``, some text, and ``sep`` again,
    for example ``**Header**`` when ``sep`` is ``"**"``.

    Args:
        line: The line to check. It is matched as given (no whitespace is
            trimmed here), so it must begin and end with ``sep`` exactly.
        sep: The separator pattern to match. Only separators that are keys
            of ``self.custom_header_patterns`` can produce a match.

    Returns:
        True if the line matches the custom pattern format, False otherwise.
    """
    if sep not in self.custom_header_patterns:
        return False

    # Escape the separator so characters such as "*" are matched literally.
    escaped_sep = re.escape(sep)
    # One separator at each end with at least one character in between. The
    # lookarounds reject a doubled separator ("****Header****" for "**").
    pattern = (
        f"{escaped_sep}(?!{escaped_sep})(.+?)(?<!{escaped_sep}){escaped_sep}"
    )

    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "**Header**\n" through.
    match = re.fullmatch(pattern, line)
    if match is None:
        return False

    inner = match.group(1)
    # For a separator built from one repeated character (e.g. "**"), a longer
    # run of that character is a different marker. Without this check
    # "***Header***" is reported as a "**" header whose text is "*Header*".
    if len(set(sep)) == 1 and sep[0] in (inner[0], inner[-1]):
        return False

    # Require real text: reject content that is empty after stripping or
    # that consists only of separator characters and spaces.
    content = inner.strip()
    return bool(content) and not all(c in sep for c in content.replace(" ", ""))
|
||||||
|
|
||||||
def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
|
def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
|
||||||
"""Combine lines with common metadata into chunks.
|
"""Combine lines with common metadata into chunks.
|
||||||
@ -131,16 +168,22 @@ class MarkdownHeaderTextSplitter:
|
|||||||
|
|
||||||
# Check each line against each of the header types (e.g., #, ##)
|
# Check each line against each of the header types (e.g., #, ##)
|
||||||
for sep, name in self.headers_to_split_on:
|
for sep, name in self.headers_to_split_on:
|
||||||
# Check if line starts with a header that we intend to split on
|
is_standard_header = stripped_line.startswith(sep) and (
|
||||||
if stripped_line.startswith(sep) and (
|
|
||||||
# Header with no text OR header is followed by space
|
# Header with no text OR header is followed by space
|
||||||
# Both are valid conditions that sep is being used a header
|
# Both are valid conditions that sep is being used a header
|
||||||
len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
|
len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
|
||||||
):
|
)
|
||||||
|
is_custom_header = self._is_custom_header(stripped_line, sep)
|
||||||
|
|
||||||
|
# Check if line matches either standard or custom header pattern
|
||||||
|
if is_standard_header or is_custom_header:
|
||||||
# Ensure we are tracking the header as metadata
|
# Ensure we are tracking the header as metadata
|
||||||
if name is not None:
|
if name is not None:
|
||||||
# Get the current header level
|
# Get the current header level
|
||||||
current_header_level = sep.count("#")
|
if sep in self.custom_header_patterns:
|
||||||
|
current_header_level = self.custom_header_patterns[sep]
|
||||||
|
else:
|
||||||
|
current_header_level = sep.count("#")
|
||||||
|
|
||||||
# Pop out headers of lower or same level from the stack
|
# Pop out headers of lower or same level from the stack
|
||||||
while (
|
while (
|
||||||
@ -156,10 +199,20 @@ class MarkdownHeaderTextSplitter:
|
|||||||
initial_metadata.pop(popped_header["name"])
|
initial_metadata.pop(popped_header["name"])
|
||||||
|
|
||||||
# Push the current header to the stack
|
# Push the current header to the stack
|
||||||
|
# Extract header text based on header type
|
||||||
|
if is_custom_header:
|
||||||
|
# For custom headers like **Header**, extract text
|
||||||
|
# between patterns
|
||||||
|
header_text = stripped_line[len(sep) : -len(sep)].strip()
|
||||||
|
else:
|
||||||
|
# For standard headers like # Header, extract text
|
||||||
|
# after the separator
|
||||||
|
header_text = stripped_line[len(sep) :].strip()
|
||||||
|
|
||||||
header: HeaderType = {
|
header: HeaderType = {
|
||||||
"level": current_header_level,
|
"level": current_header_level,
|
||||||
"name": name,
|
"name": name,
|
||||||
"data": stripped_line[len(sep) :].strip(),
|
"data": header_text,
|
||||||
}
|
}
|
||||||
header_stack.append(header)
|
header_stack.append(header)
|
||||||
# Update initial_metadata with the current header
|
# Update initial_metadata with the current header
|
||||||
@ -288,7 +341,7 @@ class ExperimentalMarkdownSyntaxTextSplitter:
|
|||||||
chunks based on specified headers and formatting preferences.
|
chunks based on specified headers and formatting preferences.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
headers_to_split_on (Union[List[Tuple[str, str]], None]):
|
headers_to_split_on (Union[list[tuple[str, str]], None]):
|
||||||
A list of tuples, where each tuple contains a header tag (e.g., "h1")
|
A list of tuples, where each tuple contains a header tag (e.g., "h1")
|
||||||
and its corresponding metadata key. If None, default headers are used.
|
and its corresponding metadata key. If None, default headers are used.
|
||||||
return_each_line (bool):
|
return_each_line (bool):
|
||||||
|
@ -1465,6 +1465,127 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N
|
|||||||
assert output == expected_output
|
assert output == expected_output
|
||||||
|
|
||||||
|
|
||||||
|
def test_md_header_text_splitter_with_custom_headers() -> None:
    """Split on custom ``**`` / ``***`` headers and check chunk metadata."""
    text = """**Chapter 1**

This is the content for chapter 1.

***Section 1.1***

This is the content for section 1.1.

**Chapter 2**

This is the content for chapter 2.

***Section 2.1***

This is the content for section 2.1.
"""

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("**", "Bold Header"),
            ("***", "Bold Italic Header"),
        ],
        # "**" marks level 1 headers, "***" marks level 2 headers.
        custom_header_patterns={"**": 1, "***": 2},
    )

    # (page_content, metadata) of every expected chunk, in document order.
    expected_chunks = [
        (
            "This is the content for chapter 1.",
            {"Bold Header": "Chapter 1"},
        ),
        (
            "This is the content for section 1.1.",
            {"Bold Header": "Chapter 1", "Bold Italic Header": "Section 1.1"},
        ),
        (
            "This is the content for chapter 2.",
            {"Bold Header": "Chapter 2"},
        ),
        (
            "This is the content for section 2.1.",
            {"Bold Header": "Chapter 2", "Bold Italic Header": "Section 2.1"},
        ),
    ]
    expected_docs = [
        Document(page_content=body, metadata=meta) for body, meta in expected_chunks
    ]

    assert splitter.split_text(text) == expected_docs
|
||||||
|
|
||||||
|
|
||||||
|
def test_md_header_text_splitter_mixed_headers() -> None:
    """Split a document that mixes ``#`` headers with custom bold headers."""
    text = """# Standard Header 1

Content under standard header.

**Custom Header 1**

Content under custom header.

## Standard Header 2

Content under standard header 2.

***Custom Header 2***

Content under custom header 2.
"""

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("**", "Bold Header"),
            ("***", "Bold Italic Header"),
        ],
        # "**" shares level 1 with "#", "***" shares level 2 with "##".
        custom_header_patterns={"**": 1, "***": 2},
    )

    # (page_content, metadata) of every expected chunk, in document order.
    # A header replaces any earlier header of the same or a deeper level,
    # which is why "Header 1" is gone once "**Custom Header 1**" appears.
    expected_chunks = [
        (
            "Content under standard header.",
            {"Header 1": "Standard Header 1"},
        ),
        (
            "Content under custom header.",
            {"Bold Header": "Custom Header 1"},
        ),
        (
            "Content under standard header 2.",
            {
                "Bold Header": "Custom Header 1",
                "Header 2": "Standard Header 2",
            },
        ),
        (
            "Content under custom header 2.",
            {
                "Bold Header": "Custom Header 1",
                "Bold Italic Header": "Custom Header 2",
            },
        ),
    ]
    expected_docs = [
        Document(page_content=body, metadata=meta) for body, meta in expected_chunks
    ]

    assert splitter.split_text(text) == expected_docs
|
||||||
|
|
||||||
|
|
||||||
EXPERIMENTAL_MARKDOWN_DOCUMENT = (
|
EXPERIMENTAL_MARKDOWN_DOCUMENT = (
|
||||||
"# My Header 1\n"
|
"# My Header 1\n"
|
||||||
"Content for header 1\n"
|
"Content for header 1\n"
|
||||||
|
Loading…
Reference in New Issue
Block a user