mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-09 23:12:38 +00:00
feat(text-splitters): add optional custom header pattern support (#31887)
## Description

This PR adds support for custom header patterns in `MarkdownHeaderTextSplitter`, allowing users to define non-standard Markdown header formats (like `**Header**`) and specify their hierarchy levels.

**Issue:** Fixes #22738

**Dependencies:** None - this change has no new dependencies

**Key Changes:**

- Added optional `custom_header_patterns` parameter to support non-standard header formats
- Enable splitting on patterns like `**Header**` and `***Header***`
- Maintain full backward compatibility with existing usage
- Added comprehensive tests for custom and mixed header scenarios

## Example Usage

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("**", "Chapter"),
    ("***", "Section"),
]

custom_header_patterns = {
    "**": 1,  # Level 1 headers
    "***": 2,  # Level 2 headers
}

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    custom_header_patterns=custom_header_patterns,
)

# Now **Chapter 1** is treated as a level 1 header
# And ***Section 1.1*** is treated as a level 2 header
```

## Testing

- ✅ Added unit tests for custom header patterns
- ✅ Added tests for mixed standard and custom headers
- ✅ All existing tests pass (backward compatibility maintained)
- ✅ Linting and formatting checks pass

---

The implementation provides a flexible solution while maintaining the simplicity of the existing API. Users can continue using the splitter exactly as before, with the new functionality being entirely opt-in through the `custom_header_patterns` parameter.

---------

Co-authored-by: Mason Daugherty <mason@langchain.dev>
Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1465,6 +1465,127 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
def test_md_header_text_splitter_with_custom_headers() -> None:
    """Check splitting when only custom ``**``/``***`` header markers are configured.

    ``**`` is registered at level 1 and ``***`` at level 2. The expected
    result is one document per content paragraph, with each ``***`` section
    carrying the metadata of the ``**`` chapter that precedes it.
    """
    source_text = """**Chapter 1**

This is the content for chapter 1.

***Section 1.1***

This is the content for section 1.1.

**Chapter 2**

This is the content for chapter 2.

***Section 2.1***

This is the content for section 2.1.
"""

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("**", "Bold Header"),
            ("***", "Bold Italic Header"),
        ],
        # Marker -> hierarchy level: "**" is level 1, "***" is level 2.
        custom_header_patterns={"**": 1, "***": 2},
    )

    chunks = splitter.split_text(source_text)

    assert chunks == [
        Document(
            page_content="This is the content for chapter 1.",
            metadata={"Bold Header": "Chapter 1"},
        ),
        Document(
            page_content="This is the content for section 1.1.",
            metadata={
                "Bold Header": "Chapter 1",
                "Bold Italic Header": "Section 1.1",
            },
        ),
        Document(
            page_content="This is the content for chapter 2.",
            metadata={"Bold Header": "Chapter 2"},
        ),
        Document(
            page_content="This is the content for section 2.1.",
            metadata={
                "Bold Header": "Chapter 2",
                "Bold Italic Header": "Section 2.1",
            },
        ),
    ]
|
||||
|
||||
|
||||
def test_md_header_text_splitter_mixed_headers() -> None:
    """Check splitting when ``#``-style and custom bold headers are interleaved.

    ``**`` is registered at level 1 and ``***`` at level 2 alongside the
    standard ``#`` and ``##`` entries. The expected metadata below shows each
    new header replacing the previous header of the same level, whichever
    syntax that header used.
    """
    source_text = """# Standard Header 1

Content under standard header.

**Custom Header 1**

Content under custom header.

## Standard Header 2

Content under standard header 2.

***Custom Header 2***

Content under custom header 2.
"""

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("**", "Bold Header"),
            ("***", "Bold Italic Header"),
        ],
        # "**" is registered at level 1 and "***" at level 2.
        custom_header_patterns={"**": 1, "***": 2},
    )

    chunks = splitter.split_text(source_text)

    assert chunks == [
        Document(
            page_content="Content under standard header.",
            metadata={"Header 1": "Standard Header 1"},
        ),
        Document(
            page_content="Content under custom header.",
            metadata={"Bold Header": "Custom Header 1"},
        ),
        Document(
            page_content="Content under standard header 2.",
            metadata={
                "Bold Header": "Custom Header 1",
                "Header 2": "Standard Header 2",
            },
        ),
        Document(
            page_content="Content under custom header 2.",
            metadata={
                "Bold Header": "Custom Header 1",
                "Bold Italic Header": "Custom Header 2",
            },
        ),
    ]
|
||||
|
||||
|
||||
EXPERIMENTAL_MARKDOWN_DOCUMENT = (
|
||||
"# My Header 1\n"
|
||||
"Content for header 1\n"
|
||||
|
Reference in New Issue
Block a user