mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 23:29:21 +00:00
text-splitters: fix stale header metadata in ExperimentalMarkdownSyntaxTextSplitter (#31622)
**Description:** Previously, when transitioning from a deeper Markdown header (e.g., `###`) to a shallower one (e.g., `##`), the `ExperimentalMarkdownSyntaxTextSplitter` retained the deeper header in the metadata. This commit updates the `_resolve_header_stack` method to remove headers at the same or deeper levels before appending the current header. As a result, each chunk now reflects only the active header context, which fixes unexpected metadata leakage across sections in nested Markdown documents.

Additionally, test cases have been updated to:
- Validate correct header resolution and metadata assignment.
- Cover edge cases with nested headers and horizontal rules.

**Issue:** Fixes [#31596](https://github.com/langchain-ai/langchain/issues/31596)
**Dependencies:** None
**Twitter handle:** [_RaghuKapur](https://twitter.com/_RaghuKapur)
**LinkedIn:** [https://www.linkedin.com/in/raghukapur/](https://www.linkedin.com/in/raghukapur/)
This commit is contained in:
parent
9d4d258162
commit
2c9859956a
@ -376,10 +376,10 @@ class ExperimentalMarkdownSyntaxTextSplitter:
|
||||
|
||||
def _resolve_header_stack(self, header_depth: int, header_text: str) -> None:
|
||||
for i, (depth, _) in enumerate(self.current_header_stack):
|
||||
if depth == header_depth:
|
||||
self.current_header_stack[i] = (header_depth, header_text)
|
||||
self.current_header_stack = self.current_header_stack[: i + 1]
|
||||
return
|
||||
if depth >= header_depth:
|
||||
# Truncate everything from this level onward
|
||||
self.current_header_stack = self.current_header_stack[:i]
|
||||
break
|
||||
self.current_header_stack.append((header_depth, header_text))
|
||||
|
||||
def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:
|
||||
|
@ -1464,6 +1464,10 @@ EXPERIMENTAL_MARKDOWN_DOCUMENT = (
|
||||
"Content for header 1\n"
|
||||
"## Header 2\n"
|
||||
"Content for header 2\n"
|
||||
"### Header 3\n"
|
||||
"Content for header 3\n"
|
||||
"## Header 2 Again\n"
|
||||
"This should be tagged with Header 1 and Header 2 Again\n"
|
||||
"```python\n"
|
||||
"def func_definition():\n"
|
||||
" print('Keep the whitespace consistent')\n"
|
||||
@ -1491,6 +1495,18 @@ def test_experimental_markdown_syntax_text_splitter() -> None:
|
||||
page_content="Content for header 2\n",
|
||||
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
|
||||
),
|
||||
Document(
|
||||
page_content="Content for header 3\n",
|
||||
metadata={
|
||||
"Header 1": "My Header 1",
|
||||
"Header 2": "Header 2",
|
||||
"Header 3": "Header 3",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="This should be tagged with Header 1 and Header 2 Again\n",
|
||||
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
|
||||
),
|
||||
Document(
|
||||
page_content=(
|
||||
"```python\ndef func_definition():\n "
|
||||
@ -1499,7 +1515,7 @@ def test_experimental_markdown_syntax_text_splitter() -> None:
|
||||
metadata={
|
||||
"Code": "python",
|
||||
"Header 1": "My Header 1",
|
||||
"Header 2": "Header 2",
|
||||
"Header 2": "Header 2 Again",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
@ -1530,7 +1546,15 @@ def test_experimental_markdown_syntax_text_splitter_header_configuration() -> No
|
||||
|
||||
expected_output = [
|
||||
Document(
|
||||
page_content="Content for header 1\n## Header 2\nContent for header 2\n",
|
||||
page_content=(
|
||||
"Content for header 1\n"
|
||||
"## Header 2\n"
|
||||
"Content for header 2\n"
|
||||
"### Header 3\n"
|
||||
"Content for header 3\n"
|
||||
"## Header 2 Again\n"
|
||||
"This should be tagged with Header 1 and Header 2 Again\n"
|
||||
),
|
||||
metadata={"Encabezamiento 1": "My Header 1"},
|
||||
),
|
||||
Document(
|
||||
@ -1571,6 +1595,21 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
|
||||
page_content="## Header 2\nContent for header 2\n",
|
||||
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
|
||||
),
|
||||
Document(
|
||||
page_content="### Header 3\nContent for header 3\n",
|
||||
metadata={
|
||||
"Header 1": "My Header 1",
|
||||
"Header 2": "Header 2",
|
||||
"Header 3": "Header 3",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content=(
|
||||
"## Header 2 Again\n"
|
||||
"This should be tagged with Header 1 and Header 2 Again\n"
|
||||
),
|
||||
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
|
||||
),
|
||||
Document(
|
||||
page_content=(
|
||||
"```python\ndef func_definition():\n "
|
||||
@ -1579,7 +1618,7 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
|
||||
metadata={
|
||||
"Code": "python",
|
||||
"Header 1": "My Header 1",
|
||||
"Header 2": "Header 2",
|
||||
"Header 2": "Header 2 Again",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
@ -1614,12 +1653,24 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
|
||||
page_content="Content for header 2",
|
||||
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
|
||||
),
|
||||
Document(
|
||||
page_content="Content for header 3",
|
||||
metadata={
|
||||
"Header 1": "My Header 1",
|
||||
"Header 2": "Header 2",
|
||||
"Header 3": "Header 3",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="This should be tagged with Header 1 and Header 2 Again",
|
||||
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
|
||||
),
|
||||
Document(
|
||||
page_content="```python",
|
||||
metadata={
|
||||
"Code": "python",
|
||||
"Header 1": "My Header 1",
|
||||
"Header 2": "Header 2",
|
||||
"Header 2": "Header 2 Again",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
@ -1627,7 +1678,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
|
||||
metadata={
|
||||
"Code": "python",
|
||||
"Header 1": "My Header 1",
|
||||
"Header 2": "Header 2",
|
||||
"Header 2": "Header 2 Again",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
@ -1635,7 +1686,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
|
||||
metadata={
|
||||
"Code": "python",
|
||||
"Header 1": "My Header 1",
|
||||
"Header 2": "Header 2",
|
||||
"Header 2": "Header 2 Again",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
@ -1643,7 +1694,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
|
||||
metadata={
|
||||
"Code": "python",
|
||||
"Header 1": "My Header 1",
|
||||
"Header 2": "Header 2",
|
||||
"Header 2": "Header 2 Again",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
|
Loading…
Reference in New Issue
Block a user