mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-10 13:27:36 +00:00
text-splitters: fix stale header metadata in ExperimentalMarkdownSyntaxTextSplitter (#31622)
**Description:** Previously, when transitioning from a deeper Markdown header (e.g., `###`) to a shallower one (e.g., `##`), the `ExperimentalMarkdownSyntaxTextSplitter` retained the deeper header in the metadata. This commit updates the `_resolve_header_stack` method to remove headers at the same or deeper levels from the header stack before appending the current header. As a result, each chunk now reflects only the active header context, which fixes the unexpected metadata leakage across sections in nested Markdown documents.

Additionally, test cases have been updated to:
- Validate correct header resolution and metadata assignment.
- Cover edge cases with nested headers and horizontal rules.

**Issue:** Fixes [#31596](https://github.com/langchain-ai/langchain/issues/31596)
**Dependencies:** None
**Twitter handle:** -> [_RaghuKapur](https://twitter.com/_RaghuKapur)
**LinkedIn:** -> [https://www.linkedin.com/in/raghukapur/](https://www.linkedin.com/in/raghukapur/)
This commit is contained in:
parent
9d4d258162
commit
2c9859956a
@ -376,10 +376,10 @@ class ExperimentalMarkdownSyntaxTextSplitter:
|
|||||||
|
|
||||||
def _resolve_header_stack(self, header_depth: int, header_text: str) -> None:
|
def _resolve_header_stack(self, header_depth: int, header_text: str) -> None:
|
||||||
for i, (depth, _) in enumerate(self.current_header_stack):
|
for i, (depth, _) in enumerate(self.current_header_stack):
|
||||||
if depth == header_depth:
|
if depth >= header_depth:
|
||||||
self.current_header_stack[i] = (header_depth, header_text)
|
# Truncate everything from this level onward
|
||||||
self.current_header_stack = self.current_header_stack[: i + 1]
|
self.current_header_stack = self.current_header_stack[:i]
|
||||||
return
|
break
|
||||||
self.current_header_stack.append((header_depth, header_text))
|
self.current_header_stack.append((header_depth, header_text))
|
||||||
|
|
||||||
def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:
|
def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:
|
||||||
|
@ -1464,6 +1464,10 @@ EXPERIMENTAL_MARKDOWN_DOCUMENT = (
|
|||||||
"Content for header 1\n"
|
"Content for header 1\n"
|
||||||
"## Header 2\n"
|
"## Header 2\n"
|
||||||
"Content for header 2\n"
|
"Content for header 2\n"
|
||||||
|
"### Header 3\n"
|
||||||
|
"Content for header 3\n"
|
||||||
|
"## Header 2 Again\n"
|
||||||
|
"This should be tagged with Header 1 and Header 2 Again\n"
|
||||||
"```python\n"
|
"```python\n"
|
||||||
"def func_definition():\n"
|
"def func_definition():\n"
|
||||||
" print('Keep the whitespace consistent')\n"
|
" print('Keep the whitespace consistent')\n"
|
||||||
@ -1491,6 +1495,18 @@ def test_experimental_markdown_syntax_text_splitter() -> None:
|
|||||||
page_content="Content for header 2\n",
|
page_content="Content for header 2\n",
|
||||||
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
|
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
|
||||||
),
|
),
|
||||||
|
Document(
|
||||||
|
page_content="Content for header 3\n",
|
||||||
|
metadata={
|
||||||
|
"Header 1": "My Header 1",
|
||||||
|
"Header 2": "Header 2",
|
||||||
|
"Header 3": "Header 3",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content="This should be tagged with Header 1 and Header 2 Again\n",
|
||||||
|
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
|
||||||
|
),
|
||||||
Document(
|
Document(
|
||||||
page_content=(
|
page_content=(
|
||||||
"```python\ndef func_definition():\n "
|
"```python\ndef func_definition():\n "
|
||||||
@ -1499,7 +1515,7 @@ def test_experimental_markdown_syntax_text_splitter() -> None:
|
|||||||
metadata={
|
metadata={
|
||||||
"Code": "python",
|
"Code": "python",
|
||||||
"Header 1": "My Header 1",
|
"Header 1": "My Header 1",
|
||||||
"Header 2": "Header 2",
|
"Header 2": "Header 2 Again",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
Document(
|
Document(
|
||||||
@ -1530,7 +1546,15 @@ def test_experimental_markdown_syntax_text_splitter_header_configuration() -> No
|
|||||||
|
|
||||||
expected_output = [
|
expected_output = [
|
||||||
Document(
|
Document(
|
||||||
page_content="Content for header 1\n## Header 2\nContent for header 2\n",
|
page_content=(
|
||||||
|
"Content for header 1\n"
|
||||||
|
"## Header 2\n"
|
||||||
|
"Content for header 2\n"
|
||||||
|
"### Header 3\n"
|
||||||
|
"Content for header 3\n"
|
||||||
|
"## Header 2 Again\n"
|
||||||
|
"This should be tagged with Header 1 and Header 2 Again\n"
|
||||||
|
),
|
||||||
metadata={"Encabezamiento 1": "My Header 1"},
|
metadata={"Encabezamiento 1": "My Header 1"},
|
||||||
),
|
),
|
||||||
Document(
|
Document(
|
||||||
@ -1571,6 +1595,21 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
|
|||||||
page_content="## Header 2\nContent for header 2\n",
|
page_content="## Header 2\nContent for header 2\n",
|
||||||
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
|
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
|
||||||
),
|
),
|
||||||
|
Document(
|
||||||
|
page_content="### Header 3\nContent for header 3\n",
|
||||||
|
metadata={
|
||||||
|
"Header 1": "My Header 1",
|
||||||
|
"Header 2": "Header 2",
|
||||||
|
"Header 3": "Header 3",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content=(
|
||||||
|
"## Header 2 Again\n"
|
||||||
|
"This should be tagged with Header 1 and Header 2 Again\n"
|
||||||
|
),
|
||||||
|
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
|
||||||
|
),
|
||||||
Document(
|
Document(
|
||||||
page_content=(
|
page_content=(
|
||||||
"```python\ndef func_definition():\n "
|
"```python\ndef func_definition():\n "
|
||||||
@ -1579,7 +1618,7 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
|
|||||||
metadata={
|
metadata={
|
||||||
"Code": "python",
|
"Code": "python",
|
||||||
"Header 1": "My Header 1",
|
"Header 1": "My Header 1",
|
||||||
"Header 2": "Header 2",
|
"Header 2": "Header 2 Again",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
Document(
|
Document(
|
||||||
@ -1614,12 +1653,24 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
|
|||||||
page_content="Content for header 2",
|
page_content="Content for header 2",
|
||||||
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
|
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
|
||||||
),
|
),
|
||||||
|
Document(
|
||||||
|
page_content="Content for header 3",
|
||||||
|
metadata={
|
||||||
|
"Header 1": "My Header 1",
|
||||||
|
"Header 2": "Header 2",
|
||||||
|
"Header 3": "Header 3",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content="This should be tagged with Header 1 and Header 2 Again",
|
||||||
|
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
|
||||||
|
),
|
||||||
Document(
|
Document(
|
||||||
page_content="```python",
|
page_content="```python",
|
||||||
metadata={
|
metadata={
|
||||||
"Code": "python",
|
"Code": "python",
|
||||||
"Header 1": "My Header 1",
|
"Header 1": "My Header 1",
|
||||||
"Header 2": "Header 2",
|
"Header 2": "Header 2 Again",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
Document(
|
Document(
|
||||||
@ -1627,7 +1678,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
|
|||||||
metadata={
|
metadata={
|
||||||
"Code": "python",
|
"Code": "python",
|
||||||
"Header 1": "My Header 1",
|
"Header 1": "My Header 1",
|
||||||
"Header 2": "Header 2",
|
"Header 2": "Header 2 Again",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
Document(
|
Document(
|
||||||
@ -1635,7 +1686,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
|
|||||||
metadata={
|
metadata={
|
||||||
"Code": "python",
|
"Code": "python",
|
||||||
"Header 1": "My Header 1",
|
"Header 1": "My Header 1",
|
||||||
"Header 2": "Header 2",
|
"Header 2": "Header 2 Again",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
Document(
|
Document(
|
||||||
@ -1643,7 +1694,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
|
|||||||
metadata={
|
metadata={
|
||||||
"Code": "python",
|
"Code": "python",
|
||||||
"Header 1": "My Header 1",
|
"Header 1": "My Header 1",
|
||||||
"Header 2": "Header 2",
|
"Header 2": "Header 2 Again",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
Document(
|
Document(
|
||||||
|
Loading…
Reference in New Issue
Block a user