text-splitters: fix stale header metadata in ExperimentalMarkdownSyntaxTextSplitter (#31622)

**Description:**

Previously, when transitioning from a deeper Markdown header (e.g., ###)
to a shallower one (e.g., ##), the
ExperimentalMarkdownSyntaxTextSplitter retained the deeper header in the
metadata.

This commit updates the `_resolve_header_stack` method to truncate the
header stack at the first entry whose level is the same as or deeper than
the incoming header, and only then append the incoming header. As a
result, each chunk now reflects only the active header context.

This fixes unexpected metadata leakage across sections in nested Markdown
documents.

Additionally, test cases have been updated to:
- Validate correct header resolution and metadata assignment.
- Cover edge cases with nested headers and horizontal rules.

**Issue:** 
Fixes [#31596](https://github.com/langchain-ai/langchain/issues/31596)

**Dependencies:**
None

**Twitter handle:** -> [_RaghuKapur](https://twitter.com/_RaghuKapur)

**LinkedIn:** ->
[https://www.linkedin.com/in/raghukapur/](https://www.linkedin.com/in/raghukapur/)
This commit is contained in:
Raghu Kapur 2025-06-20 15:52:17 -04:00 committed by GitHub
parent 9d4d258162
commit 2c9859956a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 62 additions and 11 deletions

View File

@ -376,10 +376,10 @@ class ExperimentalMarkdownSyntaxTextSplitter:
# Update self.current_header_stack, a list of (depth, text) tuples, for a newly
# encountered header of depth `header_depth` and text `header_text`.
# NOTE(review): this hunk is rendered without +/- markers or indentation. Going
# by the commit description, the `depth == header_depth` branch looks like the
# removed (pre-fix) code and the `depth >= header_depth` branch like the added
# (post-fix) code -- confirm against the upstream commit.
def _resolve_header_stack(self, header_depth: int, header_text: str) -> None:
# Scan the stack from the first entry, looking at each entry's depth.
for i, (depth, _) in enumerate(self.current_header_stack):
# Presumably the old logic: on an exact depth match, overwrite that entry,
# keep the stack up to and including it, and return without appending.
if depth == header_depth:
self.current_header_stack[i] = (header_depth, header_text)
self.current_header_stack = self.current_header_stack[: i + 1]
return
# Presumably the new logic: the first entry at the same or a deeper level
# marks the cut point; it and everything after it are dropped.
if depth >= header_depth:
# Truncate everything from this level onward
self.current_header_stack = self.current_header_stack[:i]
break
# Reached when no branch above returned: push the new header onto the stack.
self.current_header_stack.append((header_depth, header_text))
def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:

View File

@ -1464,6 +1464,10 @@ EXPERIMENTAL_MARKDOWN_DOCUMENT = (
"Content for header 1\n"
"## Header 2\n"
"Content for header 2\n"
"### Header 3\n"
"Content for header 3\n"
"## Header 2 Again\n"
"This should be tagged with Header 1 and Header 2 Again\n"
"```python\n"
"def func_definition():\n"
" print('Keep the whitespace consistent')\n"
@ -1491,6 +1495,18 @@ def test_experimental_markdown_syntax_text_splitter() -> None:
page_content="Content for header 2\n",
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
),
Document(
page_content="Content for header 3\n",
metadata={
"Header 1": "My Header 1",
"Header 2": "Header 2",
"Header 3": "Header 3",
},
),
Document(
page_content="This should be tagged with Header 1 and Header 2 Again\n",
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
),
Document(
page_content=(
"```python\ndef func_definition():\n "
@ -1499,7 +1515,7 @@ def test_experimental_markdown_syntax_text_splitter() -> None:
metadata={
"Code": "python",
"Header 1": "My Header 1",
"Header 2": "Header 2",
"Header 2": "Header 2 Again",
},
),
Document(
@ -1530,7 +1546,15 @@ def test_experimental_markdown_syntax_text_splitter_header_configuration() -> No
expected_output = [
Document(
page_content="Content for header 1\n## Header 2\nContent for header 2\n",
page_content=(
"Content for header 1\n"
"## Header 2\n"
"Content for header 2\n"
"### Header 3\n"
"Content for header 3\n"
"## Header 2 Again\n"
"This should be tagged with Header 1 and Header 2 Again\n"
),
metadata={"Encabezamiento 1": "My Header 1"},
),
Document(
@ -1571,6 +1595,21 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
page_content="## Header 2\nContent for header 2\n",
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
),
Document(
page_content="### Header 3\nContent for header 3\n",
metadata={
"Header 1": "My Header 1",
"Header 2": "Header 2",
"Header 3": "Header 3",
},
),
Document(
page_content=(
"## Header 2 Again\n"
"This should be tagged with Header 1 and Header 2 Again\n"
),
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
),
Document(
page_content=(
"```python\ndef func_definition():\n "
@ -1579,7 +1618,7 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
metadata={
"Code": "python",
"Header 1": "My Header 1",
"Header 2": "Header 2",
"Header 2": "Header 2 Again",
},
),
Document(
@ -1614,12 +1653,24 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
page_content="Content for header 2",
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
),
Document(
page_content="Content for header 3",
metadata={
"Header 1": "My Header 1",
"Header 2": "Header 2",
"Header 3": "Header 3",
},
),
Document(
page_content="This should be tagged with Header 1 and Header 2 Again",
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
),
Document(
page_content="```python",
metadata={
"Code": "python",
"Header 1": "My Header 1",
"Header 2": "Header 2",
"Header 2": "Header 2 Again",
},
),
Document(
@ -1627,7 +1678,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
metadata={
"Code": "python",
"Header 1": "My Header 1",
"Header 2": "Header 2",
"Header 2": "Header 2 Again",
},
),
Document(
@ -1635,7 +1686,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
metadata={
"Code": "python",
"Header 1": "My Header 1",
"Header 2": "Header 2",
"Header 2": "Header 2 Again",
},
),
Document(
@ -1643,7 +1694,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
metadata={
"Code": "python",
"Header 1": "My Header 1",
"Header 2": "Header 2",
"Header 2": "Header 2 Again",
},
),
Document(