From 2c9859956a002eebc238f0ea221b76e9cc02ebf9 Mon Sep 17 00:00:00 2001 From: Raghu Kapur <64493087+raghukapur9@users.noreply.github.com> Date: Fri, 20 Jun 2025 15:52:17 -0400 Subject: [PATCH] text-splitters: fix stale header metadata in ExperimentalMarkdownSyntaxTextSplitter (#31622) **Description:** Previously, when transitioning from a deeper Markdown header (e.g., ###) to a shallower one (e.g., ##), the ExperimentalMarkdownSyntaxTextSplitter retained the deeper header in the metadata. This commit updates the `_resolve_header_stack` method to remove headers at the same or deeper levels before appending the current header. As a result, each chunk now reflects only the active header context. Fixes unexpected metadata leakage across sections in nested Markdown documents. Additionally, test cases have been updated to: - Validate correct header resolution and metadata assignment. - Cover edge cases with nested headers and horizontal rules. **Issue:** Fixes [#31596](https://github.com/langchain-ai/langchain/issues/31596) **Dependencies:** None **Twitter handle:** -> [_RaghuKapur](https://twitter.com/_RaghuKapur) **LinkedIn:** -> [https://www.linkedin.com/in/raghukapur/](https://www.linkedin.com/in/raghukapur/) --- .../langchain_text_splitters/markdown.py | 8 +-- .../tests/unit_tests/test_text_splitters.py | 65 +++++++++++++++++-- 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/markdown.py b/libs/text-splitters/langchain_text_splitters/markdown.py index 1db585bef46..ae885bbb0ab 100644 --- a/libs/text-splitters/langchain_text_splitters/markdown.py +++ b/libs/text-splitters/langchain_text_splitters/markdown.py @@ -376,10 +376,10 @@ class ExperimentalMarkdownSyntaxTextSplitter: def _resolve_header_stack(self, header_depth: int, header_text: str) -> None: for i, (depth, _) in enumerate(self.current_header_stack): - if depth == header_depth: - self.current_header_stack[i] = (header_depth, header_text) - self.current_header_stack = self.current_header_stack[: i + 1] - return + if depth >= header_depth: + # Truncate everything from this level onward + self.current_header_stack = self.current_header_stack[:i] + break self.current_header_stack.append((header_depth, header_text)) def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str: diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 85c6caace15..9aeb4b1520a 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -1464,6 +1464,10 @@ EXPERIMENTAL_MARKDOWN_DOCUMENT = ( "Content for header 1\n" "## Header 2\n" "Content for header 2\n" + "### Header 3\n" + "Content for header 3\n" + "## Header 2 Again\n" + "This should be tagged with Header 1 and Header 2 Again\n" "```python\n" "def func_definition():\n" " print('Keep the whitespace consistent')\n" @@ -1491,6 +1495,18 @@ def test_experimental_markdown_syntax_text_splitter() -> None: page_content="Content for header 2\n", metadata={"Header 1": "My Header 1", "Header 2": "Header 2"}, ), + Document( + page_content="Content for header 3\n", + metadata={ + "Header 1": "My Header 1", + "Header 2": "Header 2", + "Header 3": "Header 3", + }, + ), + Document( + page_content="This should be tagged with Header 1 and Header 2 Again\n", + metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"}, + ), Document( page_content=( "```python\ndef func_definition():\n " @@ -1499,7 +1515,7 @@ def test_experimental_markdown_syntax_text_splitter() -> None: metadata={ "Code": "python", "Header 1": "My Header 1", - "Header 2": "Header 2", + "Header 2": "Header 2 Again", }, ), Document( @@ -1530,7 +1546,15 @@ def test_experimental_markdown_syntax_text_splitter_header_configuration() -> No expected_output = [ Document( - page_content="Content for header 1\n## Header 2\nContent for header 2\n", + page_content=( + "Content for header 1\n" + "## Header 2\n" + "Content for header 2\n" + "### Header 3\n" + "Content for header 3\n" + "## Header 2 Again\n" + "This should be tagged with Header 1 and Header 2 Again\n" + ), metadata={"Encabezamiento 1": "My Header 1"}, ), Document( @@ -1571,6 +1595,21 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None: page_content="## Header 2\nContent for header 2\n", metadata={"Header 1": "My Header 1", "Header 2": "Header 2"}, ), + Document( + page_content="### Header 3\nContent for header 3\n", + metadata={ + "Header 1": "My Header 1", + "Header 2": "Header 2", + "Header 3": "Header 3", + }, + ), + Document( + page_content=( + "## Header 2 Again\n" + "This should be tagged with Header 1 and Header 2 Again\n" + ), + metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"}, + ), Document( page_content=( "```python\ndef func_definition():\n " @@ -1579,7 +1618,7 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None: metadata={ "Code": "python", "Header 1": "My Header 1", - "Header 2": "Header 2", + "Header 2": "Header 2 Again", }, ), Document( @@ -1614,12 +1653,24 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None: page_content="Content for header 2", metadata={"Header 1": "My Header 1", "Header 2": "Header 2"}, ), + Document( + page_content="Content for header 3", + metadata={ + "Header 1": "My Header 1", + "Header 2": "Header 2", + "Header 3": "Header 3", + }, + ), + Document( + page_content="This should be tagged with Header 1 and Header 2 Again", + metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"}, + ), Document( page_content="```python", metadata={ "Code": "python", "Header 1": "My Header 1", - "Header 2": "Header 2", + "Header 2": "Header 2 Again", }, ), Document( @@ -1627,7 +1678,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None: metadata={ "Code": "python", "Header 1": "My Header 1", - "Header 2": "Header 2", + "Header 2": "Header 2 Again", }, ), Document( @@ -1635,7 +1686,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None: metadata={ "Code": "python", "Header 1": "My Header 1", - "Header 2": "Header 2", + "Header 2": "Header 2 Again", }, ), Document( @@ -1643,7 +1694,7 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None: metadata={ "Code": "python", "Header 1": "My Header 1", - "Header 2": "Header 2", + "Header 2": "Header 2 Again", }, ), Document(