text-splitters: fix state persistence issue in ExperimentalMarkdownSyntaxTextSplitter (#28373)

- **Description:** This PR resolves an issue in the `ExperimentalMarkdownSyntaxTextSplitter` class, which retained internal state across multiple calls to the `split_text` method. This behaviour caused chunks to accumulate unintentionally in instance attributes (`self.chunks`, `self.current_chunk`, and `self.current_header_stack`), leading to incorrect output when processing multiple Markdown files sequentially. - Modified `libs/text-splitters/langchain_text_splitters/markdown.py` to reset these internal attributes at the start of each `split_text` invocation, so that each call processes its input independently. - Added unit tests in `libs/text-splitters/tests/unit_tests/test_text_splitters.py` to verify the fix and ensure that state does not persist across calls. - **Issue:** Fixes [#26440](https://github.com/langchain-ai/langchain/issues/26440). - **Dependencies:** No additional dependencies are introduced by this change. - [x] Unit tests were added to verify the changes. - [x] Updated documentation where necessary. - [x] Ran `make format`, `make lint`, and `make test` to ensure compliance with project standards. --------- Co-authored-by: Angel Chen <angelchen396@gmail.com> Co-authored-by: Chester Curme <chester.curme@gmail.com>
2025-09-16 06:53:16 +00:00 · 2024-12-18 15:27:59 -05:00
parent 7c8f977695
commit 3256b5d6ae
2 changed files with 406 additions and 0 deletions
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@@ -324,6 +324,11 @@ class ExperimentalMarkdownSyntaxTextSplitter:
            chunks of the input text. If `return_each_line` is enabled, each line
            is returned as a separate `Document`.
        """
+        # Reset the state for each new file processed
+        self.chunks.clear()
+        self.current_chunk = Document(page_content="")
+        self.current_header_stack.clear()
+
        raw_lines = text.splitlines(keepends=True)

        while raw_lines:
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -1527,6 +1527,407 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
    assert output == expected_output


+EXPERIMENTAL_MARKDOWN_DOCUMENTS = [
+    (
+        "# My Header 1 From Document 1\n"
+        "Content for header 1 from Document 1\n"
+        "## Header 2 From Document 1\n"
+        "Content for header 2 from Document 1\n"
+        "```python\n"
+        "def func_definition():\n"
+        "   print('Keep the whitespace consistent')\n"
+        "```\n"
+        "# Header 1 again From Document 1\n"
+        "We should also split on the horizontal line\n"
+        "----\n"
+        "This will be a new doc but with the same header metadata\n\n"
+        "And it includes a new paragraph"
+    ),
+    (
+        "# My Header 1 From Document 2\n"
+        "Content for header 1 from Document 2\n"
+        "## Header 2 From Document 2\n"
+        "Content for header 2 from Document 2\n"
+        "```python\n"
+        "def func_definition():\n"
+        "   print('Keep the whitespace consistent')\n"
+        "```\n"
+        "# Header 1 again From Document 2\n"
+        "We should also split on the horizontal line\n"
+        "----\n"
+        "This will be a new doc but with the same header metadata\n\n"
+        "And it includes a new paragraph"
+    ),
+]
+
+
+def test_experimental_markdown_syntax_text_splitter_on_multi_files() -> None:
+    """Test experimental markdown syntax splitter split
+    on default called consecutively on two files."""
+    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter()
+    output = []
+    for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
+        output += markdown_splitter.split_text(experimental_markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="Content for header 1 from Document 1\n",
+            metadata={"Header 1": "My Header 1 From Document 1"},
+        ),
+        Document(
+            page_content="Content for header 2 from Document 1\n",
+            metadata={
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line\n",
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="Content for header 1 from Document 2\n",
+            metadata={"Header 1": "My Header 1 From Document 2"},
+        ),
+        Document(
+            page_content="Content for header 2 from Document 2\n",
+            metadata={
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line\n",
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+    ]
+
+    assert output == expected_output
+
+
+def test_experimental_markdown_syntax_text_splitter_split_lines_on_multi_files() -> (
+    None
+):
+    """Test experimental markdown syntax splitter split
+    on each line called consecutively on two files."""
+    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)
+    output = []
+    for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
+        output += markdown_splitter.split_text(experimental_markdown_document)
+    expected_output = [
+        Document(
+            page_content="Content for header 1 from Document 1",
+            metadata={"Header 1": "My Header 1 From Document 1"},
+        ),
+        Document(
+            page_content="Content for header 2 from Document 1",
+            metadata={
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="```python",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="def func_definition():",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="   print('Keep the whitespace consistent')",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="```",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line",
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="This will be a new doc but with the same header metadata",
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="And it includes a new paragraph",
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="Content for header 1 from Document 2",
+            metadata={"Header 1": "My Header 1 From Document 2"},
+        ),
+        Document(
+            page_content="Content for header 2 from Document 2",
+            metadata={
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="```python",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="def func_definition():",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="   print('Keep the whitespace consistent')",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="```",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line",
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+        Document(
+            page_content="This will be a new doc but with the same header metadata",
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+        Document(
+            page_content="And it includes a new paragraph",
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+    ]
+
+    assert output == expected_output
+
+
+def test_experimental_markdown_syntax_text_splitter_with_header_on_multi_files() -> (
+    None
+):
+    """Test experimental markdown splitter
+    by header called consecutively on two files"""
+
+    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)
+    output = []
+    for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
+        output += markdown_splitter.split_text(experimental_markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="# My Header 1 From Document 1\n"
+            "Content for header 1 from Document 1\n",
+            metadata={"Header 1": "My Header 1 From Document 1"},
+        ),
+        Document(
+            page_content="## Header 2 From Document 1\n"
+            "Content for header 2 from Document 1\n",
+            metadata={
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="# Header 1 again From Document 1\n"
+            "We should also split on the horizontal line\n",
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="# My Header 1 From Document 2\n"
+            "Content for header 1 from Document 2\n",
+            metadata={"Header 1": "My Header 1 From Document 2"},
+        ),
+        Document(
+            page_content="## Header 2 From Document 2\n"
+            "Content for header 2 from Document 2\n",
+            metadata={
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="# Header 1 again From Document 2\n"
+            "We should also split on the horizontal line\n",
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+    ]
+    assert output == expected_output
+
+
+def test_experimental_markdown_syntax_text_splitter_header_config_on_multi_files() -> (
+    None
+):
+    """Test experimental markdown splitter
+    by header configuration called consecutively on two files"""
+
+    headers_to_split_on = [("#", "Encabezamiento 1")]
+    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(
+        headers_to_split_on=headers_to_split_on
+    )
+    output = []
+    for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
+        output += markdown_splitter.split_text(experimental_markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="Content for header 1 from Document 1\n"
+            "## Header 2 From Document 1\n"
+            "Content for header 2 from Document 1\n",
+            metadata={"Encabezamiento 1": "My Header 1 From Document 1"},
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Encabezamiento 1": "My Header 1 From Document 1",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line\n",
+            metadata={"Encabezamiento 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Encabezamiento 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="Content for header 1 from Document 2\n"
+            "## Header 2 From Document 2\n"
+            "Content for header 2 from Document 2\n",
+            metadata={"Encabezamiento 1": "My Header 1 From Document 2"},
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Encabezamiento 1": "My Header 1 From Document 2",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line\n",
+            metadata={"Encabezamiento 1": "Header 1 again From Document 2"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Encabezamiento 1": "Header 1 again From Document 2"},
+        ),
+    ]
+
+    assert output == expected_output
+
+
 def test_solidity_code_splitter() -> None:
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0