mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 15:19:33 +00:00
text-splitters: fix state persistence issue in ExperimentalMarkdownSyntaxTextSplitter (#28373)
- **Description:** This PR fixes a bug in the `ExperimentalMarkdownSyntaxTextSplitter` class, which retained its internal state across multiple calls to the `split_text` method. Chunks from earlier calls accumulated in instance attributes on `self`, leading to incorrect output when several Markdown files were processed sequentially with the same splitter.
  - Modified `libs/text-splitters/langchain_text_splitters/markdown.py` to reset the relevant internal attributes at the start of each `split_text` invocation, so that each call processes its input independently.
  - Added unit tests in `libs/text-splitters/tests/unit_tests/test_text_splitters.py` to verify the fix and ensure that state does not persist across calls.
- **Issue:** Fixes [#26440](https://github.com/langchain-ai/langchain/issues/26440).
- **Dependencies:** No additional dependencies are introduced by this change.
- [x] Unit tests were added to verify the changes.
- [x] Updated documentation where necessary.
- [x] Ran `make format`, `make lint`, and `make test` to ensure compliance with project standards.

---------

Co-authored-by: Angel Chen <angelchen396@gmail.com>
Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
parent
7c8f977695
commit
3256b5d6ae
@ -324,6 +324,11 @@ class ExperimentalMarkdownSyntaxTextSplitter:
|
||||
chunks of the input text. If `return_each_line` is enabled, each line
|
||||
is returned as a separate `Document`.
|
||||
"""
|
||||
# Reset the state for each new file processed
|
||||
self.chunks.clear()
|
||||
self.current_chunk = Document(page_content="")
|
||||
self.current_header_stack.clear()
|
||||
|
||||
raw_lines = text.splitlines(keepends=True)
|
||||
|
||||
while raw_lines:
|
||||
|
@ -1527,6 +1527,407 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
# Two markdown inputs that are identical except for the document number that
# appears in every header and content line. The multi-file tests feed them, in
# order, to a single splitter instance so that any state carried over from the
# first call would show up in the chunks produced for the second.
EXPERIMENTAL_MARKDOWN_DOCUMENTS = [
    (
        f"# My Header 1 From Document {doc_number}\n"
        f"Content for header 1 from Document {doc_number}\n"
        f"## Header 2 From Document {doc_number}\n"
        f"Content for header 2 from Document {doc_number}\n"
        "```python\n"
        "def func_definition():\n"
        " print('Keep the whitespace consistent')\n"
        "```\n"
        f"# Header 1 again From Document {doc_number}\n"
        "We should also split on the horizontal line\n"
        "----\n"
        "This will be a new doc but with the same header metadata\n\n"
        "And it includes a new paragraph"
    )
    for doc_number in (1, 2)
]
|
||||
|
||||
|
||||
def test_experimental_markdown_syntax_text_splitter_on_multi_files() -> None:
    """Default splitter reused on two files must keep their chunks separate.

    One splitter instance processes both sample documents in turn. The
    combined result must be the chunks of document 1 followed by the chunks
    of document 2, each carrying only the headers of its own document.
    """
    splitter = ExperimentalMarkdownSyntaxTextSplitter()
    actual = []
    for markdown_text in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
        actual.extend(splitter.split_text(markdown_text))

    # Content that is the same for both documents.
    code_block = (
        "```python\ndef func_definition():\n "
        "print('Keep the whitespace consistent')\n```\n"
    )
    closing_text = (
        "This will be a new doc but with the same header metadata\n\n"
        "And it includes a new paragraph"
    )

    expected = []
    for doc_number in (1, 2):
        source = f"Document {doc_number}"
        first_h1 = {"Header 1": f"My Header 1 From {source}"}
        first_h2 = {**first_h1, "Header 2": f"Header 2 From {source}"}
        second_h1 = {"Header 1": f"Header 1 again From {source}"}
        expected += [
            Document(
                page_content=f"Content for header 1 from {source}\n",
                metadata=dict(first_h1),
            ),
            Document(
                page_content=f"Content for header 2 from {source}\n",
                metadata=dict(first_h2),
            ),
            Document(
                page_content=code_block,
                metadata={"Code": "python", **first_h2},
            ),
            Document(
                page_content="We should also split on the horizontal line\n",
                metadata=dict(second_h1),
            ),
            Document(page_content=closing_text, metadata=dict(second_h1)),
        ]

    assert actual == expected
|
||||
|
||||
|
||||
def test_experimental_markdown_syntax_text_splitter_split_lines_on_multi_files() -> (
    None
):
    """Line-by-line splitting of two files in a row must not mix their chunks.

    A single splitter created with ``return_each_line=True`` handles both
    sample documents. Every expected chunk is one line of content paired with
    the header (and, inside the fenced block, code) metadata of its own
    document.
    """
    splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)
    actual = []
    for markdown_text in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
        actual.extend(splitter.split_text(markdown_text))

    expected = []
    for doc_number in (1, 2):
        source = f"Document {doc_number}"
        first_h1 = {"Header 1": f"My Header 1 From {source}"}
        first_h2 = {**first_h1, "Header 2": f"Header 2 From {source}"}
        in_code = {"Code": "python", **first_h2}
        second_h1 = {"Header 1": f"Header 1 again From {source}"}

        # (line content, metadata) in the order the lines appear in the input.
        lines_with_metadata = [
            (f"Content for header 1 from {source}", first_h1),
            (f"Content for header 2 from {source}", first_h2),
            ("```python", in_code),
            ("def func_definition():", in_code),
            (" print('Keep the whitespace consistent')", in_code),
            ("```", in_code),
            ("We should also split on the horizontal line", second_h1),
            ("This will be a new doc but with the same header metadata", second_h1),
            ("And it includes a new paragraph", second_h1),
        ]
        expected.extend(
            Document(page_content=line, metadata=dict(metadata))
            for line, metadata in lines_with_metadata
        )

    assert actual == expected
|
||||
|
||||
|
||||
def test_experimental_markdown_syntax_text_splitter_with_header_on_multi_files() -> (
    None
):
    """Splitting two files with ``strip_headers=False`` must not share state.

    The same splitter instance processes both sample documents. Header lines
    are expected to stay in the chunk content, and each chunk must carry only
    the header metadata of the document it came from.
    """
    splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)
    actual = []
    for markdown_text in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
        actual.extend(splitter.split_text(markdown_text))

    # Content that is the same for both documents.
    code_block = (
        "```python\ndef func_definition():\n "
        "print('Keep the whitespace consistent')\n```\n"
    )
    closing_text = (
        "This will be a new doc but with the same header metadata\n\n"
        "And it includes a new paragraph"
    )

    expected = []
    for doc_number in (1, 2):
        source = f"Document {doc_number}"
        first_h1 = {"Header 1": f"My Header 1 From {source}"}
        first_h2 = {**first_h1, "Header 2": f"Header 2 From {source}"}
        second_h1 = {"Header 1": f"Header 1 again From {source}"}
        expected += [
            Document(
                page_content=(
                    f"# My Header 1 From {source}\n"
                    f"Content for header 1 from {source}\n"
                ),
                metadata=dict(first_h1),
            ),
            Document(
                page_content=(
                    f"## Header 2 From {source}\n"
                    f"Content for header 2 from {source}\n"
                ),
                metadata=dict(first_h2),
            ),
            Document(
                page_content=code_block,
                metadata={"Code": "python", **first_h2},
            ),
            Document(
                page_content=(
                    f"# Header 1 again From {source}\n"
                    "We should also split on the horizontal line\n"
                ),
                metadata=dict(second_h1),
            ),
            Document(page_content=closing_text, metadata=dict(second_h1)),
        ]

    assert actual == expected
|
||||
|
||||
|
||||
def test_experimental_markdown_syntax_text_splitter_header_config_on_multi_files() -> (
    None
):
    """Custom header configuration reused on two files must not share state.

    Only ``#`` is configured as a split header, under the metadata key
    ``"Encabezamiento 1"``. The expected chunks therefore keep the ``##``
    lines as plain content, and each chunk carries only the header metadata
    of the document it came from.
    """
    splitter = ExperimentalMarkdownSyntaxTextSplitter(
        headers_to_split_on=[("#", "Encabezamiento 1")]
    )
    actual = []
    for markdown_text in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
        actual.extend(splitter.split_text(markdown_text))

    # Content that is the same for both documents.
    code_block = (
        "```python\ndef func_definition():\n "
        "print('Keep the whitespace consistent')\n```\n"
    )
    closing_text = (
        "This will be a new doc but with the same header metadata\n\n"
        "And it includes a new paragraph"
    )

    expected = []
    for doc_number in (1, 2):
        source = f"Document {doc_number}"
        first_header = {"Encabezamiento 1": f"My Header 1 From {source}"}
        second_header = {"Encabezamiento 1": f"Header 1 again From {source}"}
        expected += [
            Document(
                page_content=(
                    f"Content for header 1 from {source}\n"
                    f"## Header 2 From {source}\n"
                    f"Content for header 2 from {source}\n"
                ),
                metadata=dict(first_header),
            ),
            Document(
                page_content=code_block,
                metadata={"Code": "python", **first_header},
            ),
            Document(
                page_content="We should also split on the horizontal line\n",
                metadata=dict(second_header),
            ),
            Document(page_content=closing_text, metadata=dict(second_header)),
        ]

    assert actual == expected
|
||||
|
||||
|
||||
def test_solidity_code_splitter() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
|
Loading…
Reference in New Issue
Block a user