mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-07 13:40:46 +00:00
Fix MarkdownHeaderTextSplitter not recognizing tilde-fenced code blocks (#13511)
- **Description:** Previously `MarkdownHeaderTextSplitter` did not recognize tilde-fenced code blocks (https://spec.commonmark.org/0.30/#fenced-code-blocks). This PR fixes that.
  ````md
  # Bug caused by previous implementation:
  ~~~py
  foo()
  # This is a comment that would be considered header
  bar()
  ~~~
  ````
- **Tag maintainer:** @baskaryan
This commit is contained in:
parent
7929b26017
commit
2703a1b061
@ -389,16 +389,23 @@ class MarkdownHeaderTextSplitter:
|
||||
initial_metadata: Dict[str, str] = {}
|
||||
|
||||
in_code_block = False
|
||||
opening_fence = ""
|
||||
|
||||
for line in lines:
|
||||
stripped_line = line.strip()
|
||||
|
||||
if stripped_line.startswith("```"):
|
||||
# code block in one row
|
||||
if stripped_line.count("```") >= 2:
|
||||
if not in_code_block:
|
||||
# Exclude inline code spans
|
||||
if stripped_line.startswith("```") and stripped_line.count("```") == 1:
|
||||
in_code_block = True
|
||||
opening_fence = "```"
|
||||
elif stripped_line.startswith("~~~"):
|
||||
in_code_block = True
|
||||
opening_fence = "~~~"
|
||||
else:
|
||||
if stripped_line.startswith(opening_fence):
|
||||
in_code_block = False
|
||||
else:
|
||||
in_code_block = not in_code_block
|
||||
opening_fence = ""
|
||||
|
||||
if in_code_block:
|
||||
current_content.append(stripped_line)
|
||||
|
@ -1031,6 +1031,77 @@ def test_md_header_text_splitter_3() -> None:
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fence", ["```", "~~~"])
def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
    """Check that a '#' line inside a fenced code block is not split as a header.

    Runs once per fence style (backticks and tildes). The expected result is a
    single document carrying the fenced block verbatim under "Header 1".
    """
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
        ],
    )

    # One real header followed by a fenced block that contains a '#' comment.
    source = (
        "# This is a Header\n\n"
        f"{fence}\n"
        "foo()\n"
        "# Not a header\n"
        "bar()\n"
        f"{fence}"
    )
    chunks = splitter.split_text(source)

    wanted = Document(
        page_content=f"{fence}\nfoo()\n# Not a header\nbar()\n{fence}",
        metadata={"Header 1": "This is a Header"},
    )

    assert chunks == [wanted]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["fence", "other_fence"], [("```", "~~~"), ("~~~", "```")])
def test_md_header_text_splitter_fenced_code_block_interleaved(
    fence: str, other_fence: str
) -> None:
    """Check that a different fence marker inside an open code block is ignored.

    The block is opened and closed with ``fence``; a lone ``other_fence`` line
    appears in between. The expected result is a single document holding the
    whole block, including both '#' lines, under "Header 1".
    """
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
        ],
    )

    # A '#' line sits on each side of the foreign fence marker.
    source = (
        "# This is a Header\n\n"
        f"{fence}\n"
        "foo\n"
        "# Not a header\n"
        f"{other_fence}\n"
        "# Not a header\n"
        f"{fence}"
    )
    chunks = splitter.split_text(source)

    wanted = Document(
        page_content=(
            f"{fence}\nfoo\n# Not a header\n{other_fence}\n# Not a header\n{fence}"
        ),
        metadata={"Header 1": "This is a Header"},
    )

    assert chunks == [wanted]
|
||||
|
||||
|
||||
def test_solidity_code_splitter() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
|
Loading…
Reference in New Issue
Block a user