mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-09 06:24:47 +00:00
Fix MarkdownHeaderTextSplitter
not recognizing tilde-fenced code blocks (#13511)
- **Description:** Previously `MarkdownHeaderTextSplitter` did not consider tilde-fenced code blocks (https://spec.commonmark.org/0.30/#fenced-code-blocks). This PR fixes that. ````md # Bug caused by previous implementation: ~~~py foo() # This is a comment that would be considered header bar() ~~~ ```` - **Tag maintainer:** @baskaryan
This commit is contained in:
parent
7929b26017
commit
2703a1b061
@ -389,16 +389,23 @@ class MarkdownHeaderTextSplitter:
|
|||||||
initial_metadata: Dict[str, str] = {}
|
initial_metadata: Dict[str, str] = {}
|
||||||
|
|
||||||
in_code_block = False
|
in_code_block = False
|
||||||
|
opening_fence = ""
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
stripped_line = line.strip()
|
stripped_line = line.strip()
|
||||||
|
|
||||||
if stripped_line.startswith("```"):
|
if not in_code_block:
|
||||||
# code block in one row
|
# Exclude inline code spans
|
||||||
if stripped_line.count("```") >= 2:
|
if stripped_line.startswith("```") and stripped_line.count("```") == 1:
|
||||||
in_code_block = False
|
in_code_block = True
|
||||||
|
opening_fence = "```"
|
||||||
|
elif stripped_line.startswith("~~~"):
|
||||||
|
in_code_block = True
|
||||||
|
opening_fence = "~~~"
|
||||||
else:
|
else:
|
||||||
in_code_block = not in_code_block
|
if stripped_line.startswith(opening_fence):
|
||||||
|
in_code_block = False
|
||||||
|
opening_fence = ""
|
||||||
|
|
||||||
if in_code_block:
|
if in_code_block:
|
||||||
current_content.append(stripped_line)
|
current_content.append(stripped_line)
|
||||||
|
@ -1031,6 +1031,77 @@ def test_md_header_text_splitter_3() -> None:
|
|||||||
assert output == expected_output
|
assert output == expected_output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("fence", [("```"), ("~~~")])
|
||||||
|
def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
|
||||||
|
"""Test markdown splitter by header: Fenced code block."""
|
||||||
|
|
||||||
|
markdown_document = (
|
||||||
|
"# This is a Header\n\n"
|
||||||
|
f"{fence}\n"
|
||||||
|
"foo()\n"
|
||||||
|
"# Not a header\n"
|
||||||
|
"bar()\n"
|
||||||
|
f"{fence}"
|
||||||
|
)
|
||||||
|
|
||||||
|
headers_to_split_on = [
|
||||||
|
("#", "Header 1"),
|
||||||
|
("##", "Header 2"),
|
||||||
|
]
|
||||||
|
|
||||||
|
markdown_splitter = MarkdownHeaderTextSplitter(
|
||||||
|
headers_to_split_on=headers_to_split_on,
|
||||||
|
)
|
||||||
|
output = markdown_splitter.split_text(markdown_document)
|
||||||
|
|
||||||
|
expected_output = [
|
||||||
|
Document(
|
||||||
|
page_content=f"{fence}\nfoo()\n# Not a header\nbar()\n{fence}",
|
||||||
|
metadata={"Header 1": "This is a Header"},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert output == expected_output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(["fence", "other_fence"], [("```", "~~~"), ("~~~", "```")])
|
||||||
|
def test_md_header_text_splitter_fenced_code_block_interleaved(
|
||||||
|
fence: str, other_fence: str
|
||||||
|
) -> None:
|
||||||
|
"""Test markdown splitter by header: Interleaved fenced code block."""
|
||||||
|
|
||||||
|
markdown_document = (
|
||||||
|
"# This is a Header\n\n"
|
||||||
|
f"{fence}\n"
|
||||||
|
"foo\n"
|
||||||
|
"# Not a header\n"
|
||||||
|
f"{other_fence}\n"
|
||||||
|
"# Not a header\n"
|
||||||
|
f"{fence}"
|
||||||
|
)
|
||||||
|
|
||||||
|
headers_to_split_on = [
|
||||||
|
("#", "Header 1"),
|
||||||
|
("##", "Header 2"),
|
||||||
|
]
|
||||||
|
|
||||||
|
markdown_splitter = MarkdownHeaderTextSplitter(
|
||||||
|
headers_to_split_on=headers_to_split_on,
|
||||||
|
)
|
||||||
|
output = markdown_splitter.split_text(markdown_document)
|
||||||
|
|
||||||
|
expected_output = [
|
||||||
|
Document(
|
||||||
|
page_content=(
|
||||||
|
f"{fence}\nfoo\n# Not a header\n{other_fence}\n# Not a header\n{fence}"
|
||||||
|
),
|
||||||
|
metadata={"Header 1": "This is a Header"},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert output == expected_output
|
||||||
|
|
||||||
|
|
||||||
def test_solidity_code_splitter() -> None:
|
def test_solidity_code_splitter() -> None:
|
||||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||||
Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||||
|
Loading…
Reference in New Issue
Block a user