mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 23:29:21 +00:00
text-splitters[patch]: fix MarkdownHeaderTextSplitter failing to parse headers with non-printable characters (#20645)
Description: MarkdownHeaderTextSplitter fails to parse headers that contain non-printable characters. See #20643 for more details.

The following is the official test case. Just replacing `# Foo\n\n` with `\ufeff# Foo\n\n` will cause the test case to fail: the chunk metadata is empty.

```python
def test_md_header_text_splitter_1() -> None:
    """Test markdown splitter by header: Case 1."""
    markdown_document = (
        "\ufeff# Foo\n\n"
        " ## Bar\n\n"
        "Hi this is Jim\n\n"
        "Hi this is Joe\n\n"
        " ## Baz\n\n"
        " Hi this is Molly"
    )
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
    )
    output = markdown_splitter.split_text(markdown_document)
    expected_output = [
        Document(
            page_content="Hi this is Jim \nHi this is Joe",
            metadata={"Header 1": "Foo", "Header 2": "Bar"},
        ),
        Document(
            page_content="Hi this is Molly",
            metadata={"Header 1": "Foo", "Header 2": "Baz"},
        ),
    ]
    assert output == expected_output
```

twitter: @coolbeevip

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
parent
2968f20970
commit
2cd907ad7e
@ -107,7 +107,9 @@ class MarkdownHeaderTextSplitter:
|
||||
|
||||
for line in lines:
|
||||
stripped_line = line.strip()
|
||||
|
||||
# Remove all non-printable characters from the string, keeping only visible
|
||||
# text.
|
||||
stripped_line = "".join(filter(str.isprintable, stripped_line))
|
||||
if not in_code_block:
|
||||
# Exclude inline code spans
|
||||
if stripped_line.startswith("```") and stripped_line.count("```") == 1:
|
||||
|
@ -1220,6 +1220,38 @@ def test_md_header_text_splitter_fenced_code_block_interleaved(
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("characters", ["\ufeff"])
def test_md_header_text_splitter_with_invisible_characters(characters: str) -> None:
    """Test markdown splitter by header: Invisible characters before headers.

    Every header line in the document is prefixed with ``characters`` (the
    parametrized case is U+FEFF, the byte-order mark). The assertion requires
    both headers to be recorded in the metadata of the chunks they introduce.
    """

    # The prefix is placed on the first line of the document and on a header
    # line in the middle of it, so both positions are covered.
    markdown_document = (
        f"{characters}# Foo\n\n" "foo()\n" f"{characters}## Bar\n\n" "bar()"
    )

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
    )
    output = markdown_splitter.split_text(markdown_document)

    # Neither the page content nor the header values may retain the prefix.
    expected_output = [
        Document(
            page_content="foo()",
            metadata={"Header 1": "Foo"},
        ),
        Document(
            page_content="bar()",
            metadata={"Header 1": "Foo", "Header 2": "Bar"},
        ),
    ]

    assert output == expected_output
|
||||
|
||||
|
||||
def test_solidity_code_splitter() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
|
Loading…
Reference in New Issue
Block a user