@pytest.mark.parametrize("characters", ["\ufeff"])
def test_md_header_text_splitter_with_invisible_characters(characters: str) -> None:
    """Test markdown splitter by header: invisible characters before headers.

    Every header line of the input document is prefixed with ``characters``
    (currently U+FEFF, the byte-order mark / zero-width no-break space). The
    test asserts that both headers are still recognised, that the header text
    stored in the metadata carries no invisible prefix, and that the content
    under each header is returned unchanged.

    Args:
        characters: The invisible character sequence placed directly in front
            of each ``#`` header marker.
    """
    # One literal per markdown line so the shape of the document is visible;
    # the invisible prefix is attached to the header lines only.
    markdown_document = (
        f"{characters}# Foo\n\n"
        "foo()\n"
        f"{characters}## Bar\n\n"
        "bar()"
    )

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
    )
    output = markdown_splitter.split_text(markdown_document)

    # "Bar" is nested under "Foo", so the second chunk carries both headers.
    expected_output = [
        Document(
            page_content="foo()",
            metadata={"Header 1": "Foo"},
        ),
        Document(
            page_content="bar()",
            metadata={"Header 1": "Foo", "Header 2": "Bar"},
        ),
    ]

    assert output == expected_output
chunk_size=CHUNK_SIZE, chunk_overlap=0