Add tests and update notebook for MarkdownHeaderTextSplitter (#6069)

Add tests and update the notebook for `MarkdownHeaderTextSplitter`.
2025-09-13 13:36:15 +00:00 · 2023-06-13 09:07:52 -07:00
parent 8fdf88b8e3
commit ee3d0513ad
2 changed files with 159 additions and 211 deletions
--- a/tests/unit_tests/test_text_splitter.py
+++ b/tests/unit_tests/test_text_splitter.py
@@ -7,6 +7,7 @@ from langchain.docstore.document import Document
 from langchain.text_splitter import (
    CharacterTextSplitter,
    Language,
+    MarkdownHeaderTextSplitter,
    PythonCodeTextSplitter,
    RecursiveCharacterTextSplitter,
 )
@@ -671,3 +672,129 @@ def test_html_code_splitter() -> None:
        "<p>Some text</p>",
        "<p>Some more text</p>\n            </div>",
    ]
+
+
+def test_md_header_text_splitter_1() -> None:
+    """Test splitting on "#" and "##" headers with two sibling "##" sections."""
+
+    markdown_document = (
+        "# Foo\n\n"
+        "    ## Bar\n\n"  # header indented by four spaces
+        "Hi this is Jim\n\n"
+        "Hi this is Joe\n\n"
+        " ## Baz\n\n"
+        " Hi this is Molly"  # leading space is absent from expected content
+    )
+    headers_to_split_on = [
+        ("#", "Header 1"),
+        ("##", "Header 2"),
+    ]
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+    expected_output = [
+        {
+            "content": "Hi this is Jim  \nHi this is Joe",  # paragraphs merged
+            "metadata": {"Header 1": "Foo", "Header 2": "Bar"},
+        },
+        {
+            "content": "Hi this is Molly",
+            "metadata": {"Header 1": "Foo", "Header 2": "Baz"},
+        },
+    ]
+    assert output == expected_output
+
+
+def test_md_header_text_splitter_2() -> None:
+    """Test three header levels; "Header 3" is dropped once the next "##" starts."""
+    markdown_document = (
+        "# Foo\n\n"
+        "    ## Bar\n\n"
+        "Hi this is Jim\n\n"
+        "Hi this is Joe\n\n"
+        " ### Boo \n\n"  # header padded with spaces on both sides
+        " Hi this is Lance \n\n"  # padding is absent from expected content
+        " ## Baz\n\n"
+        " Hi this is Molly"
+    )
+
+    headers_to_split_on = [
+        ("#", "Header 1"),
+        ("##", "Header 2"),
+        ("###", "Header 3"),
+    ]
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+    expected_output = [
+        {
+            "content": "Hi this is Jim  \nHi this is Joe",
+            "metadata": {"Header 1": "Foo", "Header 2": "Bar"},
+        },
+        {
+            "content": "Hi this is Lance",
+            "metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
+        },
+        {
+            "content": "Hi this is Molly",  # no "Header 3" key after "## Baz"
+            "metadata": {"Header 1": "Foo", "Header 2": "Baz"},
+        },
+    ]
+    assert output == expected_output
+
+
+def test_md_header_text_splitter_3() -> None:
+    """Test four header levels; deeper metadata is dropped at the next "##"."""
+
+    markdown_document = (
+        "# Foo\n\n"
+        "    ## Bar\n\n"
+        "Hi this is Jim\n\n"
+        "Hi this is Joe\n\n"
+        " ### Boo \n\n"
+        " Hi this is Lance \n\n"
+        " #### Bim \n\n"
+        " Hi this is John \n\n"
+        " ## Baz\n\n"
+        " Hi this is Molly"
+    )
+
+    headers_to_split_on = [
+        ("#", "Header 1"),
+        ("##", "Header 2"),
+        ("###", "Header 3"),
+        ("####", "Header 4"),
+    ]
+
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+
+    expected_output = [
+        {
+            "content": "Hi this is Jim  \nHi this is Joe",
+            "metadata": {"Header 1": "Foo", "Header 2": "Bar"},
+        },
+        {
+            "content": "Hi this is Lance",
+            "metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
+        },
+        {
+            "content": "Hi this is John",  # deepest chunk lists all four headers
+            "metadata": {
+                "Header 1": "Foo",
+                "Header 2": "Bar",
+                "Header 3": "Boo",
+                "Header 4": "Bim",
+            },
+        },
+        {
+            "content": "Hi this is Molly",  # only "Header 1"/"Header 2" remain
+            "metadata": {"Header 1": "Foo", "Header 2": "Baz"},
+        },
+    ]
+
+    assert output == expected_output