Add option to preserve headers in MarkdownHeaderTextSplitter (#14433)

- **Description:** `MarkdownHeaderTextSplitter` currently strips header lines from chunked content. Many applications require that these header lines be preserved. This adds an optional parameter to preserve those headers in the chunked content. - **Issue:** #2836 (relevant) - **Dependencies:** - - **Tag maintainer:** @baskaryan - **Twitter handle:** @finnless Unit tests and new examples in notebook included. cc @rlancemartin --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
2025-09-22 19:09:57 +00:00 · 2024-01-02 22:34:52 -08:00
parent 0a7d360ba4
commit 6c4b5a4eff
3 changed files with 151 additions and 7 deletions
--- a/libs/langchain/langchain/text_splitter.py
+++ b/libs/langchain/langchain/text_splitter.py
@@ -323,13 +323,17 @@ class MarkdownHeaderTextSplitter:
    """Splitting markdown files based on specified headers."""

    def __init__(
-        self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False
+        self,
+        headers_to_split_on: List[Tuple[str, str]],
+        return_each_line: bool = False,
+        strip_headers: bool = True,
    ):
        """Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
+            strip_headers: Strip split headers from the content of the chunk
        """
        # Output line-by-line or aggregated into chunks w/ common headers
        self.return_each_line = return_each_line
@@ -338,6 +342,8 @@ class MarkdownHeaderTextSplitter:
        self.headers_to_split_on = sorted(
            headers_to_split_on, key=lambda split: len(split[0]), reverse=True
        )
+        # Strip split headers from the content of the chunk
+        self.strip_headers = strip_headers

    def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
        """Combine lines with common metadata into chunks
@@ -355,6 +361,23 @@ class MarkdownHeaderTextSplitter:
                # has the same metadata as the current line,
                # append the current content to the last lines's content
                aggregated_chunks[-1]["content"] += "  \n" + line["content"]
+            elif (
+                aggregated_chunks
+                and aggregated_chunks[-1]["metadata"] != line["metadata"]
+                # may be issues if other metadata is present
+                and len(aggregated_chunks[-1]["metadata"]) < len(line["metadata"])
+                and aggregated_chunks[-1]["content"].split("\n")[-1][0] == "#"
+                and not self.strip_headers
+            ):
+                # If the last line in the aggregated list
+                # has different metadata than the current line,
+                # and has shallower header level than the current line,
+                # and the last line is a header,
+                # and we are not stripping headers,
+                # append the current content to the last line's content
+                aggregated_chunks[-1]["content"] += "  \n" + line["content"]
+                # and update the last line's metadata
+                aggregated_chunks[-1]["metadata"] = line["metadata"]
            else:
                # Otherwise, append the current line to the aggregated list
                aggregated_chunks.append(line)
@@ -451,6 +474,9 @@ class MarkdownHeaderTextSplitter:
                        )
                        current_content.clear()

+                    if not self.strip_headers:
+                        current_content.append(stripped_line)
+
                    break
            else:
                if stripped_line:
--- a/libs/langchain/tests/unit_tests/test_text_splitter.py
+++ b/libs/langchain/tests/unit_tests/test_text_splitter.py
@@ -1035,6 +1035,87 @@ def test_md_header_text_splitter_3() -> None:
    assert output == expected_output


+def test_md_header_text_splitter_preserve_headers_1() -> None:
+    """Test markdown splitter by header: Preserve Headers."""
+
+    markdown_document = (
+        "# Foo\n\n"
+        "    ## Bat\n\n"
+        "Hi this is Jim\n\n"
+        "Hi Joe\n\n"
+        "## Baz\n\n"
+        "# Bar\n\n"
+        "This is Alice\n\n"
+        "This is Bob"
+    )
+    headers_to_split_on = [
+        ("#", "Header 1"),
+    ]
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+        strip_headers=False,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+    expected_output = [
+        Document(
+            page_content="# Foo  \n## Bat  \nHi this is Jim  \nHi Joe  \n## Baz",
+            metadata={"Header 1": "Foo"},
+        ),
+        Document(
+            page_content="# Bar  \nThis is Alice  \nThis is Bob",
+            metadata={"Header 1": "Bar"},
+        ),
+    ]
+    assert output == expected_output
+
+
+def test_md_header_text_splitter_preserve_headers_2() -> None:
+    """Test markdown splitter by header: Preserve Headers."""
+
+    markdown_document = (
+        "# Foo\n\n"
+        "    ## Bar\n\n"
+        "Hi this is Jim\n\n"
+        "Hi this is Joe\n\n"
+        "### Boo \n\n"
+        "Hi this is Lance\n\n"
+        "## Baz\n\n"
+        "Hi this is Molly\n"
+        "    ## Buz\n"
+        "# Bop"
+    )
+    headers_to_split_on = [
+        ("#", "Header 1"),
+        ("##", "Header 2"),
+        ("###", "Header 3"),
+    ]
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+        strip_headers=False,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+    expected_output = [
+        Document(
+            page_content="# Foo  \n## Bar  \nHi this is Jim  \nHi this is Joe",
+            metadata={"Header 1": "Foo", "Header 2": "Bar"},
+        ),
+        Document(
+            page_content="### Boo  \nHi this is Lance",
+            metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
+        ),
+        Document(
+            page_content="## Baz  \nHi this is Molly",
+            metadata={"Header 1": "Foo", "Header 2": "Baz"},
+        ),
+        Document(
+            page_content="## Buz",
+            metadata={"Header 1": "Foo", "Header 2": "Buz"},
+        ),
+        Document(page_content="# Bop", metadata={"Header 1": "Bop"}),
+    ]
+    assert output == expected_output
+
+
@pytest.mark.parametrize("fence", [("```"), ("~~~")])
 def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
    """Test markdown splitter by header: Fenced code block."""