diff --git a/docs/docs/modules/data_connection/document_transformers/markdown_header_metadata.ipynb b/docs/docs/modules/data_connection/document_transformers/markdown_header_metadata.ipynb index 9da4bfbf1f6..f2cea000efa 100644 --- a/docs/docs/modules/data_connection/document_transformers/markdown_header_metadata.ipynb +++ b/docs/docs/modules/data_connection/document_transformers/markdown_header_metadata.ipynb @@ -117,6 +117,41 @@ "type(md_header_splits[0])" ] }, + { + "cell_type": "markdown", + "id": "102aad57-7bef-42d3-ab4e-b50d6dc11718", + "metadata": {}, + "source": [ + "By default, `MarkdownHeaderTextSplitter` strips headers being split on from the output chunk's content. This can be disabled by setting `strip_headers = False`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9fce45ba-a4be-4a69-ad27-f5ff195c4fd7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='# Foo \\n## Bar \\nHi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n", + " Document(page_content='### Boo \\nHi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n", + " Document(page_content='## Baz \\nHi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "markdown_splitter = MarkdownHeaderTextSplitter(\n", + " headers_to_split_on=headers_to_split_on, strip_headers=False\n", + ")\n", + "md_header_splits = markdown_splitter.split_text(markdown_document)\n", + "md_header_splits" + ] + }, { "cell_type": "markdown", "id": "9bd8977a", @@ -127,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "480e0e3a", "metadata": { "ExecuteTime": { @@ -139,14 +174,14 @@ { "data": { "text/plain": [ - "[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. 
John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n", + "[Document(page_content='# Intro \\n## History \\nMarkdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n", " Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n", - " Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n", + " Document(page_content='## Rise and divergence \\nAs Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n", " Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n", - " Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]" + " Document(page_content='## Implementations \\nImplementations of Markdown are available for over a dozen programming languages.', 
metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -160,7 +195,9 @@ "]\n", "\n", "# MD splits\n", - "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", + "markdown_splitter = MarkdownHeaderTextSplitter(\n", + " headers_to_split_on=headers_to_split_on, strip_headers=False\n", + ")\n", "md_header_splits = markdown_splitter.split_text(markdown_document)\n", "\n", "# Char-level splits\n", diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py index be0cb5bdfa6..da65a80dc9f 100644 --- a/libs/langchain/langchain/text_splitter.py +++ b/libs/langchain/langchain/text_splitter.py @@ -323,13 +323,17 @@ class MarkdownHeaderTextSplitter: """Splitting markdown files based on specified headers.""" def __init__( - self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False + self, + headers_to_split_on: List[Tuple[str, str]], + return_each_line: bool = False, + strip_headers: bool = True, ): """Create a new MarkdownHeaderTextSplitter. 
Args: headers_to_split_on: Headers we want to track return_each_line: Return each line w/ associated headers + strip_headers: Strip split headers from the content of the chunk """ # Output line-by-line or aggregated into chunks w/ common headers self.return_each_line = return_each_line @@ -338,6 +342,8 @@ class MarkdownHeaderTextSplitter: self.headers_to_split_on = sorted( headers_to_split_on, key=lambda split: len(split[0]), reverse=True ) + # Strip split headers from the content of the chunk + self.strip_headers = strip_headers def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]: """Combine lines with common metadata into chunks @@ -355,6 +361,23 @@ class MarkdownHeaderTextSplitter: # has the same metadata as the current line, # append the current content to the last lines's content aggregated_chunks[-1]["content"] += " \n" + line["content"] + elif ( + aggregated_chunks + and aggregated_chunks[-1]["metadata"] != line["metadata"] + # may be issues if other metadata is present + and len(aggregated_chunks[-1]["metadata"]) < len(line["metadata"]) + and aggregated_chunks[-1]["content"].split("\n")[-1][0] == "#" + and not self.strip_headers + ): + # If the last line in the aggregated list + # has different metadata than the current line, + # and has a shallower header level than the current line, + # and the last line is a header, + # and we are not stripping headers, + # append the current content to the last line's content + aggregated_chunks[-1]["content"] += " \n" + line["content"] + # and update the last line's metadata + aggregated_chunks[-1]["metadata"] = line["metadata"] else: # Otherwise, append the current line to the aggregated list aggregated_chunks.append(line) @@ -451,6 +474,9 @@ class MarkdownHeaderTextSplitter: ) current_content.clear() + if not self.strip_headers: + current_content.append(stripped_line) + break else: if stripped_line: diff --git a/libs/langchain/tests/unit_tests/test_text_splitter.py 
b/libs/langchain/tests/unit_tests/test_text_splitter.py index 2f9cf2ac600..f099cc7cc2d 100644 --- a/libs/langchain/tests/unit_tests/test_text_splitter.py +++ b/libs/langchain/tests/unit_tests/test_text_splitter.py @@ -1035,6 +1035,87 @@ def test_md_header_text_splitter_3() -> None: assert output == expected_output +def test_md_header_text_splitter_preserve_headers_1() -> None: + """Test markdown splitter by header: Preserve Headers.""" + + markdown_document = ( + "# Foo\n\n" + " ## Bat\n\n" + "Hi this is Jim\n\n" + "Hi Joe\n\n" + "## Baz\n\n" + "# Bar\n\n" + "This is Alice\n\n" + "This is Bob" + ) + headers_to_split_on = [ + ("#", "Header 1"), + ] + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, + strip_headers=False, + ) + output = markdown_splitter.split_text(markdown_document) + expected_output = [ + Document( + page_content="# Foo \n## Bat \nHi this is Jim \nHi Joe \n## Baz", + metadata={"Header 1": "Foo"}, + ), + Document( + page_content="# Bar \nThis is Alice \nThis is Bob", + metadata={"Header 1": "Bar"}, + ), + ] + assert output == expected_output + + +def test_md_header_text_splitter_preserve_headers_2() -> None: + """Test markdown splitter by header: Preserve Headers.""" + + markdown_document = ( + "# Foo\n\n" + " ## Bar\n\n" + "Hi this is Jim\n\n" + "Hi this is Joe\n\n" + "### Boo \n\n" + "Hi this is Lance\n\n" + "## Baz\n\n" + "Hi this is Molly\n" + " ## Buz\n" + "# Bop" + ) + headers_to_split_on = [ + ("#", "Header 1"), + ("##", "Header 2"), + ("###", "Header 3"), + ] + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, + strip_headers=False, + ) + output = markdown_splitter.split_text(markdown_document) + expected_output = [ + Document( + page_content="# Foo \n## Bar \nHi this is Jim \nHi this is Joe", + metadata={"Header 1": "Foo", "Header 2": "Bar"}, + ), + Document( + page_content="### Boo \nHi this is Lance", + metadata={"Header 1": "Foo", "Header 2": "Bar", 
"Header 3": "Boo"}, + ), + Document( + page_content="## Baz \nHi this is Molly", + metadata={"Header 1": "Foo", "Header 2": "Baz"}, + ), + Document( + page_content="## Buz", + metadata={"Header 1": "Foo", "Header 2": "Buz"}, + ), + Document(page_content="# Bop", metadata={"Header 1": "Bop"}), + ] + assert output == expected_output + + @pytest.mark.parametrize("fence", [("```"), ("~~~")]) def test_md_header_text_splitter_fenced_code_block(fence: str) -> None: """Test markdown splitter by header: Fenced code block."""