mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-29 18:08:36 +00:00
Add option to preserve headers in MarkdownHeaderTextSplitter (#14433)
- **Description:** `MarkdownHeaderTextSplitter` currently strips header lines from chunked content. Many applications require that these header lines be preserved. This adds an optional parameter to preserve those headers in the chunked content. - **Issue:** #2836 (relevant) - **Dependencies:** - - **Tag maintainer:** @baskaryan - **Twitter handle:** @finnless Unit tests and new examples in notebook included. cc @rlancemartin --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
0a7d360ba4
commit
6c4b5a4eff
@ -117,6 +117,41 @@
|
||||
"type(md_header_splits[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "102aad57-7bef-42d3-ab4e-b50d6dc11718",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"By default, `MarkdownHeaderTextSplitter` strips headers being split on from the output chunk's content. This can be disabled by setting `strip_headers = False`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "9fce45ba-a4be-4a69-ad27-f5ff195c4fd7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='# Foo \\n## Bar \\nHi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
|
||||
" Document(page_content='### Boo \\nHi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
|
||||
" Document(page_content='## Baz \\nHi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(\n",
|
||||
" headers_to_split_on=headers_to_split_on, strip_headers=False\n",
|
||||
")\n",
|
||||
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
|
||||
"md_header_splits"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9bd8977a",
|
||||
@ -127,7 +162,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 5,
|
||||
"id": "480e0e3a",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
@ -139,14 +174,14 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
||||
"[Document(page_content='# Intro \\n## History \\nMarkdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
||||
" Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
||||
" Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
||||
" Document(page_content='## Rise and divergence \\nAs Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
||||
" Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
||||
" Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
|
||||
" Document(page_content='## Implementations \\nImplementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -160,7 +195,9 @@
|
||||
"]\n",
|
||||
"\n",
|
||||
"# MD splits\n",
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(\n",
|
||||
" headers_to_split_on=headers_to_split_on, strip_headers=False\n",
|
||||
")\n",
|
||||
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
|
||||
"\n",
|
||||
"# Char-level splits\n",
|
||||
|
@ -323,13 +323,17 @@ class MarkdownHeaderTextSplitter:
|
||||
"""Splitting markdown files based on specified headers."""
|
||||
|
||||
def __init__(
|
||||
self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False
|
||||
self,
|
||||
headers_to_split_on: List[Tuple[str, str]],
|
||||
return_each_line: bool = False,
|
||||
strip_headers: bool = True,
|
||||
):
|
||||
"""Create a new MarkdownHeaderTextSplitter.
|
||||
|
||||
Args:
|
||||
headers_to_split_on: Headers we want to track
|
||||
return_each_line: Return each line w/ associated headers
|
||||
strip_headers: Strip split headers from the content of the chunk
|
||||
"""
|
||||
# Output line-by-line or aggregated into chunks w/ common headers
|
||||
self.return_each_line = return_each_line
|
||||
@ -338,6 +342,8 @@ class MarkdownHeaderTextSplitter:
|
||||
self.headers_to_split_on = sorted(
|
||||
headers_to_split_on, key=lambda split: len(split[0]), reverse=True
|
||||
)
|
||||
# Whether to strip the split-on headers from the content of each chunk
|
||||
self.strip_headers = strip_headers
|
||||
|
||||
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
|
||||
"""Combine lines with common metadata into chunks
|
||||
@ -355,6 +361,23 @@ class MarkdownHeaderTextSplitter:
|
||||
# has the same metadata as the current line,
|
||||
# append the current content to the last line's content
|
||||
aggregated_chunks[-1]["content"] += " \n" + line["content"]
|
||||
elif (
|
||||
aggregated_chunks
|
||||
and aggregated_chunks[-1]["metadata"] != line["metadata"]
|
||||
# may be issues if other metadata is present
|
||||
and len(aggregated_chunks[-1]["metadata"]) < len(line["metadata"])
|
||||
and aggregated_chunks[-1]["content"].split("\n")[-1][0] == "#"
|
||||
and not self.strip_headers
|
||||
):
|
||||
# If the last line in the aggregated list
|
||||
# has different metadata than the current line,
|
||||
# and has a shallower header level than the current line,
|
||||
# and the last line is a header,
|
||||
# and we are not stripping headers,
|
||||
# append the current content to the last line's content
|
||||
aggregated_chunks[-1]["content"] += " \n" + line["content"]
|
||||
# and update the last line's metadata
|
||||
aggregated_chunks[-1]["metadata"] = line["metadata"]
|
||||
else:
|
||||
# Otherwise, append the current line to the aggregated list
|
||||
aggregated_chunks.append(line)
|
||||
@ -451,6 +474,9 @@ class MarkdownHeaderTextSplitter:
|
||||
)
|
||||
current_content.clear()
|
||||
|
||||
if not self.strip_headers:
|
||||
current_content.append(stripped_line)
|
||||
|
||||
break
|
||||
else:
|
||||
if stripped_line:
|
||||
|
@ -1035,6 +1035,87 @@ def test_md_header_text_splitter_3() -> None:
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
def test_md_header_text_splitter_preserve_headers_1() -> None:
|
||||
"""Test markdown splitter by header: Preserve Headers."""
|
||||
|
||||
markdown_document = (
|
||||
"# Foo\n\n"
|
||||
" ## Bat\n\n"
|
||||
"Hi this is Jim\n\n"
|
||||
"Hi Joe\n\n"
|
||||
"## Baz\n\n"
|
||||
"# Bar\n\n"
|
||||
"This is Alice\n\n"
|
||||
"This is Bob"
|
||||
)
|
||||
headers_to_split_on = [
|
||||
("#", "Header 1"),
|
||||
]
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(
|
||||
headers_to_split_on=headers_to_split_on,
|
||||
strip_headers=False,
|
||||
)
|
||||
output = markdown_splitter.split_text(markdown_document)
|
||||
expected_output = [
|
||||
Document(
|
||||
page_content="# Foo \n## Bat \nHi this is Jim \nHi Joe \n## Baz",
|
||||
metadata={"Header 1": "Foo"},
|
||||
),
|
||||
Document(
|
||||
page_content="# Bar \nThis is Alice \nThis is Bob",
|
||||
metadata={"Header 1": "Bar"},
|
||||
),
|
||||
]
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
def test_md_header_text_splitter_preserve_headers_2() -> None:
|
||||
"""Test markdown splitter by header: Preserve Headers."""
|
||||
|
||||
markdown_document = (
|
||||
"# Foo\n\n"
|
||||
" ## Bar\n\n"
|
||||
"Hi this is Jim\n\n"
|
||||
"Hi this is Joe\n\n"
|
||||
"### Boo \n\n"
|
||||
"Hi this is Lance\n\n"
|
||||
"## Baz\n\n"
|
||||
"Hi this is Molly\n"
|
||||
" ## Buz\n"
|
||||
"# Bop"
|
||||
)
|
||||
headers_to_split_on = [
|
||||
("#", "Header 1"),
|
||||
("##", "Header 2"),
|
||||
("###", "Header 3"),
|
||||
]
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(
|
||||
headers_to_split_on=headers_to_split_on,
|
||||
strip_headers=False,
|
||||
)
|
||||
output = markdown_splitter.split_text(markdown_document)
|
||||
expected_output = [
|
||||
Document(
|
||||
page_content="# Foo \n## Bar \nHi this is Jim \nHi this is Joe",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar"},
|
||||
),
|
||||
Document(
|
||||
page_content="### Boo \nHi this is Lance",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
|
||||
),
|
||||
Document(
|
||||
page_content="## Baz \nHi this is Molly",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Baz"},
|
||||
),
|
||||
Document(
|
||||
page_content="## Buz",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Buz"},
|
||||
),
|
||||
Document(page_content="# Bop", metadata={"Header 1": "Bop"}),
|
||||
]
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fence", [("```"), ("~~~")])
|
||||
def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
|
||||
"""Test markdown splitter by header: Fenced code block."""
|
||||
|
Loading…
Reference in New Issue
Block a user