mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-29 09:58:44 +00:00
Add option to preserve headers in MarkdownHeaderTextSplitter (#14433)
- **Description:** `MarkdownHeaderTextSplitter` currently strips header lines from chunked content. Many applications require that these header lines be preserved. This adds an optional parameter to preserve those headers in the chunked content. - **Issue:** #2836 (relevant) - **Dependencies:** - - **Tag maintainer:** @baskaryan - **Twitter handle:** @finnless Unit tests and new examples in notebook included. cc @rlancemartin --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
0a7d360ba4
commit
6c4b5a4eff
@ -117,6 +117,41 @@
|
|||||||
"type(md_header_splits[0])"
|
"type(md_header_splits[0])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "102aad57-7bef-42d3-ab4e-b50d6dc11718",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"By default, `MarkdownHeaderTextSplitter` strips headers being split on from the output chunk's content. This can be disabled by setting `strip_headers = False`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "9fce45ba-a4be-4a69-ad27-f5ff195c4fd7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='# Foo \\n## Bar \\nHi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
|
||||||
|
" Document(page_content='### Boo \\nHi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
|
||||||
|
" Document(page_content='## Baz \\nHi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"markdown_splitter = MarkdownHeaderTextSplitter(\n",
|
||||||
|
" headers_to_split_on=headers_to_split_on, strip_headers=False\n",
|
||||||
|
")\n",
|
||||||
|
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
|
||||||
|
"md_header_splits"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "9bd8977a",
|
"id": "9bd8977a",
|
||||||
@ -127,7 +162,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 5,
|
||||||
"id": "480e0e3a",
|
"id": "480e0e3a",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
@ -139,14 +174,14 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
"[Document(page_content='# Intro \\n## History \\nMarkdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
||||||
" Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
" Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
||||||
" Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
" Document(page_content='## Rise and divergence \\nAs Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
||||||
" Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
" Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
||||||
" Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
|
" Document(page_content='## Implementations \\nImplementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 4,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -160,7 +195,9 @@
|
|||||||
"]\n",
|
"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# MD splits\n",
|
"# MD splits\n",
|
||||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
"markdown_splitter = MarkdownHeaderTextSplitter(\n",
|
||||||
|
" headers_to_split_on=headers_to_split_on, strip_headers=False\n",
|
||||||
|
")\n",
|
||||||
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
|
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Char-level splits\n",
|
"# Char-level splits\n",
|
||||||
|
@ -323,13 +323,17 @@ class MarkdownHeaderTextSplitter:
|
|||||||
"""Splitting markdown files based on specified headers."""
|
"""Splitting markdown files based on specified headers."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False
|
self,
|
||||||
|
headers_to_split_on: List[Tuple[str, str]],
|
||||||
|
return_each_line: bool = False,
|
||||||
|
strip_headers: bool = True,
|
||||||
):
|
):
|
||||||
"""Create a new MarkdownHeaderTextSplitter.
|
"""Create a new MarkdownHeaderTextSplitter.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
headers_to_split_on: Headers we want to track
|
headers_to_split_on: Headers we want to track
|
||||||
return_each_line: Return each line w/ associated headers
|
return_each_line: Return each line w/ associated headers
|
||||||
|
strip_headers: Strip split headers from the content of the chunk
|
||||||
"""
|
"""
|
||||||
# Output line-by-line or aggregated into chunks w/ common headers
|
# Output line-by-line or aggregated into chunks w/ common headers
|
||||||
self.return_each_line = return_each_line
|
self.return_each_line = return_each_line
|
||||||
@ -338,6 +342,8 @@ class MarkdownHeaderTextSplitter:
|
|||||||
self.headers_to_split_on = sorted(
|
self.headers_to_split_on = sorted(
|
||||||
headers_to_split_on, key=lambda split: len(split[0]), reverse=True
|
headers_to_split_on, key=lambda split: len(split[0]), reverse=True
|
||||||
)
|
)
|
||||||
|
# Whether to strip split headers from the content of the chunk
|
||||||
|
self.strip_headers = strip_headers
|
||||||
|
|
||||||
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
|
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
|
||||||
"""Combine lines with common metadata into chunks
|
"""Combine lines with common metadata into chunks
|
||||||
@ -355,6 +361,23 @@ class MarkdownHeaderTextSplitter:
|
|||||||
# has the same metadata as the current line,
|
# has the same metadata as the current line,
|
||||||
# append the current content to the last line's content
|
# append the current content to the last line's content
|
||||||
aggregated_chunks[-1]["content"] += " \n" + line["content"]
|
aggregated_chunks[-1]["content"] += " \n" + line["content"]
|
||||||
|
elif (
|
||||||
|
aggregated_chunks
|
||||||
|
and aggregated_chunks[-1]["metadata"] != line["metadata"]
|
||||||
|
# may be issues if other metadata is present
|
||||||
|
and len(aggregated_chunks[-1]["metadata"]) < len(line["metadata"])
|
||||||
|
and aggregated_chunks[-1]["content"].split("\n")[-1][0] == "#"
|
||||||
|
and not self.strip_headers
|
||||||
|
):
|
||||||
|
# If the last line in the aggregated list
|
||||||
|
# has different metadata as the current line,
|
||||||
|
# and has shallower header level than the current line,
|
||||||
|
# and the last line is a header,
|
||||||
|
# and we are not stripping headers,
|
||||||
|
# append the current content to the last line's content
|
||||||
|
aggregated_chunks[-1]["content"] += " \n" + line["content"]
|
||||||
|
# and update the last line's metadata
|
||||||
|
aggregated_chunks[-1]["metadata"] = line["metadata"]
|
||||||
else:
|
else:
|
||||||
# Otherwise, append the current line to the aggregated list
|
# Otherwise, append the current line to the aggregated list
|
||||||
aggregated_chunks.append(line)
|
aggregated_chunks.append(line)
|
||||||
@ -451,6 +474,9 @@ class MarkdownHeaderTextSplitter:
|
|||||||
)
|
)
|
||||||
current_content.clear()
|
current_content.clear()
|
||||||
|
|
||||||
|
if not self.strip_headers:
|
||||||
|
current_content.append(stripped_line)
|
||||||
|
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
if stripped_line:
|
if stripped_line:
|
||||||
|
@ -1035,6 +1035,87 @@ def test_md_header_text_splitter_3() -> None:
|
|||||||
assert output == expected_output
|
assert output == expected_output
|
||||||
|
|
||||||
|
|
||||||
|
def test_md_header_text_splitter_preserve_headers_1() -> None:
|
||||||
|
"""Test markdown splitter by header: Preserve Headers."""
|
||||||
|
|
||||||
|
markdown_document = (
|
||||||
|
"# Foo\n\n"
|
||||||
|
" ## Bat\n\n"
|
||||||
|
"Hi this is Jim\n\n"
|
||||||
|
"Hi Joe\n\n"
|
||||||
|
"## Baz\n\n"
|
||||||
|
"# Bar\n\n"
|
||||||
|
"This is Alice\n\n"
|
||||||
|
"This is Bob"
|
||||||
|
)
|
||||||
|
headers_to_split_on = [
|
||||||
|
("#", "Header 1"),
|
||||||
|
]
|
||||||
|
markdown_splitter = MarkdownHeaderTextSplitter(
|
||||||
|
headers_to_split_on=headers_to_split_on,
|
||||||
|
strip_headers=False,
|
||||||
|
)
|
||||||
|
output = markdown_splitter.split_text(markdown_document)
|
||||||
|
expected_output = [
|
||||||
|
Document(
|
||||||
|
page_content="# Foo \n## Bat \nHi this is Jim \nHi Joe \n## Baz",
|
||||||
|
metadata={"Header 1": "Foo"},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content="# Bar \nThis is Alice \nThis is Bob",
|
||||||
|
metadata={"Header 1": "Bar"},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
assert output == expected_output
|
||||||
|
|
||||||
|
|
||||||
|
def test_md_header_text_splitter_preserve_headers_2() -> None:
|
||||||
|
"""Test markdown splitter by header: Preserve Headers."""
|
||||||
|
|
||||||
|
markdown_document = (
|
||||||
|
"# Foo\n\n"
|
||||||
|
" ## Bar\n\n"
|
||||||
|
"Hi this is Jim\n\n"
|
||||||
|
"Hi this is Joe\n\n"
|
||||||
|
"### Boo \n\n"
|
||||||
|
"Hi this is Lance\n\n"
|
||||||
|
"## Baz\n\n"
|
||||||
|
"Hi this is Molly\n"
|
||||||
|
" ## Buz\n"
|
||||||
|
"# Bop"
|
||||||
|
)
|
||||||
|
headers_to_split_on = [
|
||||||
|
("#", "Header 1"),
|
||||||
|
("##", "Header 2"),
|
||||||
|
("###", "Header 3"),
|
||||||
|
]
|
||||||
|
markdown_splitter = MarkdownHeaderTextSplitter(
|
||||||
|
headers_to_split_on=headers_to_split_on,
|
||||||
|
strip_headers=False,
|
||||||
|
)
|
||||||
|
output = markdown_splitter.split_text(markdown_document)
|
||||||
|
expected_output = [
|
||||||
|
Document(
|
||||||
|
page_content="# Foo \n## Bar \nHi this is Jim \nHi this is Joe",
|
||||||
|
metadata={"Header 1": "Foo", "Header 2": "Bar"},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content="### Boo \nHi this is Lance",
|
||||||
|
metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content="## Baz \nHi this is Molly",
|
||||||
|
metadata={"Header 1": "Foo", "Header 2": "Baz"},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content="## Buz",
|
||||||
|
metadata={"Header 1": "Foo", "Header 2": "Buz"},
|
||||||
|
),
|
||||||
|
Document(page_content="# Bop", metadata={"Header 1": "Bop"}),
|
||||||
|
]
|
||||||
|
assert output == expected_output
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("fence", [("```"), ("~~~")])
|
@pytest.mark.parametrize("fence", [("```"), ("~~~")])
|
||||||
def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
|
def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
|
||||||
"""Test markdown splitter by header: Fenced code block."""
|
"""Test markdown splitter by header: Fenced code block."""
|
||||||
|
Loading…
Reference in New Issue
Block a user