mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-22 19:09:57 +00:00
Add option to preserve headers in MarkdownHeaderTextSplitter (#14433)
- **Description:** `MarkdownHeaderTextSplitter` currently strips header lines from chunked content. Many applications require that these header lines be preserved. This adds an optional parameter to preserve those headers in the chunked content. - **Issue:** #2836 (relevant) - **Dependencies:** - - **Tag maintainer:** @baskaryan - **Twitter handle:** @finnless Unit tests and new examples in notebook included. cc @rlancemartin --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
@@ -323,13 +323,17 @@ class MarkdownHeaderTextSplitter:
|
||||
"""Splitting markdown files based on specified headers."""
|
||||
|
||||
def __init__(
|
||||
self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False
|
||||
self,
|
||||
headers_to_split_on: List[Tuple[str, str]],
|
||||
return_each_line: bool = False,
|
||||
strip_headers: bool = True,
|
||||
):
|
||||
"""Create a new MarkdownHeaderTextSplitter.
|
||||
|
||||
Args:
|
||||
headers_to_split_on: Headers we want to track
|
||||
return_each_line: Return each line w/ associated headers
|
||||
strip_headers: Strip split headers from the content of the chunk
|
||||
"""
|
||||
# Output line-by-line or aggregated into chunks w/ common headers
|
||||
self.return_each_line = return_each_line
|
||||
@@ -338,6 +342,8 @@ class MarkdownHeaderTextSplitter:
|
||||
self.headers_to_split_on = sorted(
|
||||
headers_to_split_on, key=lambda split: len(split[0]), reverse=True
|
||||
)
|
||||
# Strip split headers from the content of the chunk
|
||||
self.strip_headers = strip_headers
|
||||
|
||||
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
|
||||
"""Combine lines with common metadata into chunks
|
||||
@@ -355,6 +361,23 @@ class MarkdownHeaderTextSplitter:
|
||||
# has the same metadata as the current line,
|
||||
# append the current content to the last line's content
|
||||
aggregated_chunks[-1]["content"] += " \n" + line["content"]
|
||||
elif (
|
||||
aggregated_chunks
|
||||
and aggregated_chunks[-1]["metadata"] != line["metadata"]
|
||||
# NOTE: metadata length is used as a proxy for header depth; this may misbehave if other (non-header) metadata is present
|
||||
and len(aggregated_chunks[-1]["metadata"]) < len(line["metadata"])
|
||||
and aggregated_chunks[-1]["content"].split("\n")[-1][0] == "#"
|
||||
and not self.strip_headers
|
||||
):
|
||||
# If the last line in the aggregated list
|
||||
# has different metadata than the current line,
|
||||
# and has shallower header level than the current line,
|
||||
# and the last line is a header,
|
||||
# and we are not stripping headers,
|
||||
# append the current content to the last line's content
|
||||
aggregated_chunks[-1]["content"] += " \n" + line["content"]
|
||||
# and update the last line's metadata
|
||||
aggregated_chunks[-1]["metadata"] = line["metadata"]
|
||||
else:
|
||||
# Otherwise, append the current line to the aggregated list
|
||||
aggregated_chunks.append(line)
|
||||
@@ -451,6 +474,9 @@ class MarkdownHeaderTextSplitter:
|
||||
)
|
||||
current_content.clear()
|
||||
|
||||
if not self.strip_headers:
|
||||
current_content.append(stripped_line)
|
||||
|
||||
break
|
||||
else:
|
||||
if stripped_line:
|
||||
|
@@ -1035,6 +1035,87 @@ def test_md_header_text_splitter_3() -> None:
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
def test_md_header_text_splitter_preserve_headers_1() -> None:
|
||||
"""Test markdown splitter by header: Preserve Headers."""
|
||||
|
||||
markdown_document = (
|
||||
"# Foo\n\n"
|
||||
" ## Bat\n\n"
|
||||
"Hi this is Jim\n\n"
|
||||
"Hi Joe\n\n"
|
||||
"## Baz\n\n"
|
||||
"# Bar\n\n"
|
||||
"This is Alice\n\n"
|
||||
"This is Bob"
|
||||
)
|
||||
headers_to_split_on = [
|
||||
("#", "Header 1"),
|
||||
]
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(
|
||||
headers_to_split_on=headers_to_split_on,
|
||||
strip_headers=False,
|
||||
)
|
||||
output = markdown_splitter.split_text(markdown_document)
|
||||
expected_output = [
|
||||
Document(
|
||||
page_content="# Foo \n## Bat \nHi this is Jim \nHi Joe \n## Baz",
|
||||
metadata={"Header 1": "Foo"},
|
||||
),
|
||||
Document(
|
||||
page_content="# Bar \nThis is Alice \nThis is Bob",
|
||||
metadata={"Header 1": "Bar"},
|
||||
),
|
||||
]
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
def test_md_header_text_splitter_preserve_headers_2() -> None:
|
||||
"""Test markdown splitter by header: Preserve Headers."""
|
||||
|
||||
markdown_document = (
|
||||
"# Foo\n\n"
|
||||
" ## Bar\n\n"
|
||||
"Hi this is Jim\n\n"
|
||||
"Hi this is Joe\n\n"
|
||||
"### Boo \n\n"
|
||||
"Hi this is Lance\n\n"
|
||||
"## Baz\n\n"
|
||||
"Hi this is Molly\n"
|
||||
" ## Buz\n"
|
||||
"# Bop"
|
||||
)
|
||||
headers_to_split_on = [
|
||||
("#", "Header 1"),
|
||||
("##", "Header 2"),
|
||||
("###", "Header 3"),
|
||||
]
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(
|
||||
headers_to_split_on=headers_to_split_on,
|
||||
strip_headers=False,
|
||||
)
|
||||
output = markdown_splitter.split_text(markdown_document)
|
||||
expected_output = [
|
||||
Document(
|
||||
page_content="# Foo \n## Bar \nHi this is Jim \nHi this is Joe",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar"},
|
||||
),
|
||||
Document(
|
||||
page_content="### Boo \nHi this is Lance",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
|
||||
),
|
||||
Document(
|
||||
page_content="## Baz \nHi this is Molly",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Baz"},
|
||||
),
|
||||
Document(
|
||||
page_content="## Buz",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Buz"},
|
||||
),
|
||||
Document(page_content="# Bop", metadata={"Header 1": "Bop"}),
|
||||
]
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fence", [("```"), ("~~~")])
|
||||
def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
|
||||
"""Test markdown splitter by header: Fenced code block."""
|
||||
|
Reference in New Issue
Block a user