mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-24 15:43:54 +00:00
text-splitters[patch]: Fix HTMLSectionSplitter (#22812)
Updates the former pull request: https://github.com/langchain-ai/langchain/pull/22654. Modifies `langchain_text_splitters.HTMLSectionSplitter`. In the latest version, the function `split_html_by_headers` uses a `dict` to store the sections of an HTML document, with the header/section element names serving as the dict keys. This is a problem when duplicate header/section element names are present in a single HTML document: later sections replace earlier ones with the same name, so some content can be missing after the HTML text is split. Storing the sections in a list instead should solve the problem. A unit test covering duplicate header names has been added. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
fbeeb6da75
commit
c8c67dde6f
@ -233,9 +233,7 @@ class HTMLSectionSplitter:
|
|||||||
documents.append(new_doc)
|
documents.append(new_doc)
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
def split_html_by_headers(
|
def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
|
||||||
self, html_doc: str
|
|
||||||
) -> Dict[str, Dict[str, Optional[str]]]:
|
|
||||||
try:
|
try:
|
||||||
from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped]
|
from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped]
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
@ -247,7 +245,7 @@ class HTMLSectionSplitter:
|
|||||||
|
|
||||||
soup = BeautifulSoup(html_doc, "html.parser")
|
soup = BeautifulSoup(html_doc, "html.parser")
|
||||||
headers = list(self.headers_to_split_on.keys())
|
headers = list(self.headers_to_split_on.keys())
|
||||||
sections: Dict[str, Dict[str, Optional[str]]] = {}
|
sections: list[dict[str, str | None]] = []
|
||||||
|
|
||||||
headers = soup.find_all(["body"] + headers)
|
headers = soup.find_all(["body"] + headers)
|
||||||
|
|
||||||
@ -269,10 +267,13 @@ class HTMLSectionSplitter:
|
|||||||
content = " ".join(section_content).strip()
|
content = " ".join(section_content).strip()
|
||||||
|
|
||||||
if content != "":
|
if content != "":
|
||||||
sections[current_header] = {
|
sections.append(
|
||||||
"content": content,
|
{
|
||||||
"tag_name": current_header_tag,
|
"header": current_header,
|
||||||
}
|
"content": content,
|
||||||
|
"tag_name": current_header_tag,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return sections
|
return sections
|
||||||
|
|
||||||
@ -307,12 +308,12 @@ class HTMLSectionSplitter:
|
|||||||
|
|
||||||
return [
|
return [
|
||||||
Document(
|
Document(
|
||||||
cast(str, sections[section_key]["content"]),
|
cast(str, section["content"]),
|
||||||
metadata={
|
metadata={
|
||||||
self.headers_to_split_on[
|
self.headers_to_split_on[str(section["tag_name"])]: section[
|
||||||
str(sections[section_key]["tag_name"])
|
"header"
|
||||||
]: section_key
|
]
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
for section_key in sections.keys()
|
for section in sections
|
||||||
]
|
]
|
||||||
|
@ -1650,6 +1650,59 @@ def test_section_splitter_accepts_an_absolute_path() -> None:
|
|||||||
sec_splitter.split_text(html_string)
|
sec_splitter.split_text(html_string)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("lxml")
|
||||||
|
@pytest.mark.requires("bs4")
|
||||||
|
def test_happy_path_splitting_with_duplicate_header_tag() -> None:
|
||||||
|
# arrange
|
||||||
|
html_string = """<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<div>
|
||||||
|
<h1>Foo</h1>
|
||||||
|
<p>Some intro text about Foo.</p>
|
||||||
|
<div>
|
||||||
|
<h2>Bar main section</h2>
|
||||||
|
<p>Some intro text about Bar.</p>
|
||||||
|
<h3>Bar subsection 1</h3>
|
||||||
|
<p>Some text about the first subtopic of Bar.</p>
|
||||||
|
<h3>Bar subsection 2</h3>
|
||||||
|
<p>Some text about the second subtopic of Bar.</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<h2>Foo</h2>
|
||||||
|
<p>Some text about Baz</p>
|
||||||
|
</div>
|
||||||
|
<h1>Foo</h1>
|
||||||
|
<br>
|
||||||
|
<p>Some concluding text about Foo</p>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>"""
|
||||||
|
|
||||||
|
sec_splitter = HTMLSectionSplitter(
|
||||||
|
headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
|
||||||
|
)
|
||||||
|
|
||||||
|
docs = sec_splitter.split_text(html_string)
|
||||||
|
|
||||||
|
assert len(docs) == 4
|
||||||
|
assert docs[0].page_content == "Foo \n Some intro text about Foo."
|
||||||
|
assert docs[0].metadata["Header 1"] == "Foo"
|
||||||
|
|
||||||
|
assert docs[1].page_content == (
|
||||||
|
"Bar main section \n Some intro text about Bar. \n "
|
||||||
|
"Bar subsection 1 \n Some text about the first subtopic of Bar. \n "
|
||||||
|
"Bar subsection 2 \n Some text about the second subtopic of Bar."
|
||||||
|
)
|
||||||
|
assert docs[1].metadata["Header 2"] == "Bar main section"
|
||||||
|
|
||||||
|
assert docs[2].page_content == "Foo \n Some text about Baz"
|
||||||
|
assert docs[2].metadata["Header 2"] == "Foo"
|
||||||
|
|
||||||
|
assert docs[3].page_content == "Foo \n \n Some concluding text about Foo"
|
||||||
|
assert docs[3].metadata["Header 1"] == "Foo"
|
||||||
|
|
||||||
|
|
||||||
def test_split_json() -> None:
|
def test_split_json() -> None:
|
||||||
"""Test json text splitter"""
|
"""Test json text splitter"""
|
||||||
max_chunk = 800
|
max_chunk = 800
|
||||||
|
Loading…
Reference in New Issue
Block a user