text-splitters[patch]: Fix HTMLSectionSplitter (#22812)

Update former pull request:
https://github.com/langchain-ai/langchain/pull/22654.

Modified `langchain_text_splitters.HTMLSectionSplitter`. In the latest
version, the function `split_html_by_headers` uses a `dict` to store the
sections of an HTML document, with the header/section element names
serving as the dict keys. This is a problem when duplicate
header/section element names are present in a single HTML document:
later entries overwrite earlier ones with the same name, so some
content can go missing after the HTML text is split.

Storing the sections in a list instead should solve the problem. A unit
test covering duplicate header names has been added.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Jiejun Tan
2024-06-15 06:40:39 +08:00
committed by GitHub
parent fbeeb6da75
commit c8c67dde6f
2 changed files with 67 additions and 13 deletions

View File

@@ -233,9 +233,7 @@ class HTMLSectionSplitter:
documents.append(new_doc)
return documents
def split_html_by_headers(
self, html_doc: str
) -> Dict[str, Dict[str, Optional[str]]]:
def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
try:
from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped]
except ImportError as e:
@@ -247,7 +245,7 @@ class HTMLSectionSplitter:
soup = BeautifulSoup(html_doc, "html.parser")
headers = list(self.headers_to_split_on.keys())
sections: Dict[str, Dict[str, Optional[str]]] = {}
sections: list[dict[str, str | None]] = []
headers = soup.find_all(["body"] + headers)
@@ -269,10 +267,13 @@ class HTMLSectionSplitter:
content = " ".join(section_content).strip()
if content != "":
sections[current_header] = {
"content": content,
"tag_name": current_header_tag,
}
sections.append(
{
"header": current_header,
"content": content,
"tag_name": current_header_tag,
}
)
return sections
@@ -307,12 +308,12 @@ class HTMLSectionSplitter:
return [
Document(
cast(str, sections[section_key]["content"]),
cast(str, section["content"]),
metadata={
self.headers_to_split_on[
str(sections[section_key]["tag_name"])
]: section_key
self.headers_to_split_on[str(section["tag_name"])]: section[
"header"
]
},
)
for section_key in sections.keys()
for section in sections
]