mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-04 20:46:45 +00:00
text-splitters[patch]: Fix HTMLSectionSplitter (#22812)
Update of the former pull request: https://github.com/langchain-ai/langchain/pull/22654. Modifies `langchain_text_splitters.HTMLSectionSplitter`. In the latest version, the function `split_html_by_headers` uses a `dict` to store the sections of an HTML document, with the header/section element names serving as the dict keys. This is a problem when duplicate header/section element names are present in a single HTML document: later sections replace earlier ones with the same name, so some content can go missing after the HTML text is split. Storing the sections in a list instead avoids this overwriting. A unit test covering duplicate header names has been added. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
@@ -233,9 +233,7 @@ class HTMLSectionSplitter:
|
||||
documents.append(new_doc)
|
||||
return documents
|
||||
|
||||
def split_html_by_headers(
|
||||
self, html_doc: str
|
||||
) -> Dict[str, Dict[str, Optional[str]]]:
|
||||
def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
|
||||
try:
|
||||
from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped]
|
||||
except ImportError as e:
|
||||
@@ -247,7 +245,7 @@ class HTMLSectionSplitter:
|
||||
|
||||
soup = BeautifulSoup(html_doc, "html.parser")
|
||||
headers = list(self.headers_to_split_on.keys())
|
||||
sections: Dict[str, Dict[str, Optional[str]]] = {}
|
||||
sections: list[dict[str, str | None]] = []
|
||||
|
||||
headers = soup.find_all(["body"] + headers)
|
||||
|
||||
@@ -269,10 +267,13 @@ class HTMLSectionSplitter:
|
||||
content = " ".join(section_content).strip()
|
||||
|
||||
if content != "":
|
||||
sections[current_header] = {
|
||||
"content": content,
|
||||
"tag_name": current_header_tag,
|
||||
}
|
||||
sections.append(
|
||||
{
|
||||
"header": current_header,
|
||||
"content": content,
|
||||
"tag_name": current_header_tag,
|
||||
}
|
||||
)
|
||||
|
||||
return sections
|
||||
|
||||
@@ -307,12 +308,12 @@ class HTMLSectionSplitter:
|
||||
|
||||
return [
|
||||
Document(
|
||||
cast(str, sections[section_key]["content"]),
|
||||
cast(str, section["content"]),
|
||||
metadata={
|
||||
self.headers_to_split_on[
|
||||
str(sections[section_key]["tag_name"])
|
||||
]: section_key
|
||||
self.headers_to_split_on[str(section["tag_name"])]: section[
|
||||
"header"
|
||||
]
|
||||
},
|
||||
)
|
||||
for section_key in sections.keys()
|
||||
for section in sections
|
||||
]
|
||||
|
Reference in New Issue
Block a user