text-splitters[patch]: Fix HTMLSectionSplitter (#22812)

Updates the earlier pull request: https://github.com/langchain-ai/langchain/pull/22654. Modifies `langchain_text_splitters.HTMLSectionSplitter`: in the latest version, the function `split_html_by_headers` uses a `dict` to store the sections of an HTML document, with the header/section element names serving as the dict keys. This is a problem when duplicate header/section element names are present in a single HTML document: later sections replace earlier ones with the same name, so some content can be lost when the HTML text is split. Storing the sections in a list instead should solve the problem. A unit test covering duplicate header names has been added. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
2025-09-04 20:46:45 +00:00 · 2024-06-15 06:40:39 +08:00
parent fbeeb6da75
commit c8c67dde6f
2 changed files with 67 additions and 13 deletions
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -233,9 +233,7 @@ class HTMLSectionSplitter:
                documents.append(new_doc)
        return documents

-    def split_html_by_headers(
-        self, html_doc: str
-    ) -> Dict[str, Dict[str, Optional[str]]]:
+    def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
        try:
            from bs4 import BeautifulSoup, PageElement  # type: ignore[import-untyped]
        except ImportError as e:
@@ -247,7 +245,7 @@ class HTMLSectionSplitter:

        soup = BeautifulSoup(html_doc, "html.parser")
        headers = list(self.headers_to_split_on.keys())
-        sections: Dict[str, Dict[str, Optional[str]]] = {}
+        sections: list[dict[str, str | None]] = []

        headers = soup.find_all(["body"] + headers)

@@ -269,10 +267,13 @@ class HTMLSectionSplitter:
            content = " ".join(section_content).strip()

            if content != "":
-                sections[current_header] = {
-                    "content": content,
-                    "tag_name": current_header_tag,
-                }
+                sections.append(
+                    {
+                        "header": current_header,
+                        "content": content,
+                        "tag_name": current_header_tag,
+                    }
+                )

        return sections

@@ -307,12 +308,12 @@ class HTMLSectionSplitter:

        return [
            Document(
-                cast(str, sections[section_key]["content"]),
+                cast(str, section["content"]),
                metadata={
-                    self.headers_to_split_on[
-                        str(sections[section_key]["tag_name"])
-                    ]: section_key
+                    self.headers_to_split_on[str(section["tag_name"])]: section[
+                        "header"
+                    ]
                },
            )
-            for section_key in sections.keys()
+            for section in sections
        ]