Mirror of https://github.com/hwchase17/langchain.git (synced 2025-06-23 15:19:33 +00:00)
text-splitters[minor]: Replace lxml and XSLT with BeautifulSoup in HTMLHeaderTextSplitter for Improved Large HTML File Processing (#27678)
This pull request updates `HTMLHeaderTextSplitter` by replacing the implementation of its `split_text_from_file` method. The original method used `lxml` and XSLT to process HTML files, which raised `lxml.etree.XSLTApplyError: maxHead` on large HTML documents due to limitations in the XSLT processor. Fixes #13149.

By switching to BeautifulSoup (`bs4`), we achieve:

- **Improved performance and reliability:** BeautifulSoup processes large HTML files efficiently, without the errors associated with `lxml` and XSLT.
- **Simplified dependencies:** Removes the dependency on `lxml` and external XSLT files, relying instead on the widely used `beautifulsoup4` library.
- **Maintained functionality:** The new method replicates the original behavior, ensuring compatibility with existing code and preserving the extraction of content and metadata.

**Issue:** This change addresses issues related to processing large HTML files with the existing `HTMLHeaderTextSplitter` implementation. It resolves the `lxml.etree.XSLTApplyError: maxHead` failures users hit with large HTML documents.

**Dependencies:**
- **BeautifulSoup (`beautifulsoup4`):** The `beautifulsoup4` library is now used for parsing HTML content.
  - Installation: `pip install beautifulsoup4`

**Code Changes:** Updated the `split_text_from_file` method in `HTMLHeaderTextSplitter` as follows:

```python
def split_text_from_file(self, file: Any) -> List[Document]:
    """Split HTML file using BeautifulSoup.

    Args:
        file: HTML file path or file-like object.

    Returns:
        List of Document objects with page_content and metadata.
    """
    from bs4 import BeautifulSoup
    from langchain.docstore.document import Document
    import bs4

    # Read the HTML content from the file or file-like object
    if isinstance(file, str):
        with open(file, 'r', encoding='utf-8') as f:
            html_content = f.read()
    else:
        # Assuming file is a file-like object
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the header tags and their corresponding metadata keys
    headers_to_split_on = [tag[0] for tag in self.headers_to_split_on]
    header_mapping = dict(self.headers_to_split_on)

    documents = []

    # Find the body of the document
    body = soup.body if soup.body else soup

    # Find all header tags in the order they appear
    all_headers = body.find_all(headers_to_split_on)

    # If there's content before the first header, collect it
    first_header = all_headers[0] if all_headers else None
    if first_header:
        pre_header_content = ''
        for elem in first_header.find_all_previous():
            if isinstance(elem, bs4.Tag):
                text = elem.get_text(separator=' ', strip=True)
                if text:
                    pre_header_content = text + ' ' + pre_header_content
        if pre_header_content.strip():
            documents.append(Document(
                page_content=pre_header_content.strip(),
                metadata={}  # No metadata since there's no header
            ))
    else:
        # If no headers are found, return the whole content
        full_text = body.get_text(separator=' ', strip=True)
        if full_text.strip():
            documents.append(Document(
                page_content=full_text.strip(),
                metadata={}
            ))
        return documents

    # Process each header and its associated content
    for header in all_headers:
        current_metadata = {}
        header_name = header.name
        header_text = header.get_text(separator=' ', strip=True)
        current_metadata[header_mapping[header_name]] = header_text

        # Collect all sibling elements until the next header
        content_elements = []
        for sibling in header.find_next_siblings():
            if sibling.name in headers_to_split_on:
                # Stop at the next header
                break
            if isinstance(sibling, bs4.Tag):
                content_elements.append(sibling)

        # Get the text content of the collected elements
        current_content = ''
        for elem in content_elements:
            text = elem.get_text(separator=' ', strip=True)
            if text:
                current_content += text + ' '

        # Create a Document if there is content
        if current_content.strip():
            documents.append(Document(
                page_content=current_content.strip(),
                metadata=current_metadata.copy()
            ))
        else:
            # If there's no content, but we have metadata, still create a Document
            documents.append(Document(
                page_content='',
                metadata=current_metadata.copy()
            ))

    return documents
```

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
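To try the fix end to end, here is a minimal usage sketch (not part of the diff). It assumes the splitter is importable as `HTMLHeaderTextSplitter` from `langchain_text_splitters`, and `large_page.html` is a placeholder for any HTML document big enough to have triggered the old XSLT error:

```python
from langchain_text_splitters import HTMLHeaderTextSplitter

splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
)

# In-memory HTML: header text is surfaced as metadata on each chunk.
docs = splitter.split_text("<h1>Title</h1><p>Body text.</p>")
assert docs[0].metadata == {"Header 1": "Title"}

# A file path or a file-like object both work; parsing no longer goes
# through lxml/XSLT, so large files do not raise XSLTApplyError.
# ("large_page.html" is a hypothetical path.)
docs = splitter.split_text_from_file("large_page.html")
```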
This commit is contained in:
parent
989eec4b7b
commit
d3ed9b86be
*Splitter module:*

```diff
@@ -3,7 +3,7 @@ from __future__ import annotations
 import copy
 import pathlib
 import re
-from io import BytesIO, StringIO
+from io import StringIO
 from typing import (
     Any,
     Callable,
```
```diff
@@ -34,148 +34,291 @@ class ElementType(TypedDict):
 
 
 class HTMLHeaderTextSplitter:
-    """Splitting HTML files based on specified headers.
-
-    Requires lxml package.
+    """Split HTML content into structured Documents based on specified headers.
+
+    Splits HTML content by detecting specified header tags (e.g., <h1>, <h2>) and
+    creating hierarchical Document objects that reflect the semantic structure
+    of the original content. For each identified section, the splitter associates
+    the extracted text with metadata corresponding to the encountered headers.
+
+    If no specified headers are found, the entire content is returned as a single
+    Document. This allows for flexible handling of HTML input, ensuring that
+    information is organized according to its semantic headers.
+
+    The splitter provides the option to return each HTML element as a separate
+    Document or aggregate them into semantically meaningful chunks. It also
+    gracefully handles multiple levels of nested headers, creating a rich,
+    hierarchical representation of the content.
+
+    Args:
+        headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag,
+            header_name) pairs representing the headers that define splitting
+            boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
+            will split content by <h1> and <h2> tags, assigning their textual
+            content to the Document metadata.
+        return_each_element (bool): If True, every HTML element encountered
+            (including headers, paragraphs, etc.) is returned as a separate
+            Document. If False, content under the same header hierarchy is
+            aggregated into fewer Documents.
+
+    Returns:
+        List[Document]: A list of Document objects. Each Document contains
+        `page_content` holding the extracted text and `metadata` that maps
+        the header hierarchy to their corresponding titles.
+
+    Example:
+        .. code-block:: python
+
+            from langchain_text_splitters.html_header_text_splitter import (
+                HTMLHeaderTextSplitter,
+            )
+
+            # Define headers for splitting on h1 and h2 tags.
+            headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]
+
+            splitter = HTMLHeaderTextSplitter(
+                headers_to_split_on=headers_to_split_on,
+                return_each_element=False
+            )
+
+            html_content = \"\"\"
+            <html>
+              <body>
+                <h1>Introduction</h1>
+                <p>Welcome to the introduction section.</p>
+                <h2>Background</h2>
+                <p>Some background details here.</p>
+                <h1>Conclusion</h1>
+                <p>Final thoughts.</p>
+              </body>
+            </html>
+            \"\"\"
+
+            documents = splitter.split_text(html_content)
+
+            # 'documents' now contains Document objects reflecting the hierarchy:
+            # - Document with metadata={"Main Topic": "Introduction"} and
+            #   content="Introduction"
+            # - Document with metadata={"Main Topic": "Introduction"} and
+            #   content="Welcome to the introduction section."
+            # - Document with metadata={"Main Topic": "Introduction",
+            #   "Sub Topic": "Background"} and content="Background"
+            # - Document with metadata={"Main Topic": "Introduction",
+            #   "Sub Topic": "Background"} and content="Some background details here."
+            # - Document with metadata={"Main Topic": "Conclusion"} and
+            #   content="Conclusion"
+            # - Document with metadata={"Main Topic": "Conclusion"} and
+            #   content="Final thoughts."
     """
 
     def __init__(
         self,
         headers_to_split_on: List[Tuple[str, str]],
         return_each_element: bool = False,
-    ):
-        """Create a new HTMLHeaderTextSplitter.
+    ) -> None:
+        """Initialize with headers to split on.
 
         Args:
-            headers_to_split_on: list of tuples of headers we want to track mapped to
-                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
-                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2)].
-            return_each_element: Return each element w/ associated headers.
+            headers_to_split_on: A list of tuples where
+                each tuple contains a header tag and its corresponding value.
+            return_each_element: Whether to return each HTML
+                element as a separate Document. Defaults to False.
         """
-        # Output element-by-element or aggregated into chunks w/ common headers
+        self.headers_to_split_on = sorted(
+            headers_to_split_on, key=lambda x: int(x[0][1])
+        )
+        self.header_mapping = dict(self.headers_to_split_on)
+        self.header_tags = [tag for tag, _ in self.headers_to_split_on]
         self.return_each_element = return_each_element
-        self.headers_to_split_on = sorted(headers_to_split_on)
 
-    def aggregate_elements_to_chunks(
-        self, elements: List[ElementType]
-    ) -> List[Document]:
-        """Combine elements with common metadata into chunks.
-
-        Args:
-            elements: HTML element content with associated identifying info and metadata
-        """
-        aggregated_chunks: List[ElementType] = []
-
-        for element in elements:
-            if (
-                aggregated_chunks
-                and aggregated_chunks[-1]["metadata"] == element["metadata"]
-            ):
-                # If the last element in the aggregated list
-                # has the same metadata as the current element,
-                # append the current content to the last element's content
-                aggregated_chunks[-1]["content"] += "  \n" + element["content"]
-            else:
-                # Otherwise, append the current element to the aggregated list
-                aggregated_chunks.append(element)
-
-        return [
-            Document(page_content=chunk["content"], metadata=chunk["metadata"])
-            for chunk in aggregated_chunks
-        ]
-
-    def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
-        """Split HTML from web URL.
-
-        Args:
-            url: web URL
-            **kwargs: Arbitrary additional keyword arguments. These are usually passed
-                to the fetch url content request.
-        """
-        r = requests.get(url, **kwargs)
-        return self.split_text_from_file(BytesIO(r.content))
-
     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string.
+        """Split the given text into a list of Document objects.
 
         Args:
-            text: HTML text
+            text: The HTML text to split.
+
+        Returns:
+            A list of split Document objects.
         """
         return self.split_text_from_file(StringIO(text))
 
-    def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file.
-
-        Args:
-            file: HTML file
-        """
+    def split_text_from_url(
+        self, url: str, timeout: int = 10, **kwargs: Any
+    ) -> List[Document]:
+        """Fetch text content from a URL and split it into documents.
+
+        Args:
+            url: The URL to fetch content from.
+            timeout: Timeout for the request. Defaults to 10.
+            **kwargs: Additional keyword arguments for the request.
+
+        Returns:
+            A list of split Document objects.
+
+        Raises:
+            requests.RequestException: If the HTTP request fails.
+        """
+        kwargs.setdefault("timeout", timeout)
+        response = requests.get(url, **kwargs)
+        response.raise_for_status()
+        return self.split_text(response.text)
+
+    def _header_level(self, tag_name: str) -> int:
+        """Determine the heading level of a tag."""
+        if tag_name.lower() in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+            return int(tag_name[1])
+        # Returns high level if it isn't a header
+        return 9999
+
+    def _dom_depth(self, element: Any) -> int:
+        """Determine the DOM depth of an element by counting its parents."""
+        depth = 0
+        for _ in element.parents:
+            depth += 1
+        return depth
+
+    def _get_elements(self, html_content: str) -> List[Any]:
+        """Parse HTML content and return a list of BeautifulSoup elements.
+
+        This helper function takes HTML content as input,
+        parses it using BeautifulSoup4, and returns all HTML elements
+        found in the document body. If no body tag exists,
+        it returns all elements in the full document.
+
+        Args:
+            html_content: Raw HTML content to be parsed.
+
+        Returns:
+            List[Any]: A list of BeautifulSoup elements found in the HTML document.
+
+        Raises:
+            ImportError: If the BeautifulSoup4 package is not installed.
+        """
         try:
-            from lxml import etree
+            from bs4 import BeautifulSoup  # type: ignore[import-untyped]
         except ImportError as e:
             raise ImportError(
-                "Unable to import lxml, please install with `pip install lxml`."
+                "Unable to import BeautifulSoup/PageElement, \
+                    please install with `pip install \
+                        bs4`."
             ) from e
-        # use lxml library to parse html document and return xml ElementTree
-        # Explicitly encoding in utf-8 allows non-English
-        # html files to be processed without garbled characters
-        parser = etree.HTMLParser(encoding="utf-8")
-        tree = etree.parse(file, parser)
-
-        # document transformation for "structure-aware" chunking is handled with xsl.
-        # see comments in html_chunks_with_headers.xslt for more detailed information.
-        xslt_path = pathlib.Path(__file__).parent / "xsl/html_chunks_with_headers.xslt"
-        xslt_tree = etree.parse(xslt_path)
-        transform = etree.XSLT(xslt_tree)
-        result = transform(tree)
-        result_dom = etree.fromstring(str(result))
-
-        # create filter and mapping for header metadata
-        header_filter = [header[0] for header in self.headers_to_split_on]
-        header_mapping = dict(self.headers_to_split_on)
-
-        # map xhtml namespace prefix
-        ns_map = {"h": "http://www.w3.org/1999/xhtml"}
-
-        # build list of elements from DOM
-        elements = []
-        for element in result_dom.findall("*//*", ns_map):
-            if element.findall("*[@class='headers']") or element.findall(
-                "*[@class='chunk']"
-            ):
-                elements.append(
-                    ElementType(
-                        url=file,
-                        xpath="".join(
-                            [
-                                node.text or ""
-                                for node in element.findall("*[@class='xpath']", ns_map)
-                            ]
-                        ),
-                        content="".join(
-                            [
-                                node.text or ""
-                                for node in element.findall("*[@class='chunk']", ns_map)
-                            ]
-                        ),
-                        metadata={
-                            # Add text of specified headers to metadata using header
-                            # mapping.
-                            header_mapping[node.tag]: node.text or ""
-                            for node in filter(
-                                lambda x: x.tag in header_filter,
-                                element.findall("*[@class='headers']/*", ns_map),
-                            )
-                        },
-                    )
-                )
-
-        if not self.return_each_element:
-            return self.aggregate_elements_to_chunks(elements)
-        else:
-            return [
-                Document(page_content=chunk["content"], metadata=chunk["metadata"])
-                for chunk in elements
-            ]
+        soup = BeautifulSoup(html_content, "html.parser")
+        body = soup.body if soup.body else soup
+        return body.find_all()
+
+    def split_text_from_file(self, file: Any) -> List[Document]:
+        """Split HTML content from a file into a list of Document objects.
+
+        Args:
+            file: A file path or a file-like object containing HTML content.
+
+        Returns:
+            A list of split Document objects.
+        """
+        if isinstance(file, str):
+            with open(file, "r", encoding="utf-8") as f:
+                html_content = f.read()
+        else:
+            html_content = file.read()
+        elements = self._get_elements(html_content)
+        documents: List[Document] = []
+        active_headers: Dict[str, Tuple[str, int, int]] = {}
+        current_chunk: List[str] = []
+        chunk_dom_depth = 0
+
+        def finalize_chunk() -> None:
+            if current_chunk:
+                final_meta = {
+                    key: content
+                    for key, (content, level, dom_depth) in active_headers.items()
+                    if chunk_dom_depth >= dom_depth
+                }
+                combined_text = "  \n".join(
+                    line for line in current_chunk if line.strip()
+                )
+                if combined_text.strip():
+                    documents.append(
+                        Document(page_content=combined_text, metadata=final_meta)
+                    )
+                current_chunk.clear()
+
+        for element in elements:
+            tag = element.name
+            if not tag:
+                continue
+            text = " ".join(
+                t
+                for t in element.find_all(string=True, recursive=False)
+                if isinstance(t, str)
+            ).strip()
+            if not text:
+                continue
+
+            level = self._header_level(tag)
+            dom_depth = self._dom_depth(element)
+
+            if tag in self.header_tags:
+                if not self.return_each_element:
+                    finalize_chunk()
+
+                # Remove headers at same or deeper level
+                headers_to_remove = [
+                    key for key, (_, lvl, _) in active_headers.items() if lvl >= level
+                ]
+                for key in headers_to_remove:
+                    del active_headers[key]
+
+                header_key = self.header_mapping[tag]
+                active_headers[header_key] = (text, level, dom_depth)
+
+                # Produce a document for the header itself
+                header_meta = {
+                    key: content
+                    for key, (content, lvl, dd) in active_headers.items()
+                    if dom_depth >= dd
+                }
+                documents.append(Document(page_content=text, metadata=header_meta))
+                # After encountering a header,
+                # no immediate content goes to current_chunk
+                # (if return_each_element is False, we wait for next content)
+                # (if return_each_element is True, we create docs per element anyway)
+            else:
+                # Non-header element logic
+                # Remove headers that don't apply if dom_depth < their dom_depth
+                headers_to_remove = [
+                    key for key, (_, _, dd) in active_headers.items() if dom_depth < dd
+                ]
+                for key in headers_to_remove:
+                    del active_headers[key]
+
+                if self.return_each_element:
+                    # Produce a doc for this element immediately
+                    element_meta = {
+                        key: content
+                        for key, (content, lvl, dd) in active_headers.items()
+                        if dom_depth >= dd
+                    }
+                    if text.strip():
+                        documents.append(
+                            Document(page_content=text, metadata=element_meta)
+                        )
+                else:
+                    # Accumulate content in current_chunk
+                    if text.strip():
+                        current_chunk.append(text)
+                        chunk_dom_depth = max(chunk_dom_depth, dom_depth)
+
+        if not self.return_each_element:
+            # finalize any remaining chunk
+            finalize_chunk()
+
+        # If no headers were found at all and return_each_element=False, behavior is:
+        # The entire content should be in one document.
+        # The logic above naturally handles it:
+        # If no recognized headers, we never split; we ended up just accumulating text
+        # in current_chunk and finalizing once at the end.
+
+        return documents
 
 
 class HTMLSectionSplitter:
```
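As a reading aid for the hunk above (not part of the commit): a short sketch of how the two output modes of the rewritten class differ, using a made-up HTML snippet. The expected results follow the `active_headers` and `finalize_chunk` bookkeeping in the new `split_text_from_file`, and the import path assumes the `langchain_text_splitters` package layout.

```python
from langchain_text_splitters import HTMLHeaderTextSplitter

html = """
<body>
  <h1>Guide</h1>
  <h2>Setup</h2>
  <p>Install the package.</p>
  <p>Configure it.</p>
</body>
"""
headers = [("h1", "Header 1"), ("h2", "Header 2")]

# Default aggregated mode: the two <p> elements share the same active
# headers, so finalize_chunk() joins them into one Document.
aggregated = HTMLHeaderTextSplitter(headers).split_text(html)
# Expected: "Guide" / "Setup" / "Install the package.  \nConfigure it."

# return_each_element=True: every element becomes its own Document,
# so the two paragraphs stay separate.
per_element = HTMLHeaderTextSplitter(
    headers, return_each_element=True
).split_text(html)
assert len(per_element) == len(aggregated) + 1
```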
```diff
@@ -269,7 +412,10 @@ class HTMLSectionSplitter:
             - 'tag_name': The name of the header tag (e.g., "h1", "h2").
         """
         try:
-            from bs4 import BeautifulSoup, PageElement  # type: ignore[import-untyped]
+            from bs4 import (
+                BeautifulSoup,  # type: ignore[import-untyped]
+                PageElement,
+            )
         except ImportError as e:
             raise ImportError(
                 "Unable to import BeautifulSoup/PageElement, \
```
```diff
@@ -343,10 +489,13 @@ class HTMLSectionSplitter:
         return str(result)
 
     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file.
+        """Split HTML content from a file into a list of Document objects.
 
         Args:
-            file: HTML file
+            file: A file path or a file-like object containing HTML content.
+
+        Returns:
+            A list of split Document objects.
         """
         file_content = file.getvalue()
         file_content = self.convert_possible_tags_to_header(file_content)
```
```diff
@@ -844,3 +993,6 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
         for placeholder, preserved_content in preserved_elements.items():
             content = content.replace(placeholder, preserved_content.strip())
         return content
+
+
+# %%
```
*Test module:*

```diff
@@ -4,7 +4,7 @@ import random
 import re
 import string
 from pathlib import Path
-from typing import Any, List
+from typing import Any, Callable, List, Tuple
 
 import pytest
 from langchain_core.documents import Document
```
```diff
@@ -2039,49 +2039,476 @@ def test_haskell_code_splitter() -> None:
     assert chunks == expected_chunks
 
 
-@pytest.mark.requires("lxml")
-def test_html_header_text_splitter(tmp_path: Path) -> None:
-    splitter = HTMLHeaderTextSplitter(
-        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
-    )
-    content = """
-<h1>Sample Document</h1>
-<h2>Section</h2>
-<p id="1234">Reference content.</p>
-
-<h2>Lists</h2>
-<ul>
-<li>Item 1</li>
-<li>Item 2</li>
-<li>Item 3</li>
-</ul>
-
-<h3>A block</h3>
-<div class="amazing">
-<p>Some text</p>
-<p>Some more text</p>
-</div>
-"""
-    docs = splitter.split_text(content)
-    expected = [
-        Document(
-            page_content="Reference content.",
-            metadata={"Header 1": "Sample Document", "Header 2": "Section"},
-        ),
-        Document(
-            page_content="Item 1 Item 2 Item 3  \nSome text  \nSome more text",
-            metadata={"Header 1": "Sample Document", "Header 2": "Lists"},
-        ),
-    ]
-    assert docs == expected
-
-    with open(tmp_path / "doc.html", "w") as tmp:
-        tmp.write(content)
-    docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
-
-    assert docs_from_file == expected
+@pytest.fixture
+@pytest.mark.requires("bs4")
+def html_header_splitter_splitter_factory() -> (
+    Callable[[List[Tuple[str, str]]], HTMLHeaderTextSplitter]
+):
+    """
+    Fixture to create an HTMLHeaderTextSplitter instance with given headers.
+    This factory allows dynamic creation of splitters with different headers.
+    """
+
+    def _create_splitter(
+        headers_to_split_on: List[Tuple[str, str]],
+    ) -> HTMLHeaderTextSplitter:
+        return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+
+    return _create_splitter
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_input, expected_documents, test_case",
+    [
+        (
+            # Test Case 1: Split on h1 and h2
+            [("h1", "Header 1"), ("h2", "Header 2")],
+            """
+            <html>
+                <body>
+                    <h1>Introduction</h1>
+                    <p>This is the introduction.</p>
+                    <h2>Background</h2>
+                    <p>Background information.</p>
+                    <h1>Conclusion</h1>
+                    <p>Final thoughts.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Introduction", metadata={"Header 1": "Introduction"}
+                ),
+                Document(
+                    page_content="This is the introduction.",
+                    metadata={"Header 1": "Introduction"},
+                ),
+                Document(
+                    page_content="Background",
+                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
+                ),
+                Document(
+                    page_content="Background information.",
+                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
+                ),
+                Document(
+                    page_content="Conclusion", metadata={"Header 1": "Conclusion"}
+                ),
+                Document(
+                    page_content="Final thoughts.", metadata={"Header 1": "Conclusion"}
+                ),
+            ],
+            "Simple headers and paragraphs",
+        ),
+        (
+            # Test Case 2: Nested headers with h1, h2, and h3
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <html>
+                <body>
+                    <div>
+                        <h1>Main Title</h1>
+                        <div>
+                            <h2>Subsection</h2>
+                            <p>Details of subsection.</p>
+                            <div>
+                                <h3>Sub-subsection</h3>
+                                <p>More details.</p>
+                            </div>
+                        </div>
+                    </div>
+                    <h1>Another Main Title</h1>
+                    <p>Content under another main title.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Main Title", metadata={"Header 1": "Main Title"}
+                ),
+                Document(
+                    page_content="Subsection",
+                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
+                ),
+                Document(
+                    page_content="Details of subsection.",
+                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
+                ),
+                Document(
+                    page_content="Sub-subsection",
+                    metadata={
+                        "Header 1": "Main Title",
+                        "Header 2": "Subsection",
+                        "Header 3": "Sub-subsection",
+                    },
+                ),
+                Document(
+                    page_content="More details.",
+                    metadata={
+                        "Header 1": "Main Title",
+                        "Header 2": "Subsection",
+                        "Header 3": "Sub-subsection",
+                    },
+                ),
+                Document(
+                    page_content="Another Main Title",
+                    metadata={"Header 1": "Another Main Title"},
+                ),
+                Document(
+                    page_content="Content under another main title.",
+                    metadata={"Header 1": "Another Main Title"},
+                ),
+            ],
+            "Nested headers with h1, h2, and h3",
+        ),
+        (
+            # Test Case 3: No headers
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <p>Paragraph one.</p>
+                    <p>Paragraph two.</p>
+                    <div>
+                        <p>Paragraph three.</p>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Paragraph one.  \nParagraph two.  \nParagraph three.",
+                    metadata={},
+                )
+            ],
+            "No headers present",
+        ),
+        (
+            # Test Case 4: Multiple headers of the same level
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <h1>Chapter 1</h1>
+                    <p>Content of chapter 1.</p>
+                    <h1>Chapter 2</h1>
+                    <p>Content of chapter 2.</p>
+                    <h1>Chapter 3</h1>
+                    <p>Content of chapter 3.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(page_content="Chapter 1", metadata={"Header 1": "Chapter 1"}),
+                Document(
+                    page_content="Content of chapter 1.",
+                    metadata={"Header 1": "Chapter 1"},
+                ),
+                Document(page_content="Chapter 2", metadata={"Header 1": "Chapter 2"}),
+                Document(
+                    page_content="Content of chapter 2.",
+                    metadata={"Header 1": "Chapter 2"},
+                ),
+                Document(page_content="Chapter 3", metadata={"Header 1": "Chapter 3"}),
+                Document(
+                    page_content="Content of chapter 3.",
+                    metadata={"Header 1": "Chapter 3"},
+                ),
+            ],
+            "Multiple headers of the same level",
+        ),
+        (
+            # Test Case 5: Headers with no content
+            [("h1", "Header 1"), ("h2", "Header 2")],
+            """
+            <html>
+                <body>
+                    <h1>Header 1</h1>
+                    <h2>Header 2</h2>
+                    <h1>Header 3</h1>
+                </body>
+            </html>
+            """,
+            [
+                Document(page_content="Header 1", metadata={"Header 1": "Header 1"}),
+                Document(
+                    page_content="Header 2",
+                    metadata={"Header 1": "Header 1", "Header 2": "Header 2"},
+                ),
+                Document(page_content="Header 3", metadata={"Header 1": "Header 3"}),
+            ],
+            "Headers with no associated content",
+        ),
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_html_header_text_splitter(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_input: str,
+    expected_documents: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test the HTML header text splitter.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory function to create
+            the HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_input (str): The HTML input string to be split.
+        expected_documents (List[Document]): List of expected Document objects.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected values.
+    """
+
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_input)
+
+    assert len(docs) == len(expected_documents), (
+        f"Test Case '{test_case}' Failed: Number of documents mismatch. "
+        f"Expected {len(expected_documents)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_documents), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"Test Case '{test_case}' Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}"
+            f"\nGot: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"Test Case '{test_case}' Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_content, expected_output, test_case",
+    [
+        (
+            # Test Case A: Split on h1 and h2 with h3 in content
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <!DOCTYPE html>
+            <html>
+            <body>
+                <div>
+                    <h1>Foo</h1>
+                    <p>Some intro text about Foo.</p>
+                    <div>
+                        <h2>Bar main section</h2>
+                        <p>Some intro text about Bar.</p>
+                        <h3>Bar subsection 1</h3>
+                        <p>Some text about the first subtopic of Bar.</p>
+                        <h3>Bar subsection 2</h3>
+                        <p>Some text about the second subtopic of Bar.</p>
+                    </div>
+                    <div>
+                        <h2>Baz</h2>
+                        <p>Some text about Baz</p>
+                    </div>
+                    <br>
+                    <p>Some concluding text about Foo</p>
+                </div>
+            </body>
+            </html>
+            """,
+            [
+                Document(metadata={"Header 1": "Foo"}, page_content="Foo"),
+                Document(
+                    metadata={"Header 1": "Foo"},
+                    page_content="Some intro text about Foo.",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
+                    page_content="Bar main section",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
+                    page_content="Some intro text about Bar.",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 1",
+                    },
+                    page_content="Bar subsection 1",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 1",
+                    },
+                    page_content="Some text about the first subtopic of Bar.",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 2",
+                    },
+                    page_content="Bar subsection 2",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 2",
+                    },
+                    page_content="Some text about the second subtopic of Bar.",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Baz"}, page_content="Baz"
+                ),
+                Document(
+                    metadata={"Header 1": "Foo"},
+                    page_content=(
+                        "Some text about Baz  \nSome concluding text about Foo"
+                    ),
+                ),
+            ],
+            "Test Case A: Split on h1, h2, and h3 with nested headers",
+        ),
+        (
+            # Test Case B: Split on h1 only without any headers
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <p>Paragraph one.</p>
+                    <p>Paragraph two.</p>
+                    <p>Paragraph three.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    metadata={},
+                    page_content="Paragraph one.  \nParagraph two.  \nParagraph three.",
+                )
+            ],
+            "Test Case B: Split on h1 only without any headers",
+        ),
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_additional_html_header_text_splitter(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_content: str,
+    expected_output: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test the HTML header text splitter.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory function to create
+            the HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_content (str): HTML content to be split.
+        expected_output (List[Document]): Expected list of Document objects.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected output.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_content)
+
+    assert len(docs) == len(expected_output), (
+        f"{test_case} Failed: Number of documents mismatch. "
+        f"Expected {len(expected_output)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_content, expected_output, test_case",
+    [
+        (
+            # Test Case C: Split on h1, h2, and h3 with no headers present
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <html>
+                <body>
+                    <p>Just some random text without headers.</p>
+                    <div>
+                        <span>More text here.</span>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Just some random text without headers."
+                    "  \nMore text here.",
+                    metadata={},
+                )
+            ],
+            "Test Case C: Split on h1, h2, and h3 without any headers",
+        )
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_html_no_headers_with_multiple_splitters(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_content: str,
+    expected_output: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test HTML content splitting without headers using multiple splitters.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory to create the
+            HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_content (str): HTML content to be split.
+        expected_output (List[Document]): Expected list of Document objects
+            after splitting.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected output.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_content)
+
+    assert len(docs) == len(expected_output), (
+        f"{test_case} Failed: Number of documents mismatch. "
+        f"Expected {len(expected_output)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
 
 
 def test_split_text_on_tokens() -> None:
```
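As a quick sanity check outside pytest, the no-headers behavior exercised by Test Case 3 above can be reproduced in a few lines. This is a sketch, assuming the splitter is importable from `langchain_text_splitters` as in the library's public API:

```python
from langchain_text_splitters import HTMLHeaderTextSplitter

splitter = HTMLHeaderTextSplitter(headers_to_split_on=[("h1", "Header 1")])
docs = splitter.split_text(
    "<html><body><p>Paragraph one.</p><p>Paragraph two.</p>"
    "<div><p>Paragraph three.</p></div></body></html>"
)

# With no matching headers, the whole body collapses into a single chunk.
assert len(docs) == 1
assert docs[0].metadata == {}
print(docs[0].page_content)  # "Paragraph one.  \nParagraph two.  \nParagraph three."
```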
```diff
@@ -2480,7 +2907,8 @@ def test_html_splitter_with_custom_extractor() -> None:
 
     expected = [
         Document(
-            page_content="This is an iframe: [iframe:http://example.com](http://example.com)",
+            page_content="This is an iframe: "
+            "[iframe:http://example.com](http://example.com)",
             metadata={"Header 1": "Section 1"},
         ),
     ]
```