Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-22 23:00:00 +00:00
text-splitters[minor]: Replace lxml and XSLT with BeautifulSoup in HTMLHeaderTextSplitter for Improved Large HTML File Processing (#27678)
This pull request updates the `HTMLHeaderTextSplitter` by replacing the implementation of its `split_text_from_file` method. The original method used `lxml` and XSLT to process HTML files, which raised `lxml.etree.XSLTApplyError: maxHead` on large HTML documents due to limitations in the XSLT processor. Fixes #13149.

By switching to BeautifulSoup (`bs4`), we achieve:

- **Improved Performance and Reliability:** BeautifulSoup efficiently processes large HTML files without the errors associated with `lxml` and XSLT.
- **Simplified Dependencies:** Removes the dependency on `lxml` and external XSLT files, relying instead on the widely used `beautifulsoup4` library.
- **Maintained Functionality:** The new method replicates the original behavior, ensuring compatibility with existing code and preserving the extraction of content and metadata.

**Issue:** This change addresses the failures users hit when processing large HTML files with the existing `HTMLHeaderTextSplitter` implementation, which raises `lxml.etree.XSLTApplyError: maxHead` on large documents.

**Dependencies:**

- **BeautifulSoup (`beautifulsoup4`):** The `beautifulsoup4` library is now used for parsing HTML content. Installation: `pip install beautifulsoup4`

**Code Changes:**

Updated the `split_text_from_file` method in `HTMLHeaderTextSplitter` as follows:

```python
def split_text_from_file(self, file: Any) -> List[Document]:
    """Split HTML file using BeautifulSoup.

    Args:
        file: HTML file path or file-like object.

    Returns:
        List of Document objects with page_content and metadata.
    """
    import bs4
    from bs4 import BeautifulSoup

    from langchain_core.documents import Document

    # Read the HTML content from the file path or file-like object
    if isinstance(file, str):
        with open(file, "r", encoding="utf-8") as f:
            html_content = f.read()
    else:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract the header tags and their corresponding metadata keys
    headers_to_split_on = [tag[0] for tag in self.headers_to_split_on]
    header_mapping = dict(self.headers_to_split_on)

    documents = []

    # Find the body of the document
    body = soup.body if soup.body else soup

    # Find all header tags in the order they appear
    all_headers = body.find_all(headers_to_split_on)

    # If there's content before the first header, collect it
    first_header = all_headers[0] if all_headers else None
    if first_header:
        pre_header_content = ""
        for elem in first_header.find_all_previous():
            if isinstance(elem, bs4.Tag):
                text = elem.get_text(separator=" ", strip=True)
                if text:
                    pre_header_content = text + " " + pre_header_content
        if pre_header_content.strip():
            documents.append(
                Document(
                    page_content=pre_header_content.strip(),
                    metadata={},  # No metadata since there's no header
                )
            )
    else:
        # If no headers are found, return the whole content
        full_text = body.get_text(separator=" ", strip=True)
        if full_text.strip():
            documents.append(Document(page_content=full_text.strip(), metadata={}))
        return documents

    # Process each header and its associated content
    for header in all_headers:
        current_metadata = {}
        header_name = header.name
        header_text = header.get_text(separator=" ", strip=True)
        current_metadata[header_mapping[header_name]] = header_text

        # Collect sibling elements until the next header tag
        content_elements = []
        for sibling in header.find_next_siblings():
            if sibling.name in headers_to_split_on:
                # Stop at the next header
                break
            if isinstance(sibling, bs4.Tag):
                content_elements.append(sibling)

        # Get the text content of the collected elements
        current_content = ""
        for elem in content_elements:
            text = elem.get_text(separator=" ", strip=True)
            if text:
                current_content += text + " "

        # Create a Document if there is content
        if current_content.strip():
            documents.append(
                Document(
                    page_content=current_content.strip(),
                    metadata=current_metadata.copy(),
                )
            )
        else:
            # No content under this header, but keep the header metadata
            documents.append(
                Document(page_content="", metadata=current_metadata.copy())
            )

    return documents
```

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
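For context, here is a minimal usage sketch of the splitter after this change. It is an illustration, not part of the diff: it assumes `langchain-text-splitters` and `beautifulsoup4` are installed, and the sample HTML is invented.

```python
# Minimal sketch: exercising the bs4-based splitter on a small document.
from langchain_text_splitters import HTMLHeaderTextSplitter

splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
)

html = """
<h1>Intro</h1>
<p>Welcome to the intro.</p>
<h2>Details</h2>
<p>Some details.</p>
"""

# No XSLT step is involved anymore, so large inputs no longer raise
# lxml.etree.XSLTApplyError: maxHead.
for doc in splitter.split_text(html):
    print(doc.metadata, "->", doc.page_content)
```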
This commit is contained in:
parent 989eec4b7b
commit d3ed9b86be
The first changed file is the `HTMLHeaderTextSplitter` implementation in `langchain_text_splitters`:

```diff
@@ -3,7 +3,7 @@ from __future__ import annotations
 import copy
 import pathlib
 import re
-from io import BytesIO, StringIO
+from io import StringIO
 from typing import (
     Any,
     Callable,
@@ -34,148 +34,291 @@ class ElementType(TypedDict):


 class HTMLHeaderTextSplitter:
-    """Splitting HTML files based on specified headers.
+    """Split HTML content into structured Documents based on specified headers.

-    Requires lxml package.
+    Splits HTML content by detecting specified header tags (e.g., <h1>, <h2>) and
+    creating hierarchical Document objects that reflect the semantic structure
+    of the original content. For each identified section, the splitter associates
+    the extracted text with metadata corresponding to the encountered headers.
+
+    If no specified headers are found, the entire content is returned as a single
+    Document. This allows for flexible handling of HTML input, ensuring that
+    information is organized according to its semantic headers.
+
+    The splitter provides the option to return each HTML element as a separate
+    Document or aggregate them into semantically meaningful chunks. It also
+    gracefully handles multiple levels of nested headers, creating a rich,
+    hierarchical representation of the content.
+
+    Args:
+        headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag,
+            header_name) pairs representing the headers that define splitting
+            boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
+            will split content by <h1> and <h2> tags, assigning their textual
+            content to the Document metadata.
+        return_each_element (bool): If True, every HTML element encountered
+            (including headers, paragraphs, etc.) is returned as a separate
+            Document. If False, content under the same header hierarchy is
+            aggregated into fewer Documents.
+
+    Returns:
+        List[Document]: A list of Document objects. Each Document contains
+        `page_content` holding the extracted text and `metadata` that maps
+        the header hierarchy to their corresponding titles.
+
+    Example:
+        .. code-block:: python
+
+            from langchain_text_splitters.html_header_text_splitter import (
+                HTMLHeaderTextSplitter,
+            )
+
+            # Define headers for splitting on h1 and h2 tags.
+            headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]
+
+            splitter = HTMLHeaderTextSplitter(
+                headers_to_split_on=headers_to_split_on,
+                return_each_element=False
+            )
+
+            html_content = \"\"\"
+            <html>
+              <body>
+                <h1>Introduction</h1>
+                <p>Welcome to the introduction section.</p>
+                <h2>Background</h2>
+                <p>Some background details here.</p>
+                <h1>Conclusion</h1>
+                <p>Final thoughts.</p>
+              </body>
+            </html>
+            \"\"\"
+
+            documents = splitter.split_text(html_content)
+
+            # 'documents' now contains Document objects reflecting the hierarchy:
+            # - Document with metadata={"Main Topic": "Introduction"} and
+            #   content="Introduction"
+            # - Document with metadata={"Main Topic": "Introduction"} and
+            #   content="Welcome to the introduction section."
+            # - Document with metadata={"Main Topic": "Introduction",
+            #   "Sub Topic": "Background"} and content="Background"
+            # - Document with metadata={"Main Topic": "Introduction",
+            #   "Sub Topic": "Background"} and content="Some background details here."
+            # - Document with metadata={"Main Topic": "Conclusion"} and
+            #   content="Conclusion"
+            # - Document with metadata={"Main Topic": "Conclusion"} and
+            #   content="Final thoughts."
     """

     def __init__(
         self,
         headers_to_split_on: List[Tuple[str, str]],
         return_each_element: bool = False,
-    ):
-        """Create a new HTMLHeaderTextSplitter.
+    ) -> None:
+        """Initialize with headers to split on.

         Args:
-            headers_to_split_on: list of tuples of headers we want to track mapped to
-                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
-                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2)].
-            return_each_element: Return each element w/ associated headers.
+            headers_to_split_on: A list of tuples where
+                each tuple contains a header tag and its corresponding value.
+            return_each_element: Whether to return each HTML
+                element as a separate Document. Defaults to False.
         """
-        # Output element-by-element or aggregated into chunks w/ common headers
+        self.headers_to_split_on = sorted(
+            headers_to_split_on, key=lambda x: int(x[0][1])
+        )
+        self.header_mapping = dict(self.headers_to_split_on)
+        self.header_tags = [tag for tag, _ in self.headers_to_split_on]
         self.return_each_element = return_each_element
-        self.headers_to_split_on = sorted(headers_to_split_on)

-    def aggregate_elements_to_chunks(
-        self, elements: List[ElementType]
-    ) -> List[Document]:
-        """Combine elements with common metadata into chunks.
-
-        Args:
-            elements: HTML element content with associated identifying info and metadata
-        """
-        aggregated_chunks: List[ElementType] = []
-
-        for element in elements:
-            if (
-                aggregated_chunks
-                and aggregated_chunks[-1]["metadata"] == element["metadata"]
-            ):
-                # If the last element in the aggregated list
-                # has the same metadata as the current element,
-                # append the current content to the last element's content
-                aggregated_chunks[-1]["content"] += " \n" + element["content"]
-            else:
-                # Otherwise, append the current element to the aggregated list
-                aggregated_chunks.append(element)
-
-        return [
-            Document(page_content=chunk["content"], metadata=chunk["metadata"])
-            for chunk in aggregated_chunks
-        ]
-
-    def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
-        """Split HTML from web URL.
-
-        Args:
-            url: web URL
-            **kwargs: Arbitrary additional keyword arguments. These are usually passed
-                to the fetch url content request.
-        """
-        r = requests.get(url, **kwargs)
-        return self.split_text_from_file(BytesIO(r.content))
-
     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string.
+        """Split the given text into a list of Document objects.

         Args:
-            text: HTML text
+            text: The HTML text to split.
+
+        Returns:
+            A list of split Document objects.
         """
         return self.split_text_from_file(StringIO(text))

-    def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file.
+    def split_text_from_url(
+        self, url: str, timeout: int = 10, **kwargs: Any
+    ) -> List[Document]:
+        """Fetch text content from a URL and split it into documents.

         Args:
-            file: HTML file
+            url: The URL to fetch content from.
+            timeout: Timeout for the request. Defaults to 10.
+            **kwargs: Additional keyword arguments for the request.
+
+        Returns:
+            A list of split Document objects.
+
+        Raises:
+            requests.RequestException: If the HTTP request fails.
         """
+        kwargs.setdefault("timeout", timeout)
+        response = requests.get(url, **kwargs)
+        response.raise_for_status()
+        return self.split_text(response.text)
+
+    def _header_level(self, tag_name: str) -> int:
+        """Determine the heading level of a tag."""
+        if tag_name.lower() in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+            return int(tag_name[1])
+        # Returns high level if it isn't a header
+        return 9999
+
+    def _dom_depth(self, element: Any) -> int:
+        """Determine the DOM depth of an element by counting its parents."""
+        depth = 0
+        for _ in element.parents:
+            depth += 1
+        return depth
+
+    def _get_elements(self, html_content: str) -> List[Any]:
+        """Parse HTML content and return a list of BeautifulSoup elements.
+
+        This helper function takes HTML content as input,
+        parses it using BeautifulSoup4, and returns all HTML elements
+        found in the document body. If no body tag exists,
+        it returns all elements in the full document.
+
+        Args:
+            html_content: Raw HTML content to be parsed.
+
+        Returns:
+            List[Any]: A list of BeautifulSoup elements found in the HTML document.
+
+        Raises:
+            ImportError: If the BeautifulSoup4 package is not installed.
+        """
         try:
-            from lxml import etree
+            from bs4 import BeautifulSoup  # type: ignore[import-untyped]
         except ImportError as e:
             raise ImportError(
-                "Unable to import lxml, please install with `pip install lxml`."
+                "Unable to import BeautifulSoup/PageElement, \
+                    please install with `pip install \
+                        bs4`."
             ) from e
-        # use lxml library to parse html document and return xml ElementTree
-        # Explicitly encoding in utf-8 allows non-English
-        # html files to be processed without garbled characters
-        parser = etree.HTMLParser(encoding="utf-8")
-        tree = etree.parse(file, parser)
+        soup = BeautifulSoup(html_content, "html.parser")
+        body = soup.body if soup.body else soup
+        return body.find_all()

-        # document transformation for "structure-aware" chunking is handled with xsl.
-        # see comments in html_chunks_with_headers.xslt for more detailed information.
-        xslt_path = pathlib.Path(__file__).parent / "xsl/html_chunks_with_headers.xslt"
-        xslt_tree = etree.parse(xslt_path)
-        transform = etree.XSLT(xslt_tree)
-        result = transform(tree)
-        result_dom = etree.fromstring(str(result))
+    def split_text_from_file(self, file: Any) -> List[Document]:
+        """Split HTML content from a file into a list of Document objects.

-        # create filter and mapping for header metadata
-        header_filter = [header[0] for header in self.headers_to_split_on]
-        header_mapping = dict(self.headers_to_split_on)
+        Args:
+            file: A file path or a file-like object containing HTML content.

-        # map xhtml namespace prefix
-        ns_map = {"h": "http://www.w3.org/1999/xhtml"}
+        Returns:
+            A list of split Document objects.
+        """
+        if isinstance(file, str):
+            with open(file, "r", encoding="utf-8") as f:
+                html_content = f.read()
+        else:
+            html_content = file.read()
+        elements = self._get_elements(html_content)
+        documents: List[Document] = []
+        active_headers: Dict[str, Tuple[str, int, int]] = {}
+        current_chunk: List[str] = []
+        chunk_dom_depth = 0

-        # build list of elements from DOM
-        elements = []
-        for element in result_dom.findall("*//*", ns_map):
-            if element.findall("*[@class='headers']") or element.findall(
-                "*[@class='chunk']"
-            ):
-                elements.append(
-                    ElementType(
-                        url=file,
-                        xpath="".join(
-                            [
-                                node.text or ""
-                                for node in element.findall("*[@class='xpath']", ns_map)
-                            ]
-                        ),
-                        content="".join(
-                            [
-                                node.text or ""
-                                for node in element.findall("*[@class='chunk']", ns_map)
-                            ]
-                        ),
-                        metadata={
-                            # Add text of specified headers to metadata using header
-                            # mapping.
-                            header_mapping[node.tag]: node.text or ""
-                            for node in filter(
-                                lambda x: x.tag in header_filter,
-                                element.findall("*[@class='headers']/*", ns_map),
-                            )
-                        },
-                    )
-                )
+        def finalize_chunk() -> None:
+            if current_chunk:
+                final_meta = {
+                    key: content
+                    for key, (content, level, dom_depth) in active_headers.items()
+                    if chunk_dom_depth >= dom_depth
+                }
+                combined_text = " \n".join(
+                    line for line in current_chunk if line.strip()
+                )
+                if combined_text.strip():
+                    documents.append(
+                        Document(page_content=combined_text, metadata=final_meta)
+                    )
+                current_chunk.clear()

+        for element in elements:
+            tag = element.name
+            if not tag:
+                continue
+            text = " ".join(
+                t
+                for t in element.find_all(string=True, recursive=False)
+                if isinstance(t, str)
+            ).strip()
+            if not text:
+                continue
+
+            level = self._header_level(tag)
+            dom_depth = self._dom_depth(element)
+
+            if tag in self.header_tags:
+                if not self.return_each_element:
+                    finalize_chunk()
+
+                # Remove headers at same or deeper level
+                headers_to_remove = [
+                    key for key, (_, lvl, _) in active_headers.items() if lvl >= level
+                ]
+                for key in headers_to_remove:
+                    del active_headers[key]
+
+                header_key = self.header_mapping[tag]
+                active_headers[header_key] = (text, level, dom_depth)
+
+                # Produce a document for the header itself
+                header_meta = {
+                    key: content
+                    for key, (content, lvl, dd) in active_headers.items()
+                    if dom_depth >= dd
+                }
+                documents.append(Document(page_content=text, metadata=header_meta))
+                # After encountering a header,
+                # no immediate content goes to current_chunk
+                # (if return_each_element is False, we wait for next content)
+                # (if return_each_element is True, we create docs per element anyway)
+            else:
+                # Non-header element logic
+                # Remove headers that don't apply if dom_depth < their dom_depth
+                headers_to_remove = [
+                    key for key, (_, _, dd) in active_headers.items() if dom_depth < dd
+                ]
+                for key in headers_to_remove:
+                    del active_headers[key]
+
+                if self.return_each_element:
+                    # Produce a doc for this element immediately
+                    element_meta = {
+                        key: content
+                        for key, (content, lvl, dd) in active_headers.items()
+                        if dom_depth >= dd
+                    }
+                    if text.strip():
+                        documents.append(
+                            Document(page_content=text, metadata=element_meta)
+                        )
+                else:
+                    # Accumulate content in current_chunk
+                    if text.strip():
+                        current_chunk.append(text)
+                        chunk_dom_depth = max(chunk_dom_depth, dom_depth)

-        if not self.return_each_element:
-            return self.aggregate_elements_to_chunks(elements)
-        else:
-            return [
-                Document(page_content=chunk["content"], metadata=chunk["metadata"])
-                for chunk in elements
-            ]
+        # finalize any remaining chunk
+        finalize_chunk()
+
+        # If no headers were found at all and return_each_element=False, behavior is:
+        # The entire content should be in one document.
+        # The logic above naturally handles it:
+        # If no recognized headers, we never split; we ended up just accumulating text
+        # in current_chunk and finalizing once at the end.
+
+        return documents


 class HTMLSectionSplitter:
@@ -269,7 +412,10 @@ class HTMLSectionSplitter:
               - 'tag_name': The name of the header tag (e.g., "h1", "h2").
         """
         try:
-            from bs4 import BeautifulSoup, PageElement  # type: ignore[import-untyped]
+            from bs4 import (
+                BeautifulSoup,  # type: ignore[import-untyped]
+                PageElement,
+            )
         except ImportError as e:
             raise ImportError(
                 "Unable to import BeautifulSoup/PageElement, \
@@ -343,10 +489,13 @@ class HTMLSectionSplitter:
         return str(result)

     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file.
+        """Split HTML content from a file into a list of Document objects.

         Args:
-            file: HTML file
+            file: A file path or a file-like object containing HTML content.
+
+        Returns:
+            A list of split Document objects.
         """
         file_content = file.getvalue()
         file_content = self.convert_possible_tags_to_header(file_content)
@@ -844,3 +993,6 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
         for placeholder, preserved_content in preserved_elements.items():
             content = content.replace(placeholder, preserved_content.strip())
         return content
+
+
+# %%
```
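To make the two output modes concrete, a small sketch (same assumptions as the example above) contrasting aggregated output with `return_each_element=True`:

```python
# Sketch: aggregated chunks vs. one Document per element.
from langchain_text_splitters import HTMLHeaderTextSplitter

html = "<h1>Topic</h1><p>First paragraph.</p><p>Second paragraph.</p>"
headers = [("h1", "Header 1")]

aggregated = HTMLHeaderTextSplitter(headers_to_split_on=headers).split_text(html)
per_element = HTMLHeaderTextSplitter(
    headers_to_split_on=headers, return_each_element=True
).split_text(html)

# Both modes emit the header itself as a Document; aggregated mode then merges
# the two sibling paragraphs into one chunk, while per-element mode keeps them
# as separate Documents.
print(len(aggregated), len(per_element))  # expected: 2 3
```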
The second changed file is the text-splitters unit test suite:

```diff
@@ -4,7 +4,7 @@ import random
 import re
 import string
 from pathlib import Path
-from typing import Any, List
+from typing import Any, Callable, List, Tuple

 import pytest
 from langchain_core.documents import Document
@@ -2039,49 +2039,476 @@ def test_haskell_code_splitter() -> None:
     assert chunks == expected_chunks


-@pytest.mark.requires("lxml")
-def test_html_header_text_splitter(tmp_path: Path) -> None:
-    splitter = HTMLHeaderTextSplitter(
-        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
-    )
-
-    content = """
-    <h1>Sample Document</h1>
-    <h2>Section</h2>
-    <p id="1234">Reference content.</p>
-
-    <h2>Lists</h2>
-    <ul>
-        <li>Item 1</li>
-        <li>Item 2</li>
-        <li>Item 3</li>
-    </ul>
-
-    <h3>A block</h3>
-    <div class="amazing">
-        <p>Some text</p>
-        <p>Some more text</p>
-    </div>
-    """
+@pytest.fixture
+@pytest.mark.requires("bs4")
+def html_header_splitter_splitter_factory() -> (
+    Callable[[List[Tuple[str, str]]], HTMLHeaderTextSplitter]
+):
+    """
+    Fixture to create an HTMLHeaderTextSplitter instance with given headers.
+    This factory allows dynamic creation of splitters with different headers.
+    """

-    docs = splitter.split_text(content)
-    expected = [
-        Document(
-            page_content="Reference content.",
-            metadata={"Header 1": "Sample Document", "Header 2": "Section"},
-        ),
-        Document(
-            page_content="Item 1 Item 2 Item 3 \nSome text \nSome more text",
-            metadata={"Header 1": "Sample Document", "Header 2": "Lists"},
-        ),
-    ]
-    assert docs == expected
+    def _create_splitter(
+        headers_to_split_on: List[Tuple[str, str]],
+    ) -> HTMLHeaderTextSplitter:
+        return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

-    with open(tmp_path / "doc.html", "w") as tmp:
-        tmp.write(content)
-    docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
+    return _create_splitter

-    assert docs_from_file == expected

+@pytest.mark.parametrize(
+    "headers_to_split_on, html_input, expected_documents, test_case",
+    [
+        (
+            # Test Case 1: Split on h1 and h2
+            [("h1", "Header 1"), ("h2", "Header 2")],
+            """
+            <html>
+                <body>
+                    <h1>Introduction</h1>
+                    <p>This is the introduction.</p>
+                    <h2>Background</h2>
+                    <p>Background information.</p>
+                    <h1>Conclusion</h1>
+                    <p>Final thoughts.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Introduction", metadata={"Header 1": "Introduction"}
+                ),
+                Document(
+                    page_content="This is the introduction.",
+                    metadata={"Header 1": "Introduction"},
+                ),
+                Document(
+                    page_content="Background",
+                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
+                ),
+                Document(
+                    page_content="Background information.",
+                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
+                ),
+                Document(
+                    page_content="Conclusion", metadata={"Header 1": "Conclusion"}
+                ),
+                Document(
+                    page_content="Final thoughts.", metadata={"Header 1": "Conclusion"}
+                ),
+            ],
+            "Simple headers and paragraphs",
+        ),
+        (
+            # Test Case 2: Nested headers with h1, h2, and h3
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <html>
+                <body>
+                    <div>
+                        <h1>Main Title</h1>
+                        <div>
+                            <h2>Subsection</h2>
+                            <p>Details of subsection.</p>
+                            <div>
+                                <h3>Sub-subsection</h3>
+                                <p>More details.</p>
+                            </div>
+                        </div>
+                    </div>
+                    <h1>Another Main Title</h1>
+                    <p>Content under another main title.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Main Title", metadata={"Header 1": "Main Title"}
+                ),
+                Document(
+                    page_content="Subsection",
+                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
+                ),
+                Document(
+                    page_content="Details of subsection.",
+                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
+                ),
+                Document(
+                    page_content="Sub-subsection",
+                    metadata={
+                        "Header 1": "Main Title",
+                        "Header 2": "Subsection",
+                        "Header 3": "Sub-subsection",
+                    },
+                ),
+                Document(
+                    page_content="More details.",
+                    metadata={
+                        "Header 1": "Main Title",
+                        "Header 2": "Subsection",
+                        "Header 3": "Sub-subsection",
+                    },
+                ),
+                Document(
+                    page_content="Another Main Title",
+                    metadata={"Header 1": "Another Main Title"},
+                ),
+                Document(
+                    page_content="Content under another main title.",
+                    metadata={"Header 1": "Another Main Title"},
+                ),
+            ],
+            "Nested headers with h1, h2, and h3",
+        ),
+        (
+            # Test Case 3: No headers
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <p>Paragraph one.</p>
+                    <p>Paragraph two.</p>
+                    <div>
+                        <p>Paragraph three.</p>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Paragraph one. \nParagraph two. \nParagraph three.",
+                    metadata={},
+                )
+            ],
+            "No headers present",
+        ),
+        (
+            # Test Case 4: Multiple headers of the same level
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <h1>Chapter 1</h1>
+                    <p>Content of chapter 1.</p>
+                    <h1>Chapter 2</h1>
+                    <p>Content of chapter 2.</p>
+                    <h1>Chapter 3</h1>
+                    <p>Content of chapter 3.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(page_content="Chapter 1", metadata={"Header 1": "Chapter 1"}),
+                Document(
+                    page_content="Content of chapter 1.",
+                    metadata={"Header 1": "Chapter 1"},
+                ),
+                Document(page_content="Chapter 2", metadata={"Header 1": "Chapter 2"}),
+                Document(
+                    page_content="Content of chapter 2.",
+                    metadata={"Header 1": "Chapter 2"},
+                ),
+                Document(page_content="Chapter 3", metadata={"Header 1": "Chapter 3"}),
+                Document(
+                    page_content="Content of chapter 3.",
+                    metadata={"Header 1": "Chapter 3"},
+                ),
+            ],
+            "Multiple headers of the same level",
+        ),
+        (
+            # Test Case 5: Headers with no content
+            [("h1", "Header 1"), ("h2", "Header 2")],
+            """
+            <html>
+                <body>
+                    <h1>Header 1</h1>
+                    <h2>Header 2</h2>
+                    <h1>Header 3</h1>
+                </body>
+            </html>
+            """,
+            [
+                Document(page_content="Header 1", metadata={"Header 1": "Header 1"}),
+                Document(
+                    page_content="Header 2",
+                    metadata={"Header 1": "Header 1", "Header 2": "Header 2"},
+                ),
+                Document(page_content="Header 3", metadata={"Header 1": "Header 3"}),
+            ],
+            "Headers with no associated content",
+        ),
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_html_header_text_splitter(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_input: str,
+    expected_documents: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test the HTML header text splitter.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory function to create
+            the HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_input (str): The HTML input string to be split.
+        expected_documents (List[Document]): List of expected Document objects.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected values.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_input)
+
+    assert len(docs) == len(expected_documents), (
+        f"Test Case '{test_case}' Failed: Number of documents mismatch. "
+        f"Expected {len(expected_documents)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_documents), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"Test Case '{test_case}' Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"Test Case '{test_case}' Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_content, expected_output, test_case",
+    [
+        (
+            # Test Case A: Split on h1 and h2 with h3 in content
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <!DOCTYPE html>
+            <html>
+                <body>
+                    <div>
+                        <h1>Foo</h1>
+                        <p>Some intro text about Foo.</p>
+                        <div>
+                            <h2>Bar main section</h2>
+                            <p>Some intro text about Bar.</p>
+                            <h3>Bar subsection 1</h3>
+                            <p>Some text about the first subtopic of Bar.</p>
+                            <h3>Bar subsection 2</h3>
+                            <p>Some text about the second subtopic of Bar.</p>
+                        </div>
+                        <div>
+                            <h2>Baz</h2>
+                            <p>Some text about Baz</p>
+                        </div>
+                        <br>
+                        <p>Some concluding text about Foo</p>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(metadata={"Header 1": "Foo"}, page_content="Foo"),
+                Document(
+                    metadata={"Header 1": "Foo"},
+                    page_content="Some intro text about Foo.",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
+                    page_content="Bar main section",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
+                    page_content="Some intro text about Bar.",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 1",
+                    },
+                    page_content="Bar subsection 1",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 1",
+                    },
+                    page_content="Some text about the first subtopic of Bar.",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 2",
+                    },
+                    page_content="Bar subsection 2",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 2",
+                    },
+                    page_content="Some text about the second subtopic of Bar.",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Baz"}, page_content="Baz"
+                ),
+                Document(
+                    metadata={"Header 1": "Foo"},
+                    page_content=(
+                        "Some text about Baz \nSome concluding text about Foo"
+                    ),
+                ),
+            ],
+            "Test Case A: Split on h1, h2, and h3 with nested headers",
+        ),
+        (
+            # Test Case B: Split on h1 only without any headers
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <p>Paragraph one.</p>
+                    <p>Paragraph two.</p>
+                    <p>Paragraph three.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    metadata={},
+                    page_content="Paragraph one. \nParagraph two. \nParagraph three.",
+                )
+            ],
+            "Test Case B: Split on h1 only without any headers",
+        ),
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_additional_html_header_text_splitter(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_content: str,
+    expected_output: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test the HTML header text splitter.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory function to create
+            the HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_content (str): HTML content to be split.
+        expected_output (List[Document]): Expected list of Document objects.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected output.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_content)
+
+    assert len(docs) == len(expected_output), (
+        f"{test_case} Failed: Number of documents mismatch. "
+        f"Expected {len(expected_output)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_content, expected_output, test_case",
+    [
+        (
+            # Test Case C: Split on h1, h2, and h3 with no headers present
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <html>
+                <body>
+                    <p>Just some random text without headers.</p>
+                    <div>
+                        <span>More text here.</span>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Just some random text without headers."
+                    " \nMore text here.",
+                    metadata={},
+                )
+            ],
+            "Test Case C: Split on h1, h2, and h3 without any headers",
+        )
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_html_no_headers_with_multiple_splitters(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_content: str,
+    expected_output: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test HTML content splitting without headers using multiple splitters.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory to create the
+            HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_content (str): HTML content to be split.
+        expected_output (List[Document]): Expected list of Document objects
+            after splitting.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected output.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_content)
+
+    assert len(docs) == len(expected_output), (
+        f"{test_case} Failed: Number of documents mismatch. "
+        f"Expected {len(expected_output)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )


 def test_split_text_on_tokens() -> None:
@@ -2480,7 +2907,8 @@ def test_html_splitter_with_custom_extractor() -> None:

     expected = [
         Document(
-            page_content="This is an iframe: [iframe:http://example.com](http://example.com)",
+            page_content="This is an iframe: "
+            "[iframe:http://example.com](http://example.com)",
             metadata={"Header 1": "Section 1"},
         ),
     ]
@@ -2598,7 +3026,7 @@ def test_html_splitter_with_small_chunk_size() -> None:
     """Test HTML splitting with a very small chunk size to validate chunking."""
     html_content = """
    <h1>Section 1</h1>
-    <p>This is some long text that should be split into multiple chunks due to the
+    <p>This is some long text that should be split into multiple chunks due to the
    small chunk size.</p>
    """
     splitter = HTMLSemanticPreservingSplitter(
```