text-splitters[minor]: Replace lxml and XSLT with BeautifulSoup in HTMLHeaderTextSplitter for Improved Large HTML File Processing (#27678)

This pull request updates the `HTMLHeaderTextSplitter` by replacing the
`split_text_from_file` method's implementation. The original method used
`lxml` and XSLT for processing HTML files, which caused an
`lxml.etree.XSLTApplyError` (`maxHead`) when handling large HTML documents
due to limitations in the XSLT processor. Fixes #13149

By switching to BeautifulSoup (`bs4`), we achieve:

- **Improved Performance and Reliability:** BeautifulSoup efficiently
processes large HTML files without the errors associated with `lxml` and
XSLT.
- **Simplified Dependencies:** Removes the dependency on `lxml` and
external XSLT files, relying instead on the widely used `beautifulsoup4`
library.
- **Maintained Functionality:** The new method replicates the original
behavior, ensuring compatibility with existing code and preserving the
extraction of content and metadata.

**Issue:**

This change addresses issues related to processing large HTML files with
the existing `HTMLHeaderTextSplitter` implementation. It resolves
problems where users encounter an `lxml.etree.XSLTApplyError` (`maxHead`)
due to large HTML documents.

**Dependencies:**

- **BeautifulSoup (`beautifulsoup4`):** The `beautifulsoup4` library is
now used for parsing HTML content.
  - Installation: `pip install beautifulsoup4`

**Code Changes:**

Updated the `split_text_from_file` method in `HTMLHeaderTextSplitter` as
follows:

```python
def split_text_from_file(self, file: Any) -> List[Document]:
    """Split HTML from a file path or file-like object using BeautifulSoup.

    Args:
        file: HTML file path (str) or a file-like object whose ``read()``
            returns the HTML content.

    Returns:
        List of Document objects. Each document's metadata maps the
        configured header names to the text of the headers governing that
        section. Content appearing before the first matching header — or a
        document containing no matching headers at all — is returned with
        empty metadata.
    """
    import bs4
    from bs4 import BeautifulSoup
    from langchain.docstore.document import Document

    # Read the HTML content from the file path or file-like object.
    if isinstance(file, str):
        with open(file, "r", encoding="utf-8") as f:
            html_content = f.read()
    else:
        # Assume a file-like object.
        html_content = file.read()

    soup = BeautifulSoup(html_content, "html.parser")

    # Header tags to split on and their metadata-key mapping, e.g.
    # [("h1", "Header 1")] -> tags ["h1"], mapping {"h1": "Header 1"}.
    headers_to_split_on = [tag for tag, _ in self.headers_to_split_on]
    header_mapping = dict(self.headers_to_split_on)

    documents: List[Document] = []

    # Restrict processing to <body> when one exists.
    body = soup.body if soup.body else soup

    # All splitting headers, in document order.
    all_headers = body.find_all(headers_to_split_on)

    # No headers at all: return the entire content as a single document.
    if not all_headers:
        full_text = body.get_text(separator=" ", strip=True)
        if full_text.strip():
            documents.append(Document(page_content=full_text, metadata={}))
        return documents

    # Collect content appearing before the first header. Iterate *text
    # nodes* rather than tags: visiting tags would also visit their
    # ancestors, and an ancestor's get_text() repeats all descendant text
    # (for <body>/<html> it would even include the header itself), causing
    # massive duplication.
    first_header = all_headers[0]
    pre_header_texts = []
    for text_node in first_header.find_all_previous(string=True):
        stripped = text_node.strip()
        if stripped:
            pre_header_texts.append(stripped)
    # find_all_previous yields nodes in reverse document order.
    pre_header_texts.reverse()
    pre_header_content = " ".join(pre_header_texts)
    if pre_header_content.strip():
        documents.append(
            Document(
                page_content=pre_header_content.strip(),
                metadata={},  # No metadata since there's no header.
            )
        )

    # Process each header and the content associated with it.
    for header in all_headers:
        header_text = header.get_text(separator=" ", strip=True)
        current_metadata = {header_mapping[header.name]: header_text}

        # Gather sibling elements up to (but not including) the next
        # splitting header at the same DOM level.
        section_parts = []
        for sibling in header.find_next_siblings():
            if sibling.name in headers_to_split_on:
                # Stop at the next splitting header.
                break
            if isinstance(sibling, bs4.Tag):
                text = sibling.get_text(separator=" ", strip=True)
                if text:
                    section_parts.append(text)

        current_content = " ".join(section_parts)

        # Emit a document even when the section has no content so the
        # header's metadata is preserved.
        documents.append(
            Document(
                page_content=current_content.strip(),
                metadata=current_metadata,
            )
        )

    return documents
```

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Ahmed Tammaa 2025-01-20 23:10:37 +02:00 committed by GitHub
parent 989eec4b7b
commit d3ed9b86be
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 736 additions and 156 deletions

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import copy import copy
import pathlib import pathlib
import re import re
from io import BytesIO, StringIO from io import StringIO
from typing import ( from typing import (
Any, Any,
Callable, Callable,
@ -34,148 +34,291 @@ class ElementType(TypedDict):
class HTMLHeaderTextSplitter: class HTMLHeaderTextSplitter:
"""Splitting HTML files based on specified headers. """Split HTML content into structured Documents based on specified headers.
Requires lxml package. Splits HTML content by detecting specified header tags (e.g., <h1>, <h2>) and
creating hierarchical Document objects that reflect the semantic structure
of the original content. For each identified section, the splitter associates
the extracted text with metadata corresponding to the encountered headers.
If no specified headers are found, the entire content is returned as a single
Document. This allows for flexible handling of HTML input, ensuring that
information is organized according to its semantic headers.
The splitter provides the option to return each HTML element as a separate
Document or aggregate them into semantically meaningful chunks. It also
gracefully handles multiple levels of nested headers, creating a rich,
hierarchical representation of the content.
Args:
headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag,
header_name) pairs representing the headers that define splitting
boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
will split content by <h1> and <h2> tags, assigning their textual
content to the Document metadata.
return_each_element (bool): If True, every HTML element encountered
(including headers, paragraphs, etc.) is returned as a separate
Document. If False, content under the same header hierarchy is
aggregated into fewer Documents.
Returns:
List[Document]: A list of Document objects. Each Document contains
`page_content` holding the extracted text and `metadata` that maps
the header hierarchy to their corresponding titles.
Example:
.. code-block:: python
from langchain_text_splitters.html_header_text_splitter import (
HTMLHeaderTextSplitter,
)
# Define headers for splitting on h1 and h2 tags.
headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]
splitter = HTMLHeaderTextSplitter(
headers_to_split_on=headers_to_split_on,
return_each_element=False
)
html_content = \"\"\"
<html>
<body>
<h1>Introduction</h1>
<p>Welcome to the introduction section.</p>
<h2>Background</h2>
<p>Some background details here.</p>
<h1>Conclusion</h1>
<p>Final thoughts.</p>
</body>
</html>
\"\"\"
documents = splitter.split_text(html_content)
# 'documents' now contains Document objects reflecting the hierarchy:
# - Document with metadata={"Main Topic": "Introduction"} and
# content="Introduction"
# - Document with metadata={"Main Topic": "Introduction"} and
# content="Welcome to the introduction section."
# - Document with metadata={"Main Topic": "Introduction",
# "Sub Topic": "Background"} and content="Background"
# - Document with metadata={"Main Topic": "Introduction",
# "Sub Topic": "Background"} and content="Some background details here."
# - Document with metadata={"Main Topic": "Conclusion"} and
# content="Conclusion"
# - Document with metadata={"Main Topic": "Conclusion"} and
# content="Final thoughts."
""" """
def __init__( def __init__(
self, self,
headers_to_split_on: List[Tuple[str, str]], headers_to_split_on: List[Tuple[str, str]],
return_each_element: bool = False, return_each_element: bool = False,
): ) -> None:
"""Create a new HTMLHeaderTextSplitter. """Initialize with headers to split on.
Args: Args:
headers_to_split_on: list of tuples of headers we want to track mapped to headers_to_split_on: A list of tuples where
(arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4, each tuple contains a header tag and its corresponding value.
h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2)]. return_each_element: Whether to return each HTML
return_each_element: Return each element w/ associated headers. element as a separate Document. Defaults to False.
""" """
# Output element-by-element or aggregated into chunks w/ common headers self.headers_to_split_on = sorted(
headers_to_split_on, key=lambda x: int(x[0][1])
)
self.header_mapping = dict(self.headers_to_split_on)
self.header_tags = [tag for tag, _ in self.headers_to_split_on]
self.return_each_element = return_each_element self.return_each_element = return_each_element
self.headers_to_split_on = sorted(headers_to_split_on)
def aggregate_elements_to_chunks(
self, elements: List[ElementType]
) -> List[Document]:
"""Combine elements with common metadata into chunks.
Args:
elements: HTML element content with associated identifying info and metadata
"""
aggregated_chunks: List[ElementType] = []
for element in elements:
if (
aggregated_chunks
and aggregated_chunks[-1]["metadata"] == element["metadata"]
):
# If the last element in the aggregated list
# has the same metadata as the current element,
# append the current content to the last element's content
aggregated_chunks[-1]["content"] += " \n" + element["content"]
else:
# Otherwise, append the current element to the aggregated list
aggregated_chunks.append(element)
return [
Document(page_content=chunk["content"], metadata=chunk["metadata"])
for chunk in aggregated_chunks
]
def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
"""Split HTML from web URL.
Args:
url: web URL
**kwargs: Arbitrary additional keyword arguments. These are usually passed
to the fetch url content request.
"""
r = requests.get(url, **kwargs)
return self.split_text_from_file(BytesIO(r.content))
def split_text(self, text: str) -> List[Document]: def split_text(self, text: str) -> List[Document]:
"""Split HTML text string. """Split the given text into a list of Document objects.
Args: Args:
text: HTML text text: The HTML text to split.
Returns:
A list of split Document objects.
""" """
return self.split_text_from_file(StringIO(text)) return self.split_text_from_file(StringIO(text))
def split_text_from_file(self, file: Any) -> List[Document]: def split_text_from_url(
"""Split HTML file. self, url: str, timeout: int = 10, **kwargs: Any
) -> List[Document]:
"""Fetch text content from a URL and split it into documents.
Args: Args:
file: HTML file url: The URL to fetch content from.
timeout: Timeout for the request. Defaults to 10.
**kwargs: Additional keyword arguments for the request.
Returns:
A list of split Document objects.
Raises:
requests.RequestException: If the HTTP request fails.
"""
kwargs.setdefault("timeout", timeout)
response = requests.get(url, **kwargs)
response.raise_for_status()
return self.split_text(response.text)
def _header_level(self, tag_name: str) -> int:
"""Determine the heading level of a tag."""
if tag_name.lower() in ["h1", "h2", "h3", "h4", "h5", "h6"]:
return int(tag_name[1])
# Returns high level if it isn't a header
return 9999
def _dom_depth(self, element: Any) -> int:
"""Determine the DOM depth of an element by counting its parents."""
depth = 0
for _ in element.parents:
depth += 1
return depth
def _get_elements(self, html_content: str) -> List[Any]:
"""Parse HTML content and return a list of BeautifulSoup elements.
This helper function takes HTML content as input,
parses it using BeautifulSoup4, and returns all HTML elements
found in the document body. If no body tag exists,
it returns all elements in the full document.
Args:
html_content: Raw HTML content to be parsed.
Returns:
List[Any]: A list of BeautifulSoup elements found in the HTML document.
Raises:
ImportError: If the BeautifulSoup4 package is not installed.
""" """
try: try:
from lxml import etree from bs4 import BeautifulSoup # type: ignore[import-untyped]
except ImportError as e: except ImportError as e:
raise ImportError( raise ImportError(
"Unable to import lxml, please install with `pip install lxml`." "Unable to import BeautifulSoup/PageElement, \
please install with `pip install \
bs4`."
) from e ) from e
# use lxml library to parse html document and return xml ElementTree soup = BeautifulSoup(html_content, "html.parser")
# Explicitly encoding in utf-8 allows non-English body = soup.body if soup.body else soup
# html files to be processed without garbled characters return body.find_all()
parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse(file, parser)
# document transformation for "structure-aware" chunking is handled with xsl. def split_text_from_file(self, file: Any) -> List[Document]:
# see comments in html_chunks_with_headers.xslt for more detailed information. """Split HTML content from a file into a list of Document objects.
xslt_path = pathlib.Path(__file__).parent / "xsl/html_chunks_with_headers.xslt"
xslt_tree = etree.parse(xslt_path)
transform = etree.XSLT(xslt_tree)
result = transform(tree)
result_dom = etree.fromstring(str(result))
# create filter and mapping for header metadata Args:
header_filter = [header[0] for header in self.headers_to_split_on] file: A file path or a file-like object containing HTML content.
header_mapping = dict(self.headers_to_split_on)
# map xhtml namespace prefix Returns:
ns_map = {"h": "http://www.w3.org/1999/xhtml"} A list of split Document objects.
"""
if isinstance(file, str):
with open(file, "r", encoding="utf-8") as f:
html_content = f.read()
else:
html_content = file.read()
elements = self._get_elements(html_content)
documents: List[Document] = []
active_headers: Dict[str, Tuple[str, int, int]] = {}
current_chunk: List[str] = []
chunk_dom_depth = 0
# build list of elements from DOM def finalize_chunk() -> None:
elements = [] if current_chunk:
for element in result_dom.findall("*//*", ns_map): final_meta = {
if element.findall("*[@class='headers']") or element.findall( key: content
"*[@class='chunk']" for key, (content, level, dom_depth) in active_headers.items()
): if chunk_dom_depth >= dom_depth
elements.append( }
ElementType( combined_text = " \n".join(
url=file, line for line in current_chunk if line.strip()
xpath="".join(
[
node.text or ""
for node in element.findall("*[@class='xpath']", ns_map)
]
),
content="".join(
[
node.text or ""
for node in element.findall("*[@class='chunk']", ns_map)
]
),
metadata={
# Add text of specified headers to metadata using header
# mapping.
header_mapping[node.tag]: node.text or ""
for node in filter(
lambda x: x.tag in header_filter,
element.findall("*[@class='headers']/*", ns_map),
)
},
)
) )
if combined_text.strip():
documents.append(
Document(page_content=combined_text, metadata=final_meta)
)
current_chunk.clear()
for element in elements:
tag = element.name
if not tag:
continue
text = " ".join(
t
for t in element.find_all(string=True, recursive=False)
if isinstance(t, str)
).strip()
if not text:
continue
level = self._header_level(tag)
dom_depth = self._dom_depth(element)
if tag in self.header_tags:
if not self.return_each_element:
finalize_chunk()
# Remove headers at same or deeper level
headers_to_remove = [
key for key, (_, lvl, _) in active_headers.items() if lvl >= level
]
for key in headers_to_remove:
del active_headers[key]
header_key = self.header_mapping[tag]
active_headers[header_key] = (text, level, dom_depth)
# Produce a document for the header itself
header_meta = {
key: content
for key, (content, lvl, dd) in active_headers.items()
if dom_depth >= dd
}
documents.append(Document(page_content=text, metadata=header_meta))
# After encountering a header,
# no immediate content goes to current_chunk
# (if return_each_element is False, we wait for next content)
# (if return_each_element is True, we create docs per element anyway)
else:
# Non-header element logic
# Remove headers that don't apply if dom_depth < their dom_depth
headers_to_remove = [
key for key, (_, _, dd) in active_headers.items() if dom_depth < dd
]
for key in headers_to_remove:
del active_headers[key]
if self.return_each_element:
# Produce a doc for this element immediately
element_meta = {
key: content
for key, (content, lvl, dd) in active_headers.items()
if dom_depth >= dd
}
if text.strip():
documents.append(
Document(page_content=text, metadata=element_meta)
)
else:
# Accumulate content in current_chunk
if text.strip():
current_chunk.append(text)
chunk_dom_depth = max(chunk_dom_depth, dom_depth)
if not self.return_each_element: if not self.return_each_element:
return self.aggregate_elements_to_chunks(elements) # finalize any remaining chunk
else: finalize_chunk()
return [
Document(page_content=chunk["content"], metadata=chunk["metadata"]) # If no headers were found at all and return_each_element=False, behavior is:
for chunk in elements # The entire content should be in one document.
] # The logic above naturally handles it:
# If no recognized headers, we never split; we ended up just accumulating text
# in current_chunk and finalizing once at the end.
return documents
class HTMLSectionSplitter: class HTMLSectionSplitter:
@ -269,7 +412,10 @@ class HTMLSectionSplitter:
- 'tag_name': The name of the header tag (e.g., "h1", "h2"). - 'tag_name': The name of the header tag (e.g., "h1", "h2").
""" """
try: try:
from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped] from bs4 import (
BeautifulSoup, # type: ignore[import-untyped]
PageElement,
)
except ImportError as e: except ImportError as e:
raise ImportError( raise ImportError(
"Unable to import BeautifulSoup/PageElement, \ "Unable to import BeautifulSoup/PageElement, \
@ -343,10 +489,13 @@ class HTMLSectionSplitter:
return str(result) return str(result)
def split_text_from_file(self, file: Any) -> List[Document]: def split_text_from_file(self, file: Any) -> List[Document]:
"""Split HTML file. """Split HTML content from a file into a list of Document objects.
Args: Args:
file: HTML file file: A file path or a file-like object containing HTML content.
Returns:
A list of split Document objects.
""" """
file_content = file.getvalue() file_content = file.getvalue()
file_content = self.convert_possible_tags_to_header(file_content) file_content = self.convert_possible_tags_to_header(file_content)
@ -844,3 +993,6 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
for placeholder, preserved_content in preserved_elements.items(): for placeholder, preserved_content in preserved_elements.items():
content = content.replace(placeholder, preserved_content.strip()) content = content.replace(placeholder, preserved_content.strip())
return content return content
# %%

View File

@ -4,7 +4,7 @@ import random
import re import re
import string import string
from pathlib import Path from pathlib import Path
from typing import Any, List from typing import Any, Callable, List, Tuple
import pytest import pytest
from langchain_core.documents import Document from langchain_core.documents import Document
@ -2039,49 +2039,476 @@ def test_haskell_code_splitter() -> None:
assert chunks == expected_chunks assert chunks == expected_chunks
@pytest.mark.requires("lxml") @pytest.fixture
def test_html_header_text_splitter(tmp_path: Path) -> None: @pytest.mark.requires("bs4")
splitter = HTMLHeaderTextSplitter( def html_header_splitter_splitter_factory() -> (
headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")] Callable[[List[Tuple[str, str]]], HTMLHeaderTextSplitter]
) ):
"""
content = """ Fixture to create an HTMLHeaderTextSplitter instance with given headers.
<h1>Sample Document</h1> This factory allows dynamic creation of splitters with different headers.
<h2>Section</h2>
<p id="1234">Reference content.</p>
<h2>Lists</h2>
<ul>
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</ul>
<h3>A block</h3>
<div class="amazing">
<p>Some text</p>
<p>Some more text</p>
</div>
""" """
docs = splitter.split_text(content) def _create_splitter(
expected = [ headers_to_split_on: List[Tuple[str, str]],
Document( ) -> HTMLHeaderTextSplitter:
page_content="Reference content.", return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
metadata={"Header 1": "Sample Document", "Header 2": "Section"},
),
Document(
page_content="Item 1 Item 2 Item 3 \nSome text \nSome more text",
metadata={"Header 1": "Sample Document", "Header 2": "Lists"},
),
]
assert docs == expected
with open(tmp_path / "doc.html", "w") as tmp: return _create_splitter
tmp.write(content)
docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
assert docs_from_file == expected
@pytest.mark.parametrize(
"headers_to_split_on, html_input, expected_documents, test_case",
[
(
# Test Case 1: Split on h1 and h2
[("h1", "Header 1"), ("h2", "Header 2")],
"""
<html>
<body>
<h1>Introduction</h1>
<p>This is the introduction.</p>
<h2>Background</h2>
<p>Background information.</p>
<h1>Conclusion</h1>
<p>Final thoughts.</p>
</body>
</html>
""",
[
Document(
page_content="Introduction", metadata={"Header 1": "Introduction"}
),
Document(
page_content="This is the introduction.",
metadata={"Header 1": "Introduction"},
),
Document(
page_content="Background",
metadata={"Header 1": "Introduction", "Header 2": "Background"},
),
Document(
page_content="Background information.",
metadata={"Header 1": "Introduction", "Header 2": "Background"},
),
Document(
page_content="Conclusion", metadata={"Header 1": "Conclusion"}
),
Document(
page_content="Final thoughts.", metadata={"Header 1": "Conclusion"}
),
],
"Simple headers and paragraphs",
),
(
# Test Case 2: Nested headers with h1, h2, and h3
[("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
"""
<html>
<body>
<div>
<h1>Main Title</h1>
<div>
<h2>Subsection</h2>
<p>Details of subsection.</p>
<div>
<h3>Sub-subsection</h3>
<p>More details.</p>
</div>
</div>
</div>
<h1>Another Main Title</h1>
<p>Content under another main title.</p>
</body>
</html>
""",
[
Document(
page_content="Main Title", metadata={"Header 1": "Main Title"}
),
Document(
page_content="Subsection",
metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
),
Document(
page_content="Details of subsection.",
metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
),
Document(
page_content="Sub-subsection",
metadata={
"Header 1": "Main Title",
"Header 2": "Subsection",
"Header 3": "Sub-subsection",
},
),
Document(
page_content="More details.",
metadata={
"Header 1": "Main Title",
"Header 2": "Subsection",
"Header 3": "Sub-subsection",
},
),
Document(
page_content="Another Main Title",
metadata={"Header 1": "Another Main Title"},
),
Document(
page_content="Content under another main title.",
metadata={"Header 1": "Another Main Title"},
),
],
"Nested headers with h1, h2, and h3",
),
(
# Test Case 3: No headers
[("h1", "Header 1")],
"""
<html>
<body>
<p>Paragraph one.</p>
<p>Paragraph two.</p>
<div>
<p>Paragraph three.</p>
</div>
</body>
</html>
""",
[
Document(
page_content="Paragraph one. \nParagraph two. \nParagraph three.",
metadata={},
)
],
"No headers present",
),
(
# Test Case 4: Multiple headers of the same level
[("h1", "Header 1")],
"""
<html>
<body>
<h1>Chapter 1</h1>
<p>Content of chapter 1.</p>
<h1>Chapter 2</h1>
<p>Content of chapter 2.</p>
<h1>Chapter 3</h1>
<p>Content of chapter 3.</p>
</body>
</html>
""",
[
Document(page_content="Chapter 1", metadata={"Header 1": "Chapter 1"}),
Document(
page_content="Content of chapter 1.",
metadata={"Header 1": "Chapter 1"},
),
Document(page_content="Chapter 2", metadata={"Header 1": "Chapter 2"}),
Document(
page_content="Content of chapter 2.",
metadata={"Header 1": "Chapter 2"},
),
Document(page_content="Chapter 3", metadata={"Header 1": "Chapter 3"}),
Document(
page_content="Content of chapter 3.",
metadata={"Header 1": "Chapter 3"},
),
],
"Multiple headers of the same level",
),
(
# Test Case 5: Headers with no content
[("h1", "Header 1"), ("h2", "Header 2")],
"""
<html>
<body>
<h1>Header 1</h1>
<h2>Header 2</h2>
<h1>Header 3</h1>
</body>
</html>
""",
[
Document(page_content="Header 1", metadata={"Header 1": "Header 1"}),
Document(
page_content="Header 2",
metadata={"Header 1": "Header 1", "Header 2": "Header 2"},
),
Document(page_content="Header 3", metadata={"Header 1": "Header 3"}),
],
"Headers with no associated content",
),
],
)
@pytest.mark.requires("bs4")
def test_html_header_text_splitter(
html_header_splitter_splitter_factory: Any,
headers_to_split_on: List[Tuple[str, str]],
html_input: str,
expected_documents: List[Document],
test_case: str,
) -> None:
"""
Test the HTML header text splitter.
Args:
html_header_splitter_splitter_factory (Any): Factory function to create
the HTML header splitter.
headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
html_input (str): The HTML input string to be split.
expected_documents (List[Document]): List of expected Document objects.
test_case (str): Description of the test case.
Raises:
AssertionError: If the number of documents or their content/metadata
does not match the expected values.
"""
splitter = html_header_splitter_splitter_factory(
headers_to_split_on=headers_to_split_on
)
docs = splitter.split_text(html_input)
assert len(docs) == len(expected_documents), (
f"Test Case '{test_case}' Failed: Number of documents mismatch. "
f"Expected {len(expected_documents)}, got {len(docs)}."
)
for idx, (doc, expected) in enumerate(zip(docs, expected_documents), start=1):
assert doc.page_content == expected.page_content, (
f"Test Case '{test_case}' Failed at Document {idx}: "
f"Content mismatch.\nExpected: {expected.page_content}"
"\nGot: {doc.page_content}"
)
assert doc.metadata == expected.metadata, (
f"Test Case '{test_case}' Failed at Document {idx}: "
f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
)
@pytest.mark.parametrize(
"headers_to_split_on, html_content, expected_output, test_case",
[
(
# Test Case A: Split on h1 and h2 with h3 in content
[("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
"""
<!DOCTYPE html>
<html>
<body>
<div>
<h1>Foo</h1>
<p>Some intro text about Foo.</p>
<div>
<h2>Bar main section</h2>
<p>Some intro text about Bar.</p>
<h3>Bar subsection 1</h3>
<p>Some text about the first subtopic of Bar.</p>
<h3>Bar subsection 2</h3>
<p>Some text about the second subtopic of Bar.</p>
</div>
<div>
<h2>Baz</h2>
<p>Some text about Baz</p>
</div>
<br>
<p>Some concluding text about Foo</p>
</div>
</body>
</html>
""",
[
Document(metadata={"Header 1": "Foo"}, page_content="Foo"),
Document(
metadata={"Header 1": "Foo"},
page_content="Some intro text about Foo.",
),
Document(
metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
page_content="Bar main section",
),
Document(
metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
page_content="Some intro text about Bar.",
),
Document(
metadata={
"Header 1": "Foo",
"Header 2": "Bar main section",
"Header 3": "Bar subsection 1",
},
page_content="Bar subsection 1",
),
Document(
metadata={
"Header 1": "Foo",
"Header 2": "Bar main section",
"Header 3": "Bar subsection 1",
},
page_content="Some text about the first subtopic of Bar.",
),
Document(
metadata={
"Header 1": "Foo",
"Header 2": "Bar main section",
"Header 3": "Bar subsection 2",
},
page_content="Bar subsection 2",
),
Document(
metadata={
"Header 1": "Foo",
"Header 2": "Bar main section",
"Header 3": "Bar subsection 2",
},
page_content="Some text about the second subtopic of Bar.",
),
Document(
metadata={"Header 1": "Foo", "Header 2": "Baz"}, page_content="Baz"
),
Document(
metadata={"Header 1": "Foo"},
page_content=(
"Some text about Baz \nSome concluding text about Foo"
),
),
],
"Test Case A: Split on h1, h2, and h3 with nested headers",
),
(
# Test Case B: Split on h1 only without any headers
[("h1", "Header 1")],
"""
<html>
<body>
<p>Paragraph one.</p>
<p>Paragraph two.</p>
<p>Paragraph three.</p>
</body>
</html>
""",
[
Document(
metadata={},
page_content="Paragraph one. \nParagraph two. \nParagraph three.",
)
],
"Test Case B: Split on h1 only without any headers",
),
],
)
@pytest.mark.requires("bs4")
def test_additional_html_header_text_splitter(
html_header_splitter_splitter_factory: Any,
headers_to_split_on: List[Tuple[str, str]],
html_content: str,
expected_output: List[Document],
test_case: str,
) -> None:
"""
Test the HTML header text splitter.
Args:
html_header_splitter_splitter_factory (Any): Factory function to create
the HTML header splitter.
headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
html_content (str): HTML content to be split.
expected_output (List[Document]): Expected list of Document objects.
test_case (str): Description of the test case.
Raises:
AssertionError: If the number of documents or their content/metadata
does not match the expected output.
"""
splitter = html_header_splitter_splitter_factory(
headers_to_split_on=headers_to_split_on
)
docs = splitter.split_text(html_content)
assert len(docs) == len(expected_output), (
f"{test_case} Failed: Number of documents mismatch. "
f"Expected {len(expected_output)}, got {len(docs)}."
)
for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
assert doc.page_content == expected.page_content, (
f"{test_case} Failed at Document {idx}: "
f"Content mismatch.\nExpected: {expected.page_content}\n"
"Got: {doc.page_content}"
)
assert doc.metadata == expected.metadata, (
f"{test_case} Failed at Document {idx}: "
f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
)
@pytest.mark.parametrize(
"headers_to_split_on, html_content, expected_output, test_case",
[
(
# Test Case C: Split on h1, h2, and h3 with no headers present
[("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
"""
<html>
<body>
<p>Just some random text without headers.</p>
<div>
<span>More text here.</span>
</div>
</body>
</html>
""",
[
Document(
page_content="Just some random text without headers."
" \nMore text here.",
metadata={},
)
],
"Test Case C: Split on h1, h2, and h3 without any headers",
)
],
)
@pytest.mark.requires("bs4")
def test_html_no_headers_with_multiple_splitters(
html_header_splitter_splitter_factory: Any,
headers_to_split_on: List[Tuple[str, str]],
html_content: str,
expected_output: List[Document],
test_case: str,
) -> None:
"""
Test HTML content splitting without headers using multiple splitters.
Args:
html_header_splitter_splitter_factory (Any): Factory to create the
HTML header splitter.
headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
html_content (str): HTML content to be split.
expected_output (List[Document]): Expected list of Document objects
after splitting.
test_case (str): Description of the test case.
Raises:
AssertionError: If the number of documents or their content/metadata
does not match the expected output.
"""
splitter = html_header_splitter_splitter_factory(
headers_to_split_on=headers_to_split_on
)
docs = splitter.split_text(html_content)
assert len(docs) == len(expected_output), (
f"{test_case} Failed: Number of documents mismatch. "
f"Expected {len(expected_output)}, got {len(docs)}."
)
for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
assert doc.page_content == expected.page_content, (
f"{test_case} Failed at Document {idx}: "
f"Content mismatch.\nExpected: {expected.page_content}\n"
"Got: {doc.page_content}"
)
assert doc.metadata == expected.metadata, (
f"{test_case} Failed at Document {idx}: "
f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
)
def test_split_text_on_tokens() -> None: def test_split_text_on_tokens() -> None:
@ -2480,7 +2907,8 @@ def test_html_splitter_with_custom_extractor() -> None:
expected = [ expected = [
Document( Document(
page_content="This is an iframe: [iframe:http://example.com](http://example.com)", page_content="This is an iframe: "
"[iframe:http://example.com](http://example.com)",
metadata={"Header 1": "Section 1"}, metadata={"Header 1": "Section 1"},
), ),
] ]
@ -2598,7 +3026,7 @@ def test_html_splitter_with_small_chunk_size() -> None:
"""Test HTML splitting with a very small chunk size to validate chunking.""" """Test HTML splitting with a very small chunk size to validate chunking."""
html_content = """ html_content = """
<h1>Section 1</h1> <h1>Section 1</h1>
<p>This is some long text that should be split into multiple chunks due to the <p>This is some long text that should be split into multiple chunks due to the
small chunk size.</p> small chunk size.</p>
""" """
splitter = HTMLSemanticPreservingSplitter( splitter = HTMLSemanticPreservingSplitter(