Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-22 23:00:00 +00:00
text-splitters[minor]: Replace lxml and XSLT with BeautifulSoup in HTMLHeaderTextSplitter for Improved Large HTML File Processing (#27678)
This pull request updates the `HTMLHeaderTextSplitter` by replacing the implementation of its `split_text_from_file` method. The original method used `lxml` and XSLT to process HTML files, which raised `lxml.etree.XSLTApplyError: maxHead` on large HTML documents due to limitations in the XSLT processor. Fixes #13149.

By switching to BeautifulSoup (`bs4`), we achieve:

- **Improved Performance and Reliability:** BeautifulSoup efficiently processes large HTML files without the errors associated with `lxml` and XSLT.
- **Simplified Dependencies:** Removes the dependency on `lxml` and external XSLT files, relying instead on the widely used `beautifulsoup4` library.
- **Maintained Functionality:** The new method replicates the original behavior, ensuring compatibility with existing code and preserving the extraction of content and metadata.

**Issue:** This change addresses the failures users hit when processing large HTML files with the existing `HTMLHeaderTextSplitter` implementation, which raises `lxml.etree.XSLTApplyError: maxHead` on large documents.

**Dependencies:**

- **BeautifulSoup (`beautifulsoup4`):** The `beautifulsoup4` library is now used for parsing HTML content. Installation: `pip install beautifulsoup4`

**Code Changes:**

Updated the `split_text_from_file` method in `HTMLHeaderTextSplitter` as follows:

```python
def split_text_from_file(self, file: Any) -> List[Document]:
    """Split HTML file using BeautifulSoup.

    Args:
        file: HTML file path or file-like object.

    Returns:
        List of Document objects with page_content and metadata.
    """
    import bs4
    from bs4 import BeautifulSoup

    from langchain_core.documents import Document

    # Read the HTML content from the file path or file-like object
    if isinstance(file, str):
        with open(file, "r", encoding="utf-8") as f:
            html_content = f.read()
    else:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract the header tags and their corresponding metadata keys
    headers_to_split_on = [tag[0] for tag in self.headers_to_split_on]
    header_mapping = dict(self.headers_to_split_on)

    documents = []

    # Find the body of the document
    body = soup.body if soup.body else soup

    # Find all header tags in the order they appear
    all_headers = body.find_all(headers_to_split_on)

    # If there's content before the first header, collect it
    first_header = all_headers[0] if all_headers else None
    if first_header:
        pre_header_content = ""
        for elem in first_header.find_all_previous():
            if isinstance(elem, bs4.Tag):
                text = elem.get_text(separator=" ", strip=True)
                if text:
                    pre_header_content = text + " " + pre_header_content
        if pre_header_content.strip():
            documents.append(
                Document(
                    page_content=pre_header_content.strip(),
                    metadata={},  # No metadata since there's no header
                )
            )
    else:
        # If no headers are found, return the whole content
        full_text = body.get_text(separator=" ", strip=True)
        if full_text.strip():
            documents.append(Document(page_content=full_text.strip(), metadata={}))
        return documents

    # Process each header and its associated content
    for header in all_headers:
        current_metadata = {}
        header_name = header.name
        header_text = header.get_text(separator=" ", strip=True)
        current_metadata[header_mapping[header_name]] = header_text

        # Collect sibling elements until the next header tag
        content_elements = []
        for sibling in header.find_next_siblings():
            if sibling.name in headers_to_split_on:
                # Stop at the next header
                break
            if isinstance(sibling, bs4.Tag):
                content_elements.append(sibling)

        # Get the text content of the collected elements
        current_content = ""
        for elem in content_elements:
            text = elem.get_text(separator=" ", strip=True)
            if text:
                current_content += text + " "

        # Create a Document if there is content
        if current_content.strip():
            documents.append(
                Document(
                    page_content=current_content.strip(),
                    metadata=current_metadata.copy(),
                )
            )
        else:
            # No content under this header, but keep the header metadata
            documents.append(
                Document(page_content="", metadata=current_metadata.copy())
            )

    return documents
```

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
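For context, here is a minimal usage sketch of the splitter after this change. It is an illustration, not part of the diff: it assumes `langchain-text-splitters` and `beautifulsoup4` are installed, and the sample HTML is invented.

```python
# Minimal sketch: exercising the bs4-based splitter on a small document.
from langchain_text_splitters import HTMLHeaderTextSplitter

splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
)

html = """
<h1>Intro</h1>
<p>Welcome to the intro.</p>
<h2>Details</h2>
<p>Some details.</p>
"""

# No XSLT step is involved anymore, so large inputs no longer raise
# lxml.etree.XSLTApplyError: maxHead.
for doc in splitter.split_text(html):
    print(doc.metadata, "->", doc.page_content)
```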
This commit is contained in:
parent 989eec4b7b
commit d3ed9b86be
The first changed file is the `HTMLHeaderTextSplitter` implementation in `langchain_text_splitters`:

```diff
@@ -3,7 +3,7 @@ from __future__ import annotations
 import copy
 import pathlib
 import re
-from io import BytesIO, StringIO
+from io import StringIO
 from typing import (
     Any,
     Callable,
@@ -34,148 +34,291 @@ class ElementType(TypedDict):


 class HTMLHeaderTextSplitter:
-    """Splitting HTML files based on specified headers.
+    """Split HTML content into structured Documents based on specified headers.

-    Requires lxml package.
+    Splits HTML content by detecting specified header tags (e.g., <h1>, <h2>) and
+    creating hierarchical Document objects that reflect the semantic structure
+    of the original content. For each identified section, the splitter associates
+    the extracted text with metadata corresponding to the encountered headers.
+
+    If no specified headers are found, the entire content is returned as a single
+    Document. This allows for flexible handling of HTML input, ensuring that
+    information is organized according to its semantic headers.
+
+    The splitter provides the option to return each HTML element as a separate
+    Document or aggregate them into semantically meaningful chunks. It also
+    gracefully handles multiple levels of nested headers, creating a rich,
+    hierarchical representation of the content.
+
+    Args:
+        headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag,
+            header_name) pairs representing the headers that define splitting
+            boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
+            will split content by <h1> and <h2> tags, assigning their textual
+            content to the Document metadata.
+        return_each_element (bool): If True, every HTML element encountered
+            (including headers, paragraphs, etc.) is returned as a separate
+            Document. If False, content under the same header hierarchy is
+            aggregated into fewer Documents.
+
+    Returns:
+        List[Document]: A list of Document objects. Each Document contains
+        `page_content` holding the extracted text and `metadata` that maps
+        the header hierarchy to their corresponding titles.
+
+    Example:
+        .. code-block:: python
+
+            from langchain_text_splitters.html_header_text_splitter import (
+                HTMLHeaderTextSplitter,
+            )
+
+            # Define headers for splitting on h1 and h2 tags.
+            headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]
+
+            splitter = HTMLHeaderTextSplitter(
+                headers_to_split_on=headers_to_split_on,
+                return_each_element=False
+            )
+
+            html_content = \"\"\"
+            <html>
+              <body>
+                <h1>Introduction</h1>
+                <p>Welcome to the introduction section.</p>
+                <h2>Background</h2>
+                <p>Some background details here.</p>
+                <h1>Conclusion</h1>
+                <p>Final thoughts.</p>
+              </body>
+            </html>
+            \"\"\"
+
+            documents = splitter.split_text(html_content)
+
+            # 'documents' now contains Document objects reflecting the hierarchy:
+            # - Document with metadata={"Main Topic": "Introduction"} and
+            #   content="Introduction"
+            # - Document with metadata={"Main Topic": "Introduction"} and
+            #   content="Welcome to the introduction section."
+            # - Document with metadata={"Main Topic": "Introduction",
+            #   "Sub Topic": "Background"} and content="Background"
+            # - Document with metadata={"Main Topic": "Introduction",
+            #   "Sub Topic": "Background"} and content="Some background details here."
+            # - Document with metadata={"Main Topic": "Conclusion"} and
+            #   content="Conclusion"
+            # - Document with metadata={"Main Topic": "Conclusion"} and
+            #   content="Final thoughts."
     """

     def __init__(
         self,
         headers_to_split_on: List[Tuple[str, str]],
         return_each_element: bool = False,
-    ):
-        """Create a new HTMLHeaderTextSplitter.
+    ) -> None:
+        """Initialize with headers to split on.

         Args:
-            headers_to_split_on: list of tuples of headers we want to track mapped to
-                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
-                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2)].
-            return_each_element: Return each element w/ associated headers.
+            headers_to_split_on: A list of tuples where
+                each tuple contains a header tag and its corresponding value.
+            return_each_element: Whether to return each HTML
+                element as a separate Document. Defaults to False.
         """
-        # Output element-by-element or aggregated into chunks w/ common headers
+        self.headers_to_split_on = sorted(
+            headers_to_split_on, key=lambda x: int(x[0][1])
+        )
+        self.header_mapping = dict(self.headers_to_split_on)
+        self.header_tags = [tag for tag, _ in self.headers_to_split_on]
         self.return_each_element = return_each_element
-        self.headers_to_split_on = sorted(headers_to_split_on)

-    def aggregate_elements_to_chunks(
-        self, elements: List[ElementType]
-    ) -> List[Document]:
-        """Combine elements with common metadata into chunks.
-
-        Args:
-            elements: HTML element content with associated identifying info and metadata
-        """
-        aggregated_chunks: List[ElementType] = []
-
-        for element in elements:
-            if (
-                aggregated_chunks
-                and aggregated_chunks[-1]["metadata"] == element["metadata"]
-            ):
-                # If the last element in the aggregated list
-                # has the same metadata as the current element,
-                # append the current content to the last element's content
-                aggregated_chunks[-1]["content"] += " \n" + element["content"]
-            else:
-                # Otherwise, append the current element to the aggregated list
-                aggregated_chunks.append(element)
-
-        return [
-            Document(page_content=chunk["content"], metadata=chunk["metadata"])
-            for chunk in aggregated_chunks
-        ]
-
-    def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
-        """Split HTML from web URL.
-
-        Args:
-            url: web URL
-            **kwargs: Arbitrary additional keyword arguments. These are usually passed
-                to the fetch url content request.
-        """
-        r = requests.get(url, **kwargs)
-        return self.split_text_from_file(BytesIO(r.content))
-
     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string.
+        """Split the given text into a list of Document objects.

         Args:
-            text: HTML text
+            text: The HTML text to split.
+
+        Returns:
+            A list of split Document objects.
         """
         return self.split_text_from_file(StringIO(text))

-    def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file.
+    def split_text_from_url(
+        self, url: str, timeout: int = 10, **kwargs: Any
+    ) -> List[Document]:
+        """Fetch text content from a URL and split it into documents.

         Args:
-            file: HTML file
+            url: The URL to fetch content from.
+            timeout: Timeout for the request. Defaults to 10.
+            **kwargs: Additional keyword arguments for the request.
+
+        Returns:
+            A list of split Document objects.
+
+        Raises:
+            requests.RequestException: If the HTTP request fails.
         """
+        kwargs.setdefault("timeout", timeout)
+        response = requests.get(url, **kwargs)
+        response.raise_for_status()
+        return self.split_text(response.text)
+
+    def _header_level(self, tag_name: str) -> int:
+        """Determine the heading level of a tag."""
+        if tag_name.lower() in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+            return int(tag_name[1])
+        # Returns high level if it isn't a header
+        return 9999
+
+    def _dom_depth(self, element: Any) -> int:
+        """Determine the DOM depth of an element by counting its parents."""
+        depth = 0
+        for _ in element.parents:
+            depth += 1
+        return depth
+
+    def _get_elements(self, html_content: str) -> List[Any]:
+        """Parse HTML content and return a list of BeautifulSoup elements.
+
+        This helper function takes HTML content as input,
+        parses it using BeautifulSoup4, and returns all HTML elements
+        found in the document body. If no body tag exists,
+        it returns all elements in the full document.
+
+        Args:
+            html_content: Raw HTML content to be parsed.
+
+        Returns:
+            List[Any]: A list of BeautifulSoup elements found in the HTML document.
+
+        Raises:
+            ImportError: If the BeautifulSoup4 package is not installed.
+        """
         try:
-            from lxml import etree
+            from bs4 import BeautifulSoup  # type: ignore[import-untyped]
         except ImportError as e:
             raise ImportError(
-                "Unable to import lxml, please install with `pip install lxml`."
+                "Unable to import BeautifulSoup/PageElement, \
+                    please install with `pip install \
+                        bs4`."
             ) from e
-        # use lxml library to parse html document and return xml ElementTree
-        # Explicitly encoding in utf-8 allows non-English
-        # html files to be processed without garbled characters
-        parser = etree.HTMLParser(encoding="utf-8")
-        tree = etree.parse(file, parser)
+        soup = BeautifulSoup(html_content, "html.parser")
+        body = soup.body if soup.body else soup
+        return body.find_all()

-        # document transformation for "structure-aware" chunking is handled with xsl.
-        # see comments in html_chunks_with_headers.xslt for more detailed information.
-        xslt_path = pathlib.Path(__file__).parent / "xsl/html_chunks_with_headers.xslt"
-        xslt_tree = etree.parse(xslt_path)
-        transform = etree.XSLT(xslt_tree)
-        result = transform(tree)
-        result_dom = etree.fromstring(str(result))
+    def split_text_from_file(self, file: Any) -> List[Document]:
+        """Split HTML content from a file into a list of Document objects.

-        # create filter and mapping for header metadata
-        header_filter = [header[0] for header in self.headers_to_split_on]
-        header_mapping = dict(self.headers_to_split_on)
+        Args:
+            file: A file path or a file-like object containing HTML content.

-        # map xhtml namespace prefix
-        ns_map = {"h": "http://www.w3.org/1999/xhtml"}
+        Returns:
+            A list of split Document objects.
+        """
+        if isinstance(file, str):
+            with open(file, "r", encoding="utf-8") as f:
+                html_content = f.read()
+        else:
+            html_content = file.read()
+        elements = self._get_elements(html_content)
+        documents: List[Document] = []
+        active_headers: Dict[str, Tuple[str, int, int]] = {}
+        current_chunk: List[str] = []
+        chunk_dom_depth = 0

-        # build list of elements from DOM
-        elements = []
-        for element in result_dom.findall("*//*", ns_map):
-            if element.findall("*[@class='headers']") or element.findall(
-                "*[@class='chunk']"
-            ):
-                elements.append(
-                    ElementType(
-                        url=file,
-                        xpath="".join(
-                            [
-                                node.text or ""
-                                for node in element.findall("*[@class='xpath']", ns_map)
-                            ]
-                        ),
-                        content="".join(
-                            [
-                                node.text or ""
-                                for node in element.findall("*[@class='chunk']", ns_map)
-                            ]
-                        ),
-                        metadata={
-                            # Add text of specified headers to metadata using header
-                            # mapping.
-                            header_mapping[node.tag]: node.text or ""
-                            for node in filter(
-                                lambda x: x.tag in header_filter,
-                                element.findall("*[@class='headers']/*", ns_map),
-                            )
-                        },
-                    )
-                )
+        def finalize_chunk() -> None:
+            if current_chunk:
+                final_meta = {
+                    key: content
+                    for key, (content, level, dom_depth) in active_headers.items()
+                    if chunk_dom_depth >= dom_depth
+                }
+                combined_text = " \n".join(
+                    line for line in current_chunk if line.strip()
+                )
+                if combined_text.strip():
+                    documents.append(
+                        Document(page_content=combined_text, metadata=final_meta)
+                    )
+                current_chunk.clear()

+        for element in elements:
+            tag = element.name
+            if not tag:
+                continue
+            text = " ".join(
+                t
+                for t in element.find_all(string=True, recursive=False)
+                if isinstance(t, str)
+            ).strip()
+            if not text:
+                continue
+
+            level = self._header_level(tag)
+            dom_depth = self._dom_depth(element)
+
+            if tag in self.header_tags:
+                if not self.return_each_element:
+                    finalize_chunk()
+
+                # Remove headers at same or deeper level
+                headers_to_remove = [
+                    key for key, (_, lvl, _) in active_headers.items() if lvl >= level
+                ]
+                for key in headers_to_remove:
+                    del active_headers[key]
+
+                header_key = self.header_mapping[tag]
+                active_headers[header_key] = (text, level, dom_depth)
+
+                # Produce a document for the header itself
+                header_meta = {
+                    key: content
+                    for key, (content, lvl, dd) in active_headers.items()
+                    if dom_depth >= dd
+                }
+                documents.append(Document(page_content=text, metadata=header_meta))
+                # After encountering a header,
+                # no immediate content goes to current_chunk
+                # (if return_each_element is False, we wait for next content)
+                # (if return_each_element is True, we create docs per element anyway)
+            else:
+                # Non-header element logic
+                # Remove headers that don't apply if dom_depth < their dom_depth
+                headers_to_remove = [
+                    key for key, (_, _, dd) in active_headers.items() if dom_depth < dd
+                ]
+                for key in headers_to_remove:
+                    del active_headers[key]
+
+                if self.return_each_element:
+                    # Produce a doc for this element immediately
+                    element_meta = {
+                        key: content
+                        for key, (content, lvl, dd) in active_headers.items()
+                        if dom_depth >= dd
+                    }
+                    if text.strip():
+                        documents.append(
+                            Document(page_content=text, metadata=element_meta)
+                        )
+                else:
+                    # Accumulate content in current_chunk
+                    if text.strip():
+                        current_chunk.append(text)
+                        chunk_dom_depth = max(chunk_dom_depth, dom_depth)

-        if not self.return_each_element:
-            return self.aggregate_elements_to_chunks(elements)
-        else:
-            return [
-                Document(page_content=chunk["content"], metadata=chunk["metadata"])
-                for chunk in elements
-            ]
+        # finalize any remaining chunk
+        finalize_chunk()
+
+        # If no headers were found at all and return_each_element=False, behavior is:
+        # The entire content should be in one document.
+        # The logic above naturally handles it:
+        # If no recognized headers, we never split; we ended up just accumulating text
+        # in current_chunk and finalizing once at the end.
+
+        return documents


 class HTMLSectionSplitter:
@@ -269,7 +412,10 @@ class HTMLSectionSplitter:
               - 'tag_name': The name of the header tag (e.g., "h1", "h2").
         """
         try:
-            from bs4 import BeautifulSoup, PageElement  # type: ignore[import-untyped]
+            from bs4 import (
+                BeautifulSoup,  # type: ignore[import-untyped]
+                PageElement,
+            )
         except ImportError as e:
             raise ImportError(
                 "Unable to import BeautifulSoup/PageElement, \
@@ -343,10 +489,13 @@ class HTMLSectionSplitter:
         return str(result)

     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file.
+        """Split HTML content from a file into a list of Document objects.

         Args:
-            file: HTML file
+            file: A file path or a file-like object containing HTML content.
+
+        Returns:
+            A list of split Document objects.
         """
         file_content = file.getvalue()
         file_content = self.convert_possible_tags_to_header(file_content)
@@ -844,3 +993,6 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
         for placeholder, preserved_content in preserved_elements.items():
             content = content.replace(placeholder, preserved_content.strip())
         return content
+
+
+# %%
```
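To make the two output modes concrete, a small sketch (same assumptions as the example above) contrasting aggregated output with `return_each_element=True`:

```python
# Sketch: aggregated chunks vs. one Document per element.
from langchain_text_splitters import HTMLHeaderTextSplitter

html = "<h1>Topic</h1><p>First paragraph.</p><p>Second paragraph.</p>"
headers = [("h1", "Header 1")]

aggregated = HTMLHeaderTextSplitter(headers_to_split_on=headers).split_text(html)
per_element = HTMLHeaderTextSplitter(
    headers_to_split_on=headers, return_each_element=True
).split_text(html)

# Both modes emit the header itself as a Document; aggregated mode then merges
# the two sibling paragraphs into one chunk, while per-element mode keeps them
# as separate Documents.
print(len(aggregated), len(per_element))  # expected: 2 3
```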
The second changed file is the text-splitters unit test suite:

```diff
@@ -4,7 +4,7 @@ import random
 import re
 import string
 from pathlib import Path
-from typing import Any, List
+from typing import Any, Callable, List, Tuple

 import pytest
 from langchain_core.documents import Document
@@ -2039,49 +2039,476 @@ def test_haskell_code_splitter() -> None:
     assert chunks == expected_chunks


-@pytest.mark.requires("lxml")
-def test_html_header_text_splitter(tmp_path: Path) -> None:
-    splitter = HTMLHeaderTextSplitter(
-        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
-    )
-
-    content = """
-    <h1>Sample Document</h1>
-    <h2>Section</h2>
-    <p id="1234">Reference content.</p>
-
-    <h2>Lists</h2>
-    <ul>
-        <li>Item 1</li>
-        <li>Item 2</li>
-        <li>Item 3</li>
-    </ul>
-
-    <h3>A block</h3>
-    <div class="amazing">
-        <p>Some text</p>
-        <p>Some more text</p>
-    </div>
-    """
+@pytest.fixture
+@pytest.mark.requires("bs4")
+def html_header_splitter_splitter_factory() -> (
+    Callable[[List[Tuple[str, str]]], HTMLHeaderTextSplitter]
+):
+    """
+    Fixture to create an HTMLHeaderTextSplitter instance with given headers.
+    This factory allows dynamic creation of splitters with different headers.
+    """

-    docs = splitter.split_text(content)
-    expected = [
-        Document(
-            page_content="Reference content.",
-            metadata={"Header 1": "Sample Document", "Header 2": "Section"},
-        ),
-        Document(
-            page_content="Item 1 Item 2 Item 3 \nSome text \nSome more text",
-            metadata={"Header 1": "Sample Document", "Header 2": "Lists"},
-        ),
-    ]
-    assert docs == expected
+    def _create_splitter(
+        headers_to_split_on: List[Tuple[str, str]],
+    ) -> HTMLHeaderTextSplitter:
+        return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

-    with open(tmp_path / "doc.html", "w") as tmp:
-        tmp.write(content)
-    docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
+    return _create_splitter

-    assert docs_from_file == expected

+@pytest.mark.parametrize(
+    "headers_to_split_on, html_input, expected_documents, test_case",
+    [
+        (
+            # Test Case 1: Split on h1 and h2
+            [("h1", "Header 1"), ("h2", "Header 2")],
+            """
+            <html>
+                <body>
+                    <h1>Introduction</h1>
+                    <p>This is the introduction.</p>
+                    <h2>Background</h2>
+                    <p>Background information.</p>
+                    <h1>Conclusion</h1>
+                    <p>Final thoughts.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Introduction", metadata={"Header 1": "Introduction"}
+                ),
+                Document(
+                    page_content="This is the introduction.",
+                    metadata={"Header 1": "Introduction"},
+                ),
+                Document(
+                    page_content="Background",
+                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
+                ),
+                Document(
+                    page_content="Background information.",
+                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
+                ),
+                Document(
+                    page_content="Conclusion", metadata={"Header 1": "Conclusion"}
+                ),
+                Document(
+                    page_content="Final thoughts.", metadata={"Header 1": "Conclusion"}
+                ),
+            ],
+            "Simple headers and paragraphs",
+        ),
+        (
+            # Test Case 2: Nested headers with h1, h2, and h3
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <html>
+                <body>
+                    <div>
+                        <h1>Main Title</h1>
+                        <div>
+                            <h2>Subsection</h2>
+                            <p>Details of subsection.</p>
+                            <div>
+                                <h3>Sub-subsection</h3>
+                                <p>More details.</p>
+                            </div>
+                        </div>
+                    </div>
+                    <h1>Another Main Title</h1>
+                    <p>Content under another main title.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Main Title", metadata={"Header 1": "Main Title"}
+                ),
+                Document(
+                    page_content="Subsection",
+                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
+                ),
+                Document(
+                    page_content="Details of subsection.",
+                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
+                ),
+                Document(
+                    page_content="Sub-subsection",
+                    metadata={
+                        "Header 1": "Main Title",
+                        "Header 2": "Subsection",
+                        "Header 3": "Sub-subsection",
+                    },
+                ),
+                Document(
+                    page_content="More details.",
+                    metadata={
+                        "Header 1": "Main Title",
+                        "Header 2": "Subsection",
+                        "Header 3": "Sub-subsection",
+                    },
+                ),
+                Document(
+                    page_content="Another Main Title",
+                    metadata={"Header 1": "Another Main Title"},
+                ),
+                Document(
+                    page_content="Content under another main title.",
+                    metadata={"Header 1": "Another Main Title"},
+                ),
+            ],
+            "Nested headers with h1, h2, and h3",
+        ),
+        (
+            # Test Case 3: No headers
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <p>Paragraph one.</p>
+                    <p>Paragraph two.</p>
+                    <div>
+                        <p>Paragraph three.</p>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Paragraph one. \nParagraph two. \nParagraph three.",
+                    metadata={},
+                )
+            ],
+            "No headers present",
+        ),
+        (
+            # Test Case 4: Multiple headers of the same level
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <h1>Chapter 1</h1>
+                    <p>Content of chapter 1.</p>
+                    <h1>Chapter 2</h1>
+                    <p>Content of chapter 2.</p>
+                    <h1>Chapter 3</h1>
+                    <p>Content of chapter 3.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(page_content="Chapter 1", metadata={"Header 1": "Chapter 1"}),
+                Document(
+                    page_content="Content of chapter 1.",
+                    metadata={"Header 1": "Chapter 1"},
+                ),
+                Document(page_content="Chapter 2", metadata={"Header 1": "Chapter 2"}),
+                Document(
+                    page_content="Content of chapter 2.",
+                    metadata={"Header 1": "Chapter 2"},
+                ),
+                Document(page_content="Chapter 3", metadata={"Header 1": "Chapter 3"}),
+                Document(
+                    page_content="Content of chapter 3.",
+                    metadata={"Header 1": "Chapter 3"},
+                ),
+            ],
+            "Multiple headers of the same level",
+        ),
+        (
+            # Test Case 5: Headers with no content
+            [("h1", "Header 1"), ("h2", "Header 2")],
+            """
+            <html>
+                <body>
+                    <h1>Header 1</h1>
+                    <h2>Header 2</h2>
+                    <h1>Header 3</h1>
+                </body>
+            </html>
+            """,
+            [
+                Document(page_content="Header 1", metadata={"Header 1": "Header 1"}),
+                Document(
+                    page_content="Header 2",
+                    metadata={"Header 1": "Header 1", "Header 2": "Header 2"},
+                ),
+                Document(page_content="Header 3", metadata={"Header 1": "Header 3"}),
+            ],
+            "Headers with no associated content",
+        ),
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_html_header_text_splitter(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_input: str,
+    expected_documents: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test the HTML header text splitter.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory function to create
+            the HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_input (str): The HTML input string to be split.
+        expected_documents (List[Document]): List of expected Document objects.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected values.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_input)
+
+    assert len(docs) == len(expected_documents), (
+        f"Test Case '{test_case}' Failed: Number of documents mismatch. "
+        f"Expected {len(expected_documents)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_documents), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"Test Case '{test_case}' Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"Test Case '{test_case}' Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_content, expected_output, test_case",
+    [
+        (
+            # Test Case A: Split on h1 and h2 with h3 in content
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <!DOCTYPE html>
+            <html>
+                <body>
+                    <div>
+                        <h1>Foo</h1>
+                        <p>Some intro text about Foo.</p>
+                        <div>
+                            <h2>Bar main section</h2>
+                            <p>Some intro text about Bar.</p>
+                            <h3>Bar subsection 1</h3>
+                            <p>Some text about the first subtopic of Bar.</p>
+                            <h3>Bar subsection 2</h3>
+                            <p>Some text about the second subtopic of Bar.</p>
+                        </div>
+                        <div>
+                            <h2>Baz</h2>
+                            <p>Some text about Baz</p>
+                        </div>
+                        <br>
+                        <p>Some concluding text about Foo</p>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(metadata={"Header 1": "Foo"}, page_content="Foo"),
+                Document(
+                    metadata={"Header 1": "Foo"},
+                    page_content="Some intro text about Foo.",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
+                    page_content="Bar main section",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
+                    page_content="Some intro text about Bar.",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 1",
+                    },
+                    page_content="Bar subsection 1",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 1",
+                    },
+                    page_content="Some text about the first subtopic of Bar.",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 2",
+                    },
+                    page_content="Bar subsection 2",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 2",
+                    },
+                    page_content="Some text about the second subtopic of Bar.",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Baz"}, page_content="Baz"
+                ),
+                Document(
+                    metadata={"Header 1": "Foo"},
+                    page_content=(
+                        "Some text about Baz \nSome concluding text about Foo"
+                    ),
+                ),
+            ],
+            "Test Case A: Split on h1, h2, and h3 with nested headers",
+        ),
+        (
+            # Test Case B: Split on h1 only without any headers
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <p>Paragraph one.</p>
+                    <p>Paragraph two.</p>
+                    <p>Paragraph three.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    metadata={},
+                    page_content="Paragraph one. \nParagraph two. \nParagraph three.",
+                )
+            ],
+            "Test Case B: Split on h1 only without any headers",
+        ),
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_additional_html_header_text_splitter(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_content: str,
+    expected_output: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test the HTML header text splitter.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory function to create
+            the HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_content (str): HTML content to be split.
+        expected_output (List[Document]): Expected list of Document objects.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected output.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_content)
+
+    assert len(docs) == len(expected_output), (
+        f"{test_case} Failed: Number of documents mismatch. "
+        f"Expected {len(expected_output)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_content, expected_output, test_case",
+    [
+        (
+            # Test Case C: Split on h1, h2, and h3 with no headers present
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <html>
+                <body>
+                    <p>Just some random text without headers.</p>
+                    <div>
+                        <span>More text here.</span>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Just some random text without headers."
+                    " \nMore text here.",
+                    metadata={},
+                )
+            ],
+            "Test Case C: Split on h1, h2, and h3 without any headers",
+        )
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_html_no_headers_with_multiple_splitters(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_content: str,
+    expected_output: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test HTML content splitting without headers using multiple splitters.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory to create the
+            HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_content (str): HTML content to be split.
+        expected_output (List[Document]): Expected list of Document objects
+            after splitting.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected output.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_content)
+
+    assert len(docs) == len(expected_output), (
+        f"{test_case} Failed: Number of documents mismatch. "
+        f"Expected {len(expected_output)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )


 def test_split_text_on_tokens() -> None:
@@ -2480,7 +2907,8 @@ def test_html_splitter_with_custom_extractor() -> None:

     expected = [
         Document(
-            page_content="This is an iframe: [iframe:http://example.com](http://example.com)",
+            page_content="This is an iframe: "
+            "[iframe:http://example.com](http://example.com)",
             metadata={"Header 1": "Section 1"},
         ),
     ]
@@ -2598,7 +3026,7 @@ def test_html_splitter_with_small_chunk_size() -> None:
     """Test HTML splitting with a very small chunk size to validate chunking."""
     html_content = """
    <h1>Section 1</h1>
-    <p>This is some long text that should be split into multiple chunks due to the
+    <p>This is some long text that should be split into multiple chunks due to the
    small chunk size.</p>
    """
     splitter = HTMLSemanticPreservingSplitter(
```