text-splitters[minor]: Replace lxml and XSLT with BeautifulSoup in HTMLHeaderTextSplitter for Improved Large HTML File Processing (#27678)

This pull request updates the `HTMLHeaderTextSplitter` by replacing the
`split_text_from_file` method's implementation. The original method used
`lxml` and XSLT for processing HTML files, which raised
`lxml.etree.XSLTApplyError: maxHead` when handling large HTML documents
due to limitations in the XSLT processor. Fixes #13149

By switching to BeautifulSoup (`bs4`), we achieve:

- **Improved Performance and Reliability:** BeautifulSoup efficiently
processes large HTML files without the errors associated with `lxml` and
XSLT.
- **Simplified Dependencies:** Removes the dependency on `lxml` and
external XSLT files, relying instead on the widely used `beautifulsoup4`
library.
- **Maintained Functionality:** The new method replicates the original
behavior, ensuring compatibility with existing code and preserving the
extraction of content and metadata.

**Issue:**

This change addresses issues related to processing large HTML files with
the existing `HTMLHeaderTextSplitter` implementation. It resolves
problems where users encounter `lxml.etree.XSLTApplyError: maxHead` when
splitting large HTML documents.

**Dependencies:**

- **BeautifulSoup (`beautifulsoup4`):** The `beautifulsoup4` library is
now used for parsing HTML content.
  - Installation: `pip install beautifulsoup4`

**Code Changes:**

Updated the `split_text_from_file` method in `HTMLHeaderTextSplitter` as
follows:

```python
def split_text_from_file(self, file: Any) -> List[Document]:
    """Split an HTML file into header-scoped documents using BeautifulSoup.

    Args:
        file: Path to an HTML file (``str`` or ``os.PathLike``) or a
            file-like object exposing ``read()``.

    Returns:
        A list of ``Document`` objects. Text that precedes the first
        configured header comes first, with empty metadata. It is followed
        by one document per configured header, holding the text of the
        sibling elements after that header (up to the next configured
        header among those siblings) and the header text as metadata. A
        header without following text still yields a document with empty
        ``page_content``. When no configured header is present, the whole
        body text is returned as a single document.
    """
    import os

    import bs4
    from langchain.docstore.document import Document

    # Accept pathlib.Path (and any other os.PathLike) as well as plain
    # strings; anything else is treated as a file-like object.
    if isinstance(file, (str, os.PathLike)):
        with open(file, encoding="utf-8") as f:
            html_content = f.read()
    else:
        html_content = file.read()

    soup = bs4.BeautifulSoup(html_content, "html.parser")

    # Tag names to split on, and the metadata key each one maps to.
    headers_to_split_on = [tag for tag, _ in self.headers_to_split_on]
    header_mapping = dict(self.headers_to_split_on)

    # Fall back to the whole tree when the document has no <body>.
    body = soup.body if soup.body else soup

    # All configured headers, in document order.
    all_headers = body.find_all(headers_to_split_on)

    if not all_headers:
        # No headers at all: return the whole content as one document.
        full_text = body.get_text(separator=" ", strip=True)
        if not full_text:
            return []
        return [Document(page_content=full_text, metadata={})]

    documents = []

    # Collect the text that appears before the first header. Only plain text
    # nodes are gathered: calling get_text() on every preceding tag would
    # repeat text once per enclosing tag and, through ancestors such as
    # <body>, also pull in text located after the header.
    first_header = all_headers[0]
    pre_header_parts = []
    for node in body.descendants:
        if node is first_header:
            break
        if type(node) is bs4.NavigableString:
            text = node.strip()
            if text:
                pre_header_parts.append(text)
    if pre_header_parts:
        documents.append(
            Document(
                page_content=" ".join(pre_header_parts),
                metadata={},  # No metadata since there's no header
            )
        )

    # Process each header and its associated content.
    for header in all_headers:
        header_text = header.get_text(separator=" ", strip=True)
        metadata = {header_mapping[header.name]: header_text}

        # Gather the following sibling elements, stopping at the next
        # configured header (whatever its level) among those siblings.
        section_parts = []
        for sibling in header.find_next_siblings():
            if sibling.name in headers_to_split_on:
                break
            if not isinstance(sibling, bs4.Tag):
                continue
            text = sibling.get_text(separator=" ", strip=True)
            if text:
                section_parts.append(text)

        # A header with no following text still produces a document so that
        # its metadata is not lost; page_content is then the empty string.
        documents.append(
            Document(page_content=" ".join(section_parts), metadata=metadata)
        )

    return documents
```

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Ahmed Tammaa
2025-01-20 23:10:37 +02:00
committed by GitHub
parent 989eec4b7b
commit d3ed9b86be
2 changed files with 736 additions and 156 deletions

View File

@@ -4,7 +4,7 @@ import random
import re
import string
from pathlib import Path
from typing import Any, List
from typing import Any, Callable, List, Tuple
import pytest
from langchain_core.documents import Document
@@ -2039,49 +2039,476 @@ def test_haskell_code_splitter() -> None:
assert chunks == expected_chunks
@pytest.mark.requires("lxml")
def test_html_header_text_splitter(tmp_path: Path) -> None:
splitter = HTMLHeaderTextSplitter(
headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
)
content = """
<h1>Sample Document</h1>
<h2>Section</h2>
<p id="1234">Reference content.</p>
<h2>Lists</h2>
<ul>
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</ul>
<h3>A block</h3>
<div class="amazing">
<p>Some text</p>
<p>Some more text</p>
</div>
@pytest.fixture
@pytest.mark.requires("bs4")
def html_header_splitter_splitter_factory() -> (
Callable[[List[Tuple[str, str]]], HTMLHeaderTextSplitter]
):
"""
Fixture to create an HTMLHeaderTextSplitter instance with given headers.
This factory allows dynamic creation of splitters with different headers.
"""
docs = splitter.split_text(content)
expected = [
Document(
page_content="Reference content.",
metadata={"Header 1": "Sample Document", "Header 2": "Section"},
),
Document(
page_content="Item 1 Item 2 Item 3 \nSome text \nSome more text",
metadata={"Header 1": "Sample Document", "Header 2": "Lists"},
),
]
assert docs == expected
def _create_splitter(
headers_to_split_on: List[Tuple[str, str]],
) -> HTMLHeaderTextSplitter:
return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
with open(tmp_path / "doc.html", "w") as tmp:
tmp.write(content)
docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
return _create_splitter
assert docs_from_file == expected
@pytest.mark.parametrize(
    "headers_to_split_on, html_input, expected_documents, test_case",
    [
        (
            # Test Case 1: Split on h1 and h2
            [("h1", "Header 1"), ("h2", "Header 2")],
            """
<html>
<body>
<h1>Introduction</h1>
<p>This is the introduction.</p>
<h2>Background</h2>
<p>Background information.</p>
<h1>Conclusion</h1>
<p>Final thoughts.</p>
</body>
</html>
""",
            [
                Document(
                    page_content="Introduction", metadata={"Header 1": "Introduction"}
                ),
                Document(
                    page_content="This is the introduction.",
                    metadata={"Header 1": "Introduction"},
                ),
                Document(
                    page_content="Background",
                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
                ),
                Document(
                    page_content="Background information.",
                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
                ),
                Document(
                    page_content="Conclusion", metadata={"Header 1": "Conclusion"}
                ),
                Document(
                    page_content="Final thoughts.", metadata={"Header 1": "Conclusion"}
                ),
            ],
            "Simple headers and paragraphs",
        ),
        (
            # Test Case 2: Nested headers with h1, h2, and h3
            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
            """
<html>
<body>
<div>
<h1>Main Title</h1>
<div>
<h2>Subsection</h2>
<p>Details of subsection.</p>
<div>
<h3>Sub-subsection</h3>
<p>More details.</p>
</div>
</div>
</div>
<h1>Another Main Title</h1>
<p>Content under another main title.</p>
</body>
</html>
""",
            [
                Document(
                    page_content="Main Title", metadata={"Header 1": "Main Title"}
                ),
                Document(
                    page_content="Subsection",
                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
                ),
                Document(
                    page_content="Details of subsection.",
                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
                ),
                Document(
                    page_content="Sub-subsection",
                    metadata={
                        "Header 1": "Main Title",
                        "Header 2": "Subsection",
                        "Header 3": "Sub-subsection",
                    },
                ),
                Document(
                    page_content="More details.",
                    metadata={
                        "Header 1": "Main Title",
                        "Header 2": "Subsection",
                        "Header 3": "Sub-subsection",
                    },
                ),
                Document(
                    page_content="Another Main Title",
                    metadata={"Header 1": "Another Main Title"},
                ),
                Document(
                    page_content="Content under another main title.",
                    metadata={"Header 1": "Another Main Title"},
                ),
            ],
            "Nested headers with h1, h2, and h3",
        ),
        (
            # Test Case 3: No headers
            [("h1", "Header 1")],
            """
<html>
<body>
<p>Paragraph one.</p>
<p>Paragraph two.</p>
<div>
<p>Paragraph three.</p>
</div>
</body>
</html>
""",
            [
                Document(
                    page_content="Paragraph one. \nParagraph two. \nParagraph three.",
                    metadata={},
                )
            ],
            "No headers present",
        ),
        (
            # Test Case 4: Multiple headers of the same level
            [("h1", "Header 1")],
            """
<html>
<body>
<h1>Chapter 1</h1>
<p>Content of chapter 1.</p>
<h1>Chapter 2</h1>
<p>Content of chapter 2.</p>
<h1>Chapter 3</h1>
<p>Content of chapter 3.</p>
</body>
</html>
""",
            [
                Document(page_content="Chapter 1", metadata={"Header 1": "Chapter 1"}),
                Document(
                    page_content="Content of chapter 1.",
                    metadata={"Header 1": "Chapter 1"},
                ),
                Document(page_content="Chapter 2", metadata={"Header 1": "Chapter 2"}),
                Document(
                    page_content="Content of chapter 2.",
                    metadata={"Header 1": "Chapter 2"},
                ),
                Document(page_content="Chapter 3", metadata={"Header 1": "Chapter 3"}),
                Document(
                    page_content="Content of chapter 3.",
                    metadata={"Header 1": "Chapter 3"},
                ),
            ],
            "Multiple headers of the same level",
        ),
        (
            # Test Case 5: Headers with no content
            [("h1", "Header 1"), ("h2", "Header 2")],
            """
<html>
<body>
<h1>Header 1</h1>
<h2>Header 2</h2>
<h1>Header 3</h1>
</body>
</html>
""",
            [
                Document(page_content="Header 1", metadata={"Header 1": "Header 1"}),
                Document(
                    page_content="Header 2",
                    metadata={"Header 1": "Header 1", "Header 2": "Header 2"},
                ),
                Document(page_content="Header 3", metadata={"Header 1": "Header 3"}),
            ],
            "Headers with no associated content",
        ),
    ],
)
@pytest.mark.requires("bs4")
def test_html_header_text_splitter(
    html_header_splitter_splitter_factory: Any,
    headers_to_split_on: List[Tuple[str, str]],
    html_input: str,
    expected_documents: List[Document],
    test_case: str,
) -> None:
    """Check header-based splitting of HTML strings via ``split_text``.

    Args:
        html_header_splitter_splitter_factory: Factory fixture that builds an
            ``HTMLHeaderTextSplitter`` for the given headers.
        headers_to_split_on: ``(tag, metadata key)`` pairs to split on.
        html_input: The HTML input string to be split.
        expected_documents: Documents the splitter is expected to return.
        test_case: Description of the test case, used in failure messages.

    Raises:
        AssertionError: If the number of documents or their content/metadata
            does not match the expected values.
    """
    splitter = html_header_splitter_splitter_factory(
        headers_to_split_on=headers_to_split_on
    )
    docs = splitter.split_text(html_input)

    assert len(docs) == len(expected_documents), (
        f"Test Case '{test_case}' Failed: Number of documents mismatch. "
        f"Expected {len(expected_documents)}, got {len(docs)}."
    )
    for idx, (doc, expected) in enumerate(zip(docs, expected_documents), start=1):
        # Every fragment that contains a placeholder must be an f-string;
        # otherwise the actual content is not shown in the failure message.
        assert doc.page_content == expected.page_content, (
            f"Test Case '{test_case}' Failed at Document {idx}: "
            f"Content mismatch.\nExpected: {expected.page_content}"
            f"\nGot: {doc.page_content}"
        )
        assert doc.metadata == expected.metadata, (
            f"Test Case '{test_case}' Failed at Document {idx}: "
            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
        )
@pytest.mark.parametrize(
    "headers_to_split_on, html_content, expected_output, test_case",
    [
        (
            # Test Case A: Split on h1, h2, and h3 with nested headers
            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
            """
<!DOCTYPE html>
<html>
<body>
<div>
<h1>Foo</h1>
<p>Some intro text about Foo.</p>
<div>
<h2>Bar main section</h2>
<p>Some intro text about Bar.</p>
<h3>Bar subsection 1</h3>
<p>Some text about the first subtopic of Bar.</p>
<h3>Bar subsection 2</h3>
<p>Some text about the second subtopic of Bar.</p>
</div>
<div>
<h2>Baz</h2>
<p>Some text about Baz</p>
</div>
<br>
<p>Some concluding text about Foo</p>
</div>
</body>
</html>
""",
            [
                Document(metadata={"Header 1": "Foo"}, page_content="Foo"),
                Document(
                    metadata={"Header 1": "Foo"},
                    page_content="Some intro text about Foo.",
                ),
                Document(
                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
                    page_content="Bar main section",
                ),
                Document(
                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
                    page_content="Some intro text about Bar.",
                ),
                Document(
                    metadata={
                        "Header 1": "Foo",
                        "Header 2": "Bar main section",
                        "Header 3": "Bar subsection 1",
                    },
                    page_content="Bar subsection 1",
                ),
                Document(
                    metadata={
                        "Header 1": "Foo",
                        "Header 2": "Bar main section",
                        "Header 3": "Bar subsection 1",
                    },
                    page_content="Some text about the first subtopic of Bar.",
                ),
                Document(
                    metadata={
                        "Header 1": "Foo",
                        "Header 2": "Bar main section",
                        "Header 3": "Bar subsection 2",
                    },
                    page_content="Bar subsection 2",
                ),
                Document(
                    metadata={
                        "Header 1": "Foo",
                        "Header 2": "Bar main section",
                        "Header 3": "Bar subsection 2",
                    },
                    page_content="Some text about the second subtopic of Bar.",
                ),
                Document(
                    metadata={"Header 1": "Foo", "Header 2": "Baz"}, page_content="Baz"
                ),
                Document(
                    metadata={"Header 1": "Foo"},
                    page_content=(
                        "Some text about Baz \nSome concluding text about Foo"
                    ),
                ),
            ],
            "Test Case A: Split on h1, h2, and h3 with nested headers",
        ),
        (
            # Test Case B: Split on h1 only without any headers
            [("h1", "Header 1")],
            """
<html>
<body>
<p>Paragraph one.</p>
<p>Paragraph two.</p>
<p>Paragraph three.</p>
</body>
</html>
""",
            [
                Document(
                    metadata={},
                    page_content="Paragraph one. \nParagraph two. \nParagraph three.",
                )
            ],
            "Test Case B: Split on h1 only without any headers",
        ),
    ],
)
@pytest.mark.requires("bs4")
def test_additional_html_header_text_splitter(
    html_header_splitter_splitter_factory: Any,
    headers_to_split_on: List[Tuple[str, str]],
    html_content: str,
    expected_output: List[Document],
    test_case: str,
) -> None:
    """Check additional header-splitting scenarios via ``split_text``.

    Args:
        html_header_splitter_splitter_factory: Factory fixture that builds an
            ``HTMLHeaderTextSplitter`` for the given headers.
        headers_to_split_on: ``(tag, metadata key)`` pairs to split on.
        html_content: HTML content to be split.
        expected_output: Documents the splitter is expected to return.
        test_case: Description of the test case, used in failure messages.

    Raises:
        AssertionError: If the number of documents or their content/metadata
            does not match the expected output.
    """
    splitter = html_header_splitter_splitter_factory(
        headers_to_split_on=headers_to_split_on
    )
    docs = splitter.split_text(html_content)

    assert len(docs) == len(expected_output), (
        f"{test_case} Failed: Number of documents mismatch. "
        f"Expected {len(expected_output)}, got {len(docs)}."
    )
    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
        # Every fragment that contains a placeholder must be an f-string;
        # otherwise the actual content is not shown in the failure message.
        assert doc.page_content == expected.page_content, (
            f"{test_case} Failed at Document {idx}: "
            f"Content mismatch.\nExpected: {expected.page_content}\n"
            f"Got: {doc.page_content}"
        )
        assert doc.metadata == expected.metadata, (
            f"{test_case} Failed at Document {idx}: "
            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
        )
@pytest.mark.parametrize(
    "headers_to_split_on, html_content, expected_output, test_case",
    [
        (
            # Test Case C: Split on h1, h2, and h3 with no headers present
            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
            """
<html>
<body>
<p>Just some random text without headers.</p>
<div>
<span>More text here.</span>
</div>
</body>
</html>
""",
            [
                Document(
                    page_content="Just some random text without headers."
                    " \nMore text here.",
                    metadata={},
                )
            ],
            "Test Case C: Split on h1, h2, and h3 without any headers",
        )
    ],
)
@pytest.mark.requires("bs4")
def test_html_no_headers_with_multiple_splitters(
    html_header_splitter_splitter_factory: Any,
    headers_to_split_on: List[Tuple[str, str]],
    html_content: str,
    expected_output: List[Document],
    test_case: str,
) -> None:
    """Check splitting of header-less HTML when several headers are configured.

    Args:
        html_header_splitter_splitter_factory: Factory fixture that builds an
            ``HTMLHeaderTextSplitter`` for the given headers.
        headers_to_split_on: ``(tag, metadata key)`` pairs to split on.
        html_content: HTML content to be split.
        expected_output: Documents the splitter is expected to return.
        test_case: Description of the test case, used in failure messages.

    Raises:
        AssertionError: If the number of documents or their content/metadata
            does not match the expected output.
    """
    splitter = html_header_splitter_splitter_factory(
        headers_to_split_on=headers_to_split_on
    )
    docs = splitter.split_text(html_content)

    assert len(docs) == len(expected_output), (
        f"{test_case} Failed: Number of documents mismatch. "
        f"Expected {len(expected_output)}, got {len(docs)}."
    )
    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
        # Every fragment that contains a placeholder must be an f-string;
        # otherwise the actual content is not shown in the failure message.
        assert doc.page_content == expected.page_content, (
            f"{test_case} Failed at Document {idx}: "
            f"Content mismatch.\nExpected: {expected.page_content}\n"
            f"Got: {doc.page_content}"
        )
        assert doc.metadata == expected.metadata, (
            f"{test_case} Failed at Document {idx}: "
            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
        )
def test_split_text_on_tokens() -> None:
@@ -2480,7 +2907,8 @@ def test_html_splitter_with_custom_extractor() -> None:
expected = [
Document(
page_content="This is an iframe: [iframe:http://example.com](http://example.com)",
page_content="This is an iframe: "
"[iframe:http://example.com](http://example.com)",
metadata={"Header 1": "Section 1"},
),
]
@@ -2598,7 +3026,7 @@ def test_html_splitter_with_small_chunk_size() -> None:
"""Test HTML splitting with a very small chunk size to validate chunking."""
html_content = """
<h1>Section 1</h1>
<p>This is some long text that should be split into multiple chunks due to the
<p>This is some long text that should be split into multiple chunks due to the
small chunk size.</p>
"""
splitter = HTMLSemanticPreservingSplitter(