text-splitters[minor]: Replace lxml and XSLT with BeautifulSoup in HTMLHeaderTextSplitter for Improved Large HTML File Processing (#27678)

This pull request updates the `HTMLHeaderTextSplitter` by replacing the `split_text_from_file` method's implementation. The original method used `lxml` and XSLT for processing HTML files, which caused `lxml.etree.XSLTApplyError: maxHead` when handling large HTML documents due to limitations in the XSLT processor. Fixes #13149. By switching to BeautifulSoup (`bs4`), we achieve: - **Improved Performance and Reliability:** BeautifulSoup efficiently processes large HTML files without the errors associated with `lxml` and XSLT. - **Simplified Dependencies:** Removes the dependency on `lxml` and external XSLT files, relying instead on the widely used `beautifulsoup4` library. - **Maintained Functionality:** The new method replicates the original behavior, ensuring compatibility with existing code and preserving the extraction of content and metadata. **Issue:** This change addresses issues related to processing large HTML files with the existing `HTMLHeaderTextSplitter` implementation. It resolves problems where users encounter `lxml.etree.XSLTApplyError: maxHead` due to large HTML documents. **Dependencies:** - **BeautifulSoup (`beautifulsoup4`):** The `beautifulsoup4` library is now used for parsing HTML content. - Installation: `pip install beautifulsoup4` **Code Changes:** Updated the `split_text_from_file` method in `HTMLHeaderTextSplitter` as follows: ```python def split_text_from_file(self, file: Any) -> List[Document]: """Split HTML file using BeautifulSoup. Args: file: HTML file path or file-like object. Returns: List of Document objects with page_content and metadata. 
""" from bs4 import BeautifulSoup from langchain.docstore.document import Document import bs4 # Read the HTML content from the file or file-like object if isinstance(file, str): with open(file, 'r', encoding='utf-8') as f: html_content = f.read() else: # Assuming file is a file-like object html_content = file.read() # Parse the HTML content using BeautifulSoup soup = BeautifulSoup(html_content, 'html.parser') # Extract the header tags and their corresponding metadata keys headers_to_split_on = [tag[0] for tag in self.headers_to_split_on] header_mapping = dict(self.headers_to_split_on) documents = [] # Find the body of the document body = soup.body if soup.body else soup # Find all header tags in the order they appear all_headers = body.find_all(headers_to_split_on) # If there's content before the first header, collect it first_header = all_headers[0] if all_headers else None if first_header: pre_header_content = '' for elem in first_header.find_all_previous(): if isinstance(elem, bs4.Tag): text = elem.get_text(separator=' ', strip=True) if text: pre_header_content = text + ' ' + pre_header_content if pre_header_content.strip(): documents.append(Document( page_content=pre_header_content.strip(), metadata={} # No metadata since there's no header )) else: # If no headers are found, return the whole content full_text = body.get_text(separator=' ', strip=True) if full_text.strip(): documents.append(Document( page_content=full_text.strip(), metadata={} )) return documents # Process each header and its associated content for header in all_headers: current_metadata = {} header_name = header.name header_text = header.get_text(separator=' ', strip=True) current_metadata[header_mapping[header_name]] = header_text # Collect all sibling elements until the next header of the same or higher level content_elements = [] for sibling in header.find_next_siblings(): if sibling.name in headers_to_split_on: # Stop at the next header break if isinstance(sibling, bs4.Tag): 
content_elements.append(sibling) # Get the text content of the collected elements current_content = '' for elem in content_elements: text = elem.get_text(separator=' ', strip=True) if text: current_content += text + ' ' # Create a Document if there is content if current_content.strip(): documents.append(Document( page_content=current_content.strip(), metadata=current_metadata.copy() )) else: # If there's no content, but we have metadata, still create a Document documents.append(Document( page_content='', metadata=current_metadata.copy() )) return documents ``` --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
2025-09-07 22:11:51 +00:00 · 2025-01-20 23:10:37 +02:00
parent 989eec4b7b
commit d3ed9b86be
2 changed files with 736 additions and 156 deletions
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -4,7 +4,7 @@ import random
 import re
 import string
 from pathlib import Path
-from typing import Any, List
+from typing import Any, Callable, List, Tuple

 import pytest
 from langchain_core.documents import Document
@@ -2039,49 +2039,476 @@ def test_haskell_code_splitter() -> None:
    assert chunks == expected_chunks


-@pytest.mark.requires("lxml")
-def test_html_header_text_splitter(tmp_path: Path) -> None:
-    splitter = HTMLHeaderTextSplitter(
-        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
-    )
-
-    content = """
-<h1>Sample Document</h1>
-    <h2>Section</h2>
-        <p id="1234">Reference content.</p>
-
-    <h2>Lists</h2>
-        <ul>
-            <li>Item 1</li>
-            <li>Item 2</li>
-            <li>Item 3</li>
-        </ul>
-
-        <h3>A block</h3>
-            <div class="amazing">
-                <p>Some text</p>
-                <p>Some more text</p>
-            </div>
+@pytest.fixture
+def html_header_splitter_splitter_factory() -> (
+    Callable[[List[Tuple[str, str]]], HTMLHeaderTextSplitter]
+):
+    """
+    Fixture to create an HTMLHeaderTextSplitter instance with given headers.
+    This factory allows dynamic creation of splitters with different headers.
+    Tests using this fixture carry the ``requires("bs4")`` mark themselves.
    """

-    docs = splitter.split_text(content)
-    expected = [
-        Document(
-            page_content="Reference content.",
-            metadata={"Header 1": "Sample Document", "Header 2": "Section"},
-        ),
-        Document(
-            page_content="Item 1 Item 2 Item 3  \nSome text  \nSome more text",
-            metadata={"Header 1": "Sample Document", "Header 2": "Lists"},
-        ),
-    ]
-    assert docs == expected
+    def _create_splitter(
+        headers_to_split_on: List[Tuple[str, str]],
+    ) -> HTMLHeaderTextSplitter:
+        return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

-    with open(tmp_path / "doc.html", "w") as tmp:
-        tmp.write(content)
-    docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
+    return _create_splitter

-    assert docs_from_file == expected
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_input, expected_documents, test_case",
+    [
+        (
+            # Test Case 1: Split on h1 and h2
+            [("h1", "Header 1"), ("h2", "Header 2")],
+            """
+            <html>
+                <body>
+                    <h1>Introduction</h1>
+                    <p>This is the introduction.</p>
+                    <h2>Background</h2>
+                    <p>Background information.</p>
+                    <h1>Conclusion</h1>
+                    <p>Final thoughts.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Introduction", metadata={"Header 1": "Introduction"}
+                ),
+                Document(
+                    page_content="This is the introduction.",
+                    metadata={"Header 1": "Introduction"},
+                ),
+                Document(
+                    page_content="Background",
+                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
+                ),
+                Document(
+                    page_content="Background information.",
+                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
+                ),
+                Document(
+                    page_content="Conclusion", metadata={"Header 1": "Conclusion"}
+                ),
+                Document(
+                    page_content="Final thoughts.", metadata={"Header 1": "Conclusion"}
+                ),
+            ],
+            "Simple headers and paragraphs",
+        ),
+        (
+            # Test Case 2: Nested headers with h1, h2, and h3
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <html>
+                <body>
+                    <div>
+                        <h1>Main Title</h1>
+                        <div>
+                            <h2>Subsection</h2>
+                            <p>Details of subsection.</p>
+                            <div>
+                                <h3>Sub-subsection</h3>
+                                <p>More details.</p>
+                            </div>
+                        </div>
+                    </div>
+                    <h1>Another Main Title</h1>
+                    <p>Content under another main title.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Main Title", metadata={"Header 1": "Main Title"}
+                ),
+                Document(
+                    page_content="Subsection",
+                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
+                ),
+                Document(
+                    page_content="Details of subsection.",
+                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
+                ),
+                Document(
+                    page_content="Sub-subsection",
+                    metadata={
+                        "Header 1": "Main Title",
+                        "Header 2": "Subsection",
+                        "Header 3": "Sub-subsection",
+                    },
+                ),
+                Document(
+                    page_content="More details.",
+                    metadata={
+                        "Header 1": "Main Title",
+                        "Header 2": "Subsection",
+                        "Header 3": "Sub-subsection",
+                    },
+                ),
+                Document(
+                    page_content="Another Main Title",
+                    metadata={"Header 1": "Another Main Title"},
+                ),
+                Document(
+                    page_content="Content under another main title.",
+                    metadata={"Header 1": "Another Main Title"},
+                ),
+            ],
+            "Nested headers with h1, h2, and h3",
+        ),
+        (
+            # Test Case 3: No headers
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <p>Paragraph one.</p>
+                    <p>Paragraph two.</p>
+                    <div>
+                        <p>Paragraph three.</p>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Paragraph one.  \nParagraph two.  \nParagraph three.",
+                    metadata={},
+                )
+            ],
+            "No headers present",
+        ),
+        (
+            # Test Case 4: Multiple headers of the same level
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <h1>Chapter 1</h1>
+                    <p>Content of chapter 1.</p>
+                    <h1>Chapter 2</h1>
+                    <p>Content of chapter 2.</p>
+                    <h1>Chapter 3</h1>
+                    <p>Content of chapter 3.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(page_content="Chapter 1", metadata={"Header 1": "Chapter 1"}),
+                Document(
+                    page_content="Content of chapter 1.",
+                    metadata={"Header 1": "Chapter 1"},
+                ),
+                Document(page_content="Chapter 2", metadata={"Header 1": "Chapter 2"}),
+                Document(
+                    page_content="Content of chapter 2.",
+                    metadata={"Header 1": "Chapter 2"},
+                ),
+                Document(page_content="Chapter 3", metadata={"Header 1": "Chapter 3"}),
+                Document(
+                    page_content="Content of chapter 3.",
+                    metadata={"Header 1": "Chapter 3"},
+                ),
+            ],
+            "Multiple headers of the same level",
+        ),
+        (
+            # Test Case 5: Headers with no content
+            [("h1", "Header 1"), ("h2", "Header 2")],
+            """
+            <html>
+                <body>
+                    <h1>Header 1</h1>
+                    <h2>Header 2</h2>
+                    <h1>Header 3</h1>
+                </body>
+            </html>
+            """,
+            [
+                Document(page_content="Header 1", metadata={"Header 1": "Header 1"}),
+                Document(
+                    page_content="Header 2",
+                    metadata={"Header 1": "Header 1", "Header 2": "Header 2"},
+                ),
+                Document(page_content="Header 3", metadata={"Header 1": "Header 3"}),
+            ],
+            "Headers with no associated content",
+        ),
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_html_header_text_splitter(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_input: str,
+    expected_documents: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test the HTML header text splitter.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory function to create
+            the HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_input (str): The HTML input string to be split.
+        expected_documents (List[Document]): List of expected Document objects.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected values.
+    """
+
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_input)
+
+    assert len(docs) == len(expected_documents), (
+        f"Test Case '{test_case}' Failed: Number of documents mismatch. "
+        f"Expected {len(expected_documents)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_documents), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"Test Case '{test_case}' Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}"
+            f"\nGot: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"Test Case '{test_case}' Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_content, expected_output, test_case",
+    [
+        (
+            # Test Case A: Split on h1 and h2 with h3 in content
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <!DOCTYPE html>
+            <html>
+            <body>
+                <div>
+                    <h1>Foo</h1>
+                    <p>Some intro text about Foo.</p>
+                    <div>
+                        <h2>Bar main section</h2>
+                        <p>Some intro text about Bar.</p>
+                        <h3>Bar subsection 1</h3>
+                        <p>Some text about the first subtopic of Bar.</p>
+                        <h3>Bar subsection 2</h3>
+                        <p>Some text about the second subtopic of Bar.</p>
+                    </div>
+                    <div>
+                        <h2>Baz</h2>
+                        <p>Some text about Baz</p>
+                    </div>
+                    <br>
+                    <p>Some concluding text about Foo</p>
+                </div>
+            </body>
+            </html>
+            """,
+            [
+                Document(metadata={"Header 1": "Foo"}, page_content="Foo"),
+                Document(
+                    metadata={"Header 1": "Foo"},
+                    page_content="Some intro text about Foo.",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
+                    page_content="Bar main section",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
+                    page_content="Some intro text about Bar.",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 1",
+                    },
+                    page_content="Bar subsection 1",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 1",
+                    },
+                    page_content="Some text about the first subtopic of Bar.",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 2",
+                    },
+                    page_content="Bar subsection 2",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 2",
+                    },
+                    page_content="Some text about the second subtopic of Bar.",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Baz"}, page_content="Baz"
+                ),
+                Document(
+                    metadata={"Header 1": "Foo"},
+                    page_content=(
+                        "Some text about Baz  \nSome concluding text about Foo"
+                    ),
+                ),
+            ],
+            "Test Case A: Split on h1, h2, and h3 with nested headers",
+        ),
+        (
+            # Test Case B: Split on h1 only without any headers
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <p>Paragraph one.</p>
+                    <p>Paragraph two.</p>
+                    <p>Paragraph three.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    metadata={},
+                    page_content="Paragraph one.  \nParagraph two.  \nParagraph three.",
+                )
+            ],
+            "Test Case B: Split on h1 only without any headers",
+        ),
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_additional_html_header_text_splitter(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_content: str,
+    expected_output: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test the HTML header text splitter on nested and header-less documents.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory function to create
+            the HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_content (str): HTML content to be split.
+        expected_output (List[Document]): Expected list of Document objects.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected output.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_content)
+
+    assert len(docs) == len(expected_output), (
+        f"{test_case} Failed: Number of documents mismatch. "
+        f"Expected {len(expected_output)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_content, expected_output, test_case",
+    [
+        (
+            # Test Case C: Split on h1, h2, and h3 with no headers present
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <html>
+                <body>
+                    <p>Just some random text without headers.</p>
+                    <div>
+                        <span>More text here.</span>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Just some random text without headers."
+                    "  \nMore text here.",
+                    metadata={},
+                )
+            ],
+            "Test Case C: Split on h1, h2, and h3 without any headers",
+        )
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_html_no_headers_with_multiple_splitters(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_content: str,
+    expected_output: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test HTML content splitting without headers using multiple splitters.
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory to create the
+            HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_content (str): HTML content to be split.
+        expected_output (List[Document]): Expected list of Document objects
+            after splitting.
+        test_case (str): Description of the test case.
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected output.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_content)
+
+    assert len(docs) == len(expected_output), (
+        f"{test_case} Failed: Number of documents mismatch. "
+        f"Expected {len(expected_output)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )


 def test_split_text_on_tokens() -> None:
@@ -2480,7 +2907,8 @@ def test_html_splitter_with_custom_extractor() -> None:

    expected = [
        Document(
-            page_content="This is an iframe: [iframe:http://example.com](http://example.com)",
+            page_content="This is an iframe: "
+            "[iframe:http://example.com](http://example.com)",
            metadata={"Header 1": "Section 1"},
        ),
    ]
@@ -2598,7 +3026,7 @@ def test_html_splitter_with_small_chunk_size() -> None:
    """Test HTML splitting with a very small chunk size to validate chunking."""
    html_content = """
    <h1>Section 1</h1>
-    <p>This is some long text that should be split into multiple chunks due to the 
+    <p>This is some long text that should be split into multiple chunks due to the
    small chunk size.</p>
    """
    splitter = HTMLSemanticPreservingSplitter(