mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-07 22:11:51 +00:00
text-splitters[minor]: Replace lxml and XSLT with BeautifulSoup in HTMLHeaderTextSplitter for Improved Large HTML File Processing (#27678)
This pull request updates the `HTMLHeaderTextSplitter` by replacing the `split_text_from_file` method's implementation. The original method used `lxml` and XSLT for processing HTML files, which caused `lxml.etree.XSLTApplyError: maxHead` when handling large HTML documents due to limitations in the XSLT processor. Fixes #13149. By switching to BeautifulSoup (`bs4`), we achieve: - **Improved Performance and Reliability:** BeautifulSoup efficiently processes large HTML files without the errors associated with `lxml` and XSLT. - **Simplified Dependencies:** Removes the dependency on `lxml` and external XSLT files, relying instead on the widely used `beautifulsoup4` library. - **Maintained Functionality:** The new method replicates the original behavior, ensuring compatibility with existing code and preserving the extraction of content and metadata. **Issue:** This change addresses issues related to processing large HTML files with the existing `HTMLHeaderTextSplitter` implementation. It resolves problems where users encounter `lxml.etree.XSLTApplyError: maxHead` due to large HTML documents. **Dependencies:** - **BeautifulSoup (`beautifulsoup4`):** The `beautifulsoup4` library is now used for parsing HTML content. - Installation: `pip install beautifulsoup4` **Code Changes:** Updated the `split_text_from_file` method in `HTMLHeaderTextSplitter` as follows: ```python def split_text_from_file(self, file: Any) -> List[Document]: """Split HTML file using BeautifulSoup. Args: file: HTML file path or file-like object. Returns: List of Document objects with page_content and metadata. 
""" from bs4 import BeautifulSoup from langchain.docstore.document import Document import bs4 # Read the HTML content from the file or file-like object if isinstance(file, str): with open(file, 'r', encoding='utf-8') as f: html_content = f.read() else: # Assuming file is a file-like object html_content = file.read() # Parse the HTML content using BeautifulSoup soup = BeautifulSoup(html_content, 'html.parser') # Extract the header tags and their corresponding metadata keys headers_to_split_on = [tag[0] for tag in self.headers_to_split_on] header_mapping = dict(self.headers_to_split_on) documents = [] # Find the body of the document body = soup.body if soup.body else soup # Find all header tags in the order they appear all_headers = body.find_all(headers_to_split_on) # If there's content before the first header, collect it first_header = all_headers[0] if all_headers else None if first_header: pre_header_content = '' for elem in first_header.find_all_previous(): if isinstance(elem, bs4.Tag): text = elem.get_text(separator=' ', strip=True) if text: pre_header_content = text + ' ' + pre_header_content if pre_header_content.strip(): documents.append(Document( page_content=pre_header_content.strip(), metadata={} # No metadata since there's no header )) else: # If no headers are found, return the whole content full_text = body.get_text(separator=' ', strip=True) if full_text.strip(): documents.append(Document( page_content=full_text.strip(), metadata={} )) return documents # Process each header and its associated content for header in all_headers: current_metadata = {} header_name = header.name header_text = header.get_text(separator=' ', strip=True) current_metadata[header_mapping[header_name]] = header_text # Collect all sibling elements until the next header of the same or higher level content_elements = [] for sibling in header.find_next_siblings(): if sibling.name in headers_to_split_on: # Stop at the next header break if isinstance(sibling, bs4.Tag): 
content_elements.append(sibling) # Get the text content of the collected elements current_content = '' for elem in content_elements: text = elem.get_text(separator=' ', strip=True) if text: current_content += text + ' ' # Create a Document if there is content if current_content.strip(): documents.append(Document( page_content=current_content.strip(), metadata=current_metadata.copy() )) else: # If there's no content, but we have metadata, still create a Document documents.append(Document( page_content='', metadata=current_metadata.copy() )) return documents ``` --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
@@ -4,7 +4,7 @@ import random
|
||||
import re
|
||||
import string
|
||||
from pathlib import Path
|
||||
from typing import Any, List
|
||||
from typing import Any, Callable, List, Tuple
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
@@ -2039,49 +2039,476 @@ def test_haskell_code_splitter() -> None:
|
||||
assert chunks == expected_chunks
|
||||
|
||||
|
||||
@pytest.mark.requires("lxml")
|
||||
def test_html_header_text_splitter(tmp_path: Path) -> None:
|
||||
splitter = HTMLHeaderTextSplitter(
|
||||
headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
|
||||
)
|
||||
|
||||
content = """
|
||||
<h1>Sample Document</h1>
|
||||
<h2>Section</h2>
|
||||
<p id="1234">Reference content.</p>
|
||||
|
||||
<h2>Lists</h2>
|
||||
<ul>
|
||||
<li>Item 1</li>
|
||||
<li>Item 2</li>
|
||||
<li>Item 3</li>
|
||||
</ul>
|
||||
|
||||
<h3>A block</h3>
|
||||
<div class="amazing">
|
||||
<p>Some text</p>
|
||||
<p>Some more text</p>
|
||||
</div>
|
||||
@pytest.fixture
|
||||
@pytest.mark.requires("bs4")
|
||||
def html_header_splitter_splitter_factory() -> (
|
||||
Callable[[List[Tuple[str, str]]], HTMLHeaderTextSplitter]
|
||||
):
|
||||
"""
|
||||
Fixture to create an HTMLHeaderTextSplitter instance with given headers.
|
||||
This factory allows dynamic creation of splitters with different headers.
|
||||
"""
|
||||
|
||||
docs = splitter.split_text(content)
|
||||
expected = [
|
||||
Document(
|
||||
page_content="Reference content.",
|
||||
metadata={"Header 1": "Sample Document", "Header 2": "Section"},
|
||||
),
|
||||
Document(
|
||||
page_content="Item 1 Item 2 Item 3 \nSome text \nSome more text",
|
||||
metadata={"Header 1": "Sample Document", "Header 2": "Lists"},
|
||||
),
|
||||
]
|
||||
assert docs == expected
|
||||
def _create_splitter(
|
||||
headers_to_split_on: List[Tuple[str, str]],
|
||||
) -> HTMLHeaderTextSplitter:
|
||||
return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
|
||||
|
||||
with open(tmp_path / "doc.html", "w") as tmp:
|
||||
tmp.write(content)
|
||||
docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
|
||||
return _create_splitter
|
||||
|
||||
assert docs_from_file == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"headers_to_split_on, html_input, expected_documents, test_case",
|
||||
[
|
||||
(
|
||||
# Test Case 1: Split on h1 and h2
|
||||
[("h1", "Header 1"), ("h2", "Header 2")],
|
||||
"""
|
||||
<html>
|
||||
<body>
|
||||
<h1>Introduction</h1>
|
||||
<p>This is the introduction.</p>
|
||||
<h2>Background</h2>
|
||||
<p>Background information.</p>
|
||||
<h1>Conclusion</h1>
|
||||
<p>Final thoughts.</p>
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
[
|
||||
Document(
|
||||
page_content="Introduction", metadata={"Header 1": "Introduction"}
|
||||
),
|
||||
Document(
|
||||
page_content="This is the introduction.",
|
||||
metadata={"Header 1": "Introduction"},
|
||||
),
|
||||
Document(
|
||||
page_content="Background",
|
||||
metadata={"Header 1": "Introduction", "Header 2": "Background"},
|
||||
),
|
||||
Document(
|
||||
page_content="Background information.",
|
||||
metadata={"Header 1": "Introduction", "Header 2": "Background"},
|
||||
),
|
||||
Document(
|
||||
page_content="Conclusion", metadata={"Header 1": "Conclusion"}
|
||||
),
|
||||
Document(
|
||||
page_content="Final thoughts.", metadata={"Header 1": "Conclusion"}
|
||||
),
|
||||
],
|
||||
"Simple headers and paragraphs",
|
||||
),
|
||||
(
|
||||
# Test Case 2: Nested headers with h1, h2, and h3
|
||||
[("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
|
||||
"""
|
||||
<html>
|
||||
<body>
|
||||
<div>
|
||||
<h1>Main Title</h1>
|
||||
<div>
|
||||
<h2>Subsection</h2>
|
||||
<p>Details of subsection.</p>
|
||||
<div>
|
||||
<h3>Sub-subsection</h3>
|
||||
<p>More details.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<h1>Another Main Title</h1>
|
||||
<p>Content under another main title.</p>
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
[
|
||||
Document(
|
||||
page_content="Main Title", metadata={"Header 1": "Main Title"}
|
||||
),
|
||||
Document(
|
||||
page_content="Subsection",
|
||||
metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
|
||||
),
|
||||
Document(
|
||||
page_content="Details of subsection.",
|
||||
metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
|
||||
),
|
||||
Document(
|
||||
page_content="Sub-subsection",
|
||||
metadata={
|
||||
"Header 1": "Main Title",
|
||||
"Header 2": "Subsection",
|
||||
"Header 3": "Sub-subsection",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="More details.",
|
||||
metadata={
|
||||
"Header 1": "Main Title",
|
||||
"Header 2": "Subsection",
|
||||
"Header 3": "Sub-subsection",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="Another Main Title",
|
||||
metadata={"Header 1": "Another Main Title"},
|
||||
),
|
||||
Document(
|
||||
page_content="Content under another main title.",
|
||||
metadata={"Header 1": "Another Main Title"},
|
||||
),
|
||||
],
|
||||
"Nested headers with h1, h2, and h3",
|
||||
),
|
||||
(
|
||||
# Test Case 3: No headers
|
||||
[("h1", "Header 1")],
|
||||
"""
|
||||
<html>
|
||||
<body>
|
||||
<p>Paragraph one.</p>
|
||||
<p>Paragraph two.</p>
|
||||
<div>
|
||||
<p>Paragraph three.</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
[
|
||||
Document(
|
||||
page_content="Paragraph one. \nParagraph two. \nParagraph three.",
|
||||
metadata={},
|
||||
)
|
||||
],
|
||||
"No headers present",
|
||||
),
|
||||
(
|
||||
# Test Case 4: Multiple headers of the same level
|
||||
[("h1", "Header 1")],
|
||||
"""
|
||||
<html>
|
||||
<body>
|
||||
<h1>Chapter 1</h1>
|
||||
<p>Content of chapter 1.</p>
|
||||
<h1>Chapter 2</h1>
|
||||
<p>Content of chapter 2.</p>
|
||||
<h1>Chapter 3</h1>
|
||||
<p>Content of chapter 3.</p>
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
[
|
||||
Document(page_content="Chapter 1", metadata={"Header 1": "Chapter 1"}),
|
||||
Document(
|
||||
page_content="Content of chapter 1.",
|
||||
metadata={"Header 1": "Chapter 1"},
|
||||
),
|
||||
Document(page_content="Chapter 2", metadata={"Header 1": "Chapter 2"}),
|
||||
Document(
|
||||
page_content="Content of chapter 2.",
|
||||
metadata={"Header 1": "Chapter 2"},
|
||||
),
|
||||
Document(page_content="Chapter 3", metadata={"Header 1": "Chapter 3"}),
|
||||
Document(
|
||||
page_content="Content of chapter 3.",
|
||||
metadata={"Header 1": "Chapter 3"},
|
||||
),
|
||||
],
|
||||
"Multiple headers of the same level",
|
||||
),
|
||||
(
|
||||
# Test Case 5: Headers with no content
|
||||
[("h1", "Header 1"), ("h2", "Header 2")],
|
||||
"""
|
||||
<html>
|
||||
<body>
|
||||
<h1>Header 1</h1>
|
||||
<h2>Header 2</h2>
|
||||
<h1>Header 3</h1>
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
[
|
||||
Document(page_content="Header 1", metadata={"Header 1": "Header 1"}),
|
||||
Document(
|
||||
page_content="Header 2",
|
||||
metadata={"Header 1": "Header 1", "Header 2": "Header 2"},
|
||||
),
|
||||
Document(page_content="Header 3", metadata={"Header 1": "Header 3"}),
|
||||
],
|
||||
"Headers with no associated content",
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_header_text_splitter(
|
||||
html_header_splitter_splitter_factory: Any,
|
||||
headers_to_split_on: List[Tuple[str, str]],
|
||||
html_input: str,
|
||||
expected_documents: List[Document],
|
||||
test_case: str,
|
||||
) -> None:
|
||||
"""
|
||||
Test the HTML header text splitter.
|
||||
|
||||
Args:
|
||||
html_header_splitter_splitter_factory (Any): Factory function to create
|
||||
the HTML header splitter.
|
||||
headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
|
||||
html_input (str): The HTML input string to be split.
|
||||
expected_documents (List[Document]): List of expected Document objects.
|
||||
test_case (str): Description of the test case.
|
||||
|
||||
Raises:
|
||||
AssertionError: If the number of documents or their content/metadata
|
||||
does not match the expected values.
|
||||
"""
|
||||
|
||||
splitter = html_header_splitter_splitter_factory(
|
||||
headers_to_split_on=headers_to_split_on
|
||||
)
|
||||
docs = splitter.split_text(html_input)
|
||||
|
||||
assert len(docs) == len(expected_documents), (
|
||||
f"Test Case '{test_case}' Failed: Number of documents mismatch. "
|
||||
f"Expected {len(expected_documents)}, got {len(docs)}."
|
||||
)
|
||||
for idx, (doc, expected) in enumerate(zip(docs, expected_documents), start=1):
|
||||
assert doc.page_content == expected.page_content, (
|
||||
f"Test Case '{test_case}' Failed at Document {idx}: "
|
||||
f"Content mismatch.\nExpected: {expected.page_content}"
|
||||
"\nGot: {doc.page_content}"
|
||||
)
|
||||
assert doc.metadata == expected.metadata, (
|
||||
f"Test Case '{test_case}' Failed at Document {idx}: "
|
||||
f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"headers_to_split_on, html_content, expected_output, test_case",
|
||||
[
|
||||
(
|
||||
# Test Case A: Split on h1 and h2 with h3 in content
|
||||
[("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
|
||||
"""
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<div>
|
||||
<h1>Foo</h1>
|
||||
<p>Some intro text about Foo.</p>
|
||||
<div>
|
||||
<h2>Bar main section</h2>
|
||||
<p>Some intro text about Bar.</p>
|
||||
<h3>Bar subsection 1</h3>
|
||||
<p>Some text about the first subtopic of Bar.</p>
|
||||
<h3>Bar subsection 2</h3>
|
||||
<p>Some text about the second subtopic of Bar.</p>
|
||||
</div>
|
||||
<div>
|
||||
<h2>Baz</h2>
|
||||
<p>Some text about Baz</p>
|
||||
</div>
|
||||
<br>
|
||||
<p>Some concluding text about Foo</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
[
|
||||
Document(metadata={"Header 1": "Foo"}, page_content="Foo"),
|
||||
Document(
|
||||
metadata={"Header 1": "Foo"},
|
||||
page_content="Some intro text about Foo.",
|
||||
),
|
||||
Document(
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
|
||||
page_content="Bar main section",
|
||||
),
|
||||
Document(
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
|
||||
page_content="Some intro text about Bar.",
|
||||
),
|
||||
Document(
|
||||
metadata={
|
||||
"Header 1": "Foo",
|
||||
"Header 2": "Bar main section",
|
||||
"Header 3": "Bar subsection 1",
|
||||
},
|
||||
page_content="Bar subsection 1",
|
||||
),
|
||||
Document(
|
||||
metadata={
|
||||
"Header 1": "Foo",
|
||||
"Header 2": "Bar main section",
|
||||
"Header 3": "Bar subsection 1",
|
||||
},
|
||||
page_content="Some text about the first subtopic of Bar.",
|
||||
),
|
||||
Document(
|
||||
metadata={
|
||||
"Header 1": "Foo",
|
||||
"Header 2": "Bar main section",
|
||||
"Header 3": "Bar subsection 2",
|
||||
},
|
||||
page_content="Bar subsection 2",
|
||||
),
|
||||
Document(
|
||||
metadata={
|
||||
"Header 1": "Foo",
|
||||
"Header 2": "Bar main section",
|
||||
"Header 3": "Bar subsection 2",
|
||||
},
|
||||
page_content="Some text about the second subtopic of Bar.",
|
||||
),
|
||||
Document(
|
||||
metadata={"Header 1": "Foo", "Header 2": "Baz"}, page_content="Baz"
|
||||
),
|
||||
Document(
|
||||
metadata={"Header 1": "Foo"},
|
||||
page_content=(
|
||||
"Some text about Baz \nSome concluding text about Foo"
|
||||
),
|
||||
),
|
||||
],
|
||||
"Test Case A: Split on h1, h2, and h3 with nested headers",
|
||||
),
|
||||
(
|
||||
# Test Case B: Split on h1 only without any headers
|
||||
[("h1", "Header 1")],
|
||||
"""
|
||||
<html>
|
||||
<body>
|
||||
<p>Paragraph one.</p>
|
||||
<p>Paragraph two.</p>
|
||||
<p>Paragraph three.</p>
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
[
|
||||
Document(
|
||||
metadata={},
|
||||
page_content="Paragraph one. \nParagraph two. \nParagraph three.",
|
||||
)
|
||||
],
|
||||
"Test Case B: Split on h1 only without any headers",
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_additional_html_header_text_splitter(
|
||||
html_header_splitter_splitter_factory: Any,
|
||||
headers_to_split_on: List[Tuple[str, str]],
|
||||
html_content: str,
|
||||
expected_output: List[Document],
|
||||
test_case: str,
|
||||
) -> None:
|
||||
"""
|
||||
Test the HTML header text splitter.
|
||||
|
||||
Args:
|
||||
html_header_splitter_splitter_factory (Any): Factory function to create
|
||||
the HTML header splitter.
|
||||
headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
|
||||
html_content (str): HTML content to be split.
|
||||
expected_output (List[Document]): Expected list of Document objects.
|
||||
test_case (str): Description of the test case.
|
||||
|
||||
Raises:
|
||||
AssertionError: If the number of documents or their content/metadata
|
||||
does not match the expected output.
|
||||
"""
|
||||
splitter = html_header_splitter_splitter_factory(
|
||||
headers_to_split_on=headers_to_split_on
|
||||
)
|
||||
docs = splitter.split_text(html_content)
|
||||
|
||||
assert len(docs) == len(expected_output), (
|
||||
f"{test_case} Failed: Number of documents mismatch. "
|
||||
f"Expected {len(expected_output)}, got {len(docs)}."
|
||||
)
|
||||
for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
|
||||
assert doc.page_content == expected.page_content, (
|
||||
f"{test_case} Failed at Document {idx}: "
|
||||
f"Content mismatch.\nExpected: {expected.page_content}\n"
|
||||
"Got: {doc.page_content}"
|
||||
)
|
||||
assert doc.metadata == expected.metadata, (
|
||||
f"{test_case} Failed at Document {idx}: "
|
||||
f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"headers_to_split_on, html_content, expected_output, test_case",
|
||||
[
|
||||
(
|
||||
# Test Case C: Split on h1, h2, and h3 with no headers present
|
||||
[("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
|
||||
"""
|
||||
<html>
|
||||
<body>
|
||||
<p>Just some random text without headers.</p>
|
||||
<div>
|
||||
<span>More text here.</span>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
[
|
||||
Document(
|
||||
page_content="Just some random text without headers."
|
||||
" \nMore text here.",
|
||||
metadata={},
|
||||
)
|
||||
],
|
||||
"Test Case C: Split on h1, h2, and h3 without any headers",
|
||||
)
|
||||
],
|
||||
)
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_no_headers_with_multiple_splitters(
|
||||
html_header_splitter_splitter_factory: Any,
|
||||
headers_to_split_on: List[Tuple[str, str]],
|
||||
html_content: str,
|
||||
expected_output: List[Document],
|
||||
test_case: str,
|
||||
) -> None:
|
||||
"""
|
||||
Test HTML content splitting without headers using multiple splitters.
|
||||
Args:
|
||||
html_header_splitter_splitter_factory (Any): Factory to create the
|
||||
HTML header splitter.
|
||||
headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
|
||||
html_content (str): HTML content to be split.
|
||||
expected_output (List[Document]): Expected list of Document objects
|
||||
after splitting.
|
||||
test_case (str): Description of the test case.
|
||||
Raises:
|
||||
AssertionError: If the number of documents or their content/metadata
|
||||
does not match the expected output.
|
||||
"""
|
||||
splitter = html_header_splitter_splitter_factory(
|
||||
headers_to_split_on=headers_to_split_on
|
||||
)
|
||||
docs = splitter.split_text(html_content)
|
||||
|
||||
assert len(docs) == len(expected_output), (
|
||||
f"{test_case} Failed: Number of documents mismatch. "
|
||||
f"Expected {len(expected_output)}, got {len(docs)}."
|
||||
)
|
||||
for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
|
||||
assert doc.page_content == expected.page_content, (
|
||||
f"{test_case} Failed at Document {idx}: "
|
||||
f"Content mismatch.\nExpected: {expected.page_content}\n"
|
||||
"Got: {doc.page_content}"
|
||||
)
|
||||
assert doc.metadata == expected.metadata, (
|
||||
f"{test_case} Failed at Document {idx}: "
|
||||
f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
|
||||
)
|
||||
|
||||
|
||||
def test_split_text_on_tokens() -> None:
|
||||
@@ -2480,7 +2907,8 @@ def test_html_splitter_with_custom_extractor() -> None:
|
||||
|
||||
expected = [
|
||||
Document(
|
||||
page_content="This is an iframe: [iframe:http://example.com](http://example.com)",
|
||||
page_content="This is an iframe: "
|
||||
"[iframe:http://example.com](http://example.com)",
|
||||
metadata={"Header 1": "Section 1"},
|
||||
),
|
||||
]
|
||||
@@ -2598,7 +3026,7 @@ def test_html_splitter_with_small_chunk_size() -> None:
|
||||
"""Test HTML splitting with a very small chunk size to validate chunking."""
|
||||
html_content = """
|
||||
<h1>Section 1</h1>
|
||||
<p>This is some long text that should be split into multiple chunks due to the
|
||||
<p>This is some long text that should be split into multiple chunks due to the
|
||||
small chunk size.</p>
|
||||
"""
|
||||
splitter = HTMLSemanticPreservingSplitter(
|
||||
|
Reference in New Issue
Block a user