mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 08:33:49 +00:00
text-splitters: Add keep_separator arg to HTMLSemanticPreservingSplitter (#31588)
### Description

Add a `keep_separator` argument to `HTMLSemanticPreservingSplitter` and pass its value to the `RecursiveCharacterTextSplitter` instance used under the hood.

### Issue

Documents returned by `HTMLSemanticPreservingSplitter.split_text(text)` always keep separators at the beginning of `page_content`; there is currently no way to change this. [See the third and fourth documents in the example output from the how-to guide](https://python.langchain.com/docs/how_to/split_html/#using-htmlsemanticpreservingsplitter):

```
[Document(metadata={'Header 1': 'Main Title'}, page_content='This is an introductory paragraph with some basic content.'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='This section introduces the topic'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='. Below is a list: First item Second item Third item with bold text and a link Subsection 1.1: Details This subsection provides additional details'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content=". Here's a table: Header 1 Header 2 Header 3 Row 1, Cell 1 Row 1, Cell 2 Row 1, Cell 3 Row 2, Cell 1 Row 2, Cell 2 Row 2, Cell 3"), Document(metadata={'Header 2': 'Section 2: Media Content'}, page_content='This section contains an image and a video:  '), Document(metadata={'Header 2': 'Section 3: Code Example'}, page_content='This section contains a code block: <code:html> <div> <p>This is a paragraph inside a div.</p> </div> </code>'), Document(metadata={'Header 2': 'Conclusion'}, page_content='This is the conclusion of the document.')]
```

### Dependencies

None

@ttrumper3
This commit is contained in:
parent
52e57cdc20
commit
532e6455e9
@ -10,10 +10,12 @@ from typing import (
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Literal,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
TypedDict,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
@ -535,6 +537,8 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
preserve_parent_metadata (bool): Whether to pass through parent document
|
||||
metadata to split documents when calling
|
||||
``transform_documents/atransform_documents()``.
|
||||
keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
|
||||
should be at the beginning of a chunk, at the end, or not at all.
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
@ -584,6 +588,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
allowlist_tags: Optional[List[str]] = None,
|
||||
denylist_tags: Optional[List[str]] = None,
|
||||
preserve_parent_metadata: bool = False,
|
||||
keep_separator: Union[bool, Literal["start", "end"]] = True,
|
||||
):
|
||||
"""Initialize splitter."""
|
||||
try:
|
||||
@ -611,6 +616,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
self._external_metadata = external_metadata or {}
|
||||
self._allowlist_tags = allowlist_tags
|
||||
self._preserve_parent_metadata = preserve_parent_metadata
|
||||
self._keep_separator = keep_separator
|
||||
if allowlist_tags:
|
||||
self._allowlist_tags = list(
|
||||
set(allowlist_tags + [header[0] for header in headers_to_split_on])
|
||||
@ -625,12 +631,15 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
if separators:
|
||||
self._recursive_splitter = RecursiveCharacterTextSplitter(
|
||||
separators=separators,
|
||||
keep_separator=keep_separator,
|
||||
chunk_size=max_chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
)
|
||||
else:
|
||||
self._recursive_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=max_chunk_size, chunk_overlap=chunk_overlap
|
||||
keep_separator=keep_separator,
|
||||
chunk_size=max_chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
)
|
||||
|
||||
if self._stopword_removal:
|
||||
|
@ -3375,6 +3375,148 @@ def test_html_splitter_with_media_preservation() -> None:
|
||||
assert documents == expected
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_true() -> None:
    """Verify that ``keep_separator=True`` keeps the separator at the chunk start."""
    html_content = """
    <h1>Section 1</h1>
    <p>This is some text. This is some other text.</p>
    """
    section_metadata = {"Header 1": "Section 1"}
    html_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=[("h1", "Header 1")],
        max_chunk_size=10,
        separators=[". "],
        keep_separator=True,
    )

    # The ". " separator is expected to lead the second chunk.
    expected = [
        Document(page_content=chunk_text, metadata=dict(section_metadata))
        for chunk_text in ("This is some text", ". This is some other text.")
    ]

    assert html_splitter.split_text(html_content) == expected
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_false() -> None:
    """Verify that ``keep_separator=False`` drops the separator from the chunks."""
    html_content = """
    <h1>Section 1</h1>
    <p>This is some text. This is some other text.</p>
    """
    section_metadata = {"Header 1": "Section 1"}
    html_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=[("h1", "Header 1")],
        max_chunk_size=10,
        separators=[". "],
        keep_separator=False,
    )

    # Neither chunk is expected to carry the ". " separator.
    expected = [
        Document(page_content=chunk_text, metadata=dict(section_metadata))
        for chunk_text in ("This is some text", "This is some other text.")
    ]

    assert html_splitter.split_text(html_content) == expected
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_start() -> None:
    """Verify that ``keep_separator="start"`` puts the separator at the chunk start."""
    html_content = """
    <h1>Section 1</h1>
    <p>This is some text. This is some other text.</p>
    """
    section_metadata = {"Header 1": "Section 1"}
    html_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=[("h1", "Header 1")],
        max_chunk_size=10,
        separators=[". "],
        keep_separator="start",
    )

    # The ". " separator is expected to lead the second chunk.
    expected = [
        Document(page_content=chunk_text, metadata=dict(section_metadata))
        for chunk_text in ("This is some text", ". This is some other text.")
    ]

    assert html_splitter.split_text(html_content) == expected
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_end() -> None:
    """Verify that ``keep_separator="end"`` leaves the separator at the chunk end."""
    html_content = """
    <h1>Section 1</h1>
    <p>This is some text. This is some other text.</p>
    """
    section_metadata = {"Header 1": "Section 1"}
    html_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=[("h1", "Header 1")],
        max_chunk_size=10,
        separators=[". "],
        keep_separator="end",
    )

    # The first chunk is expected to end with the period from the separator.
    expected = [
        Document(page_content=chunk_text, metadata=dict(section_metadata))
        for chunk_text in ("This is some text.", "This is some other text.")
    ]

    assert html_splitter.split_text(html_content) == expected
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_default() -> None:
    """Verify the chunks produced when ``keep_separator`` is not passed at all."""
    html_content = """
    <h1>Section 1</h1>
    <p>This is some text. This is some other text.</p>
    """
    section_metadata = {"Header 1": "Section 1"}
    # ``keep_separator`` is deliberately omitted so the constructor default applies.
    html_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=[("h1", "Header 1")],
        max_chunk_size=10,
        separators=[". "],
    )

    # The ". " separator is expected to lead the second chunk.
    expected = [
        Document(page_content=chunk_text, metadata=dict(section_metadata))
        for chunk_text in ("This is some text", ". This is some other text.")
    ]

    assert html_splitter.split_text(html_content) == expected
|
||||
|
||||
|
||||
def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
|
||||
"""Test that regex lookahead separator is not re-inserted when merging."""
|
||||
text = "SCE191 First chunk. SCE103 Second chunk."
|
||||
|
Loading…
Reference in New Issue
Block a user