mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 16:43:35 +00:00
text-splitters: Add keep_separator arg to HTMLSemanticPreservingSplitter (#31588)
### Description Add a `keep_separator` argument to `HTMLSemanticPreservingSplitter` and pass its value to the `RecursiveCharacterTextSplitter` instance used under the hood. ### Issue Documents returned by `HTMLSemanticPreservingSplitter.split_text(text)` default to placing separators at the beginning of `page_content`. [See the third and fourth documents in the example output from the how-to guide](https://python.langchain.com/docs/how_to/split_html/#using-htmlsemanticpreservingsplitter): ``` [Document(metadata={'Header 1': 'Main Title'}, page_content='This is an introductory paragraph with some basic content.'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='This section introduces the topic'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='. Below is a list: First item Second item Third item with bold text and a link Subsection 1.1: Details This subsection provides additional details'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content=". Here's a table: Header 1 Header 2 Header 3 Row 1, Cell 1 Row 1, Cell 2 Row 1, Cell 3 Row 2, Cell 1 Row 2, Cell 2 Row 2, Cell 3"), Document(metadata={'Header 2': 'Section 2: Media Content'}, page_content='This section contains an image and a video:  '), Document(metadata={'Header 2': 'Section 3: Code Example'}, page_content='This section contains a code block: <code:html> <div> <p>This is a paragraph inside a div.</p> </div> </code>'), Document(metadata={'Header 2': 'Conclusion'}, page_content='This is the conclusion of the document.')] ``` ### Dependencies None @ttrumper3
This commit is contained in:
parent
52e57cdc20
commit
532e6455e9
@ -10,10 +10,12 @@ from typing import (
|
|||||||
Dict,
|
Dict,
|
||||||
Iterable,
|
Iterable,
|
||||||
List,
|
List,
|
||||||
|
Literal,
|
||||||
Optional,
|
Optional,
|
||||||
Sequence,
|
Sequence,
|
||||||
Tuple,
|
Tuple,
|
||||||
TypedDict,
|
TypedDict,
|
||||||
|
Union,
|
||||||
cast,
|
cast,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -535,6 +537,8 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
|||||||
preserve_parent_metadata (bool): Whether to pass through parent document
|
preserve_parent_metadata (bool): Whether to pass through parent document
|
||||||
metadata to split documents when calling
|
metadata to split documents when calling
|
||||||
``transform_documents/atransform_documents()``.
|
``transform_documents/atransform_documents()``.
|
||||||
|
keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
|
||||||
|
should be at the beginning of a chunk, at the end, or not at all.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
@ -584,6 +588,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
|||||||
allowlist_tags: Optional[List[str]] = None,
|
allowlist_tags: Optional[List[str]] = None,
|
||||||
denylist_tags: Optional[List[str]] = None,
|
denylist_tags: Optional[List[str]] = None,
|
||||||
preserve_parent_metadata: bool = False,
|
preserve_parent_metadata: bool = False,
|
||||||
|
keep_separator: Union[bool, Literal["start", "end"]] = True,
|
||||||
):
|
):
|
||||||
"""Initialize splitter."""
|
"""Initialize splitter."""
|
||||||
try:
|
try:
|
||||||
@ -611,6 +616,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
|||||||
self._external_metadata = external_metadata or {}
|
self._external_metadata = external_metadata or {}
|
||||||
self._allowlist_tags = allowlist_tags
|
self._allowlist_tags = allowlist_tags
|
||||||
self._preserve_parent_metadata = preserve_parent_metadata
|
self._preserve_parent_metadata = preserve_parent_metadata
|
||||||
|
self._keep_separator = keep_separator
|
||||||
if allowlist_tags:
|
if allowlist_tags:
|
||||||
self._allowlist_tags = list(
|
self._allowlist_tags = list(
|
||||||
set(allowlist_tags + [header[0] for header in headers_to_split_on])
|
set(allowlist_tags + [header[0] for header in headers_to_split_on])
|
||||||
@ -625,12 +631,15 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
|||||||
if separators:
|
if separators:
|
||||||
self._recursive_splitter = RecursiveCharacterTextSplitter(
|
self._recursive_splitter = RecursiveCharacterTextSplitter(
|
||||||
separators=separators,
|
separators=separators,
|
||||||
|
keep_separator=keep_separator,
|
||||||
chunk_size=max_chunk_size,
|
chunk_size=max_chunk_size,
|
||||||
chunk_overlap=chunk_overlap,
|
chunk_overlap=chunk_overlap,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self._recursive_splitter = RecursiveCharacterTextSplitter(
|
self._recursive_splitter = RecursiveCharacterTextSplitter(
|
||||||
chunk_size=max_chunk_size, chunk_overlap=chunk_overlap
|
keep_separator=keep_separator,
|
||||||
|
chunk_size=max_chunk_size,
|
||||||
|
chunk_overlap=chunk_overlap,
|
||||||
)
|
)
|
||||||
|
|
||||||
if self._stopword_removal:
|
if self._stopword_removal:
|
||||||
|
@ -3375,6 +3375,148 @@ def test_html_splitter_with_media_preservation() -> None:
|
|||||||
assert documents == expected
|
assert documents == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
|
||||||
|
def test_html_splitter_keep_separator_true() -> None:
|
||||||
|
"""Test HTML splitting with keep_separator=True"""
|
||||||
|
html_content = """
|
||||||
|
<h1>Section 1</h1>
|
||||||
|
<p>This is some text. This is some other text.</p>
|
||||||
|
"""
|
||||||
|
splitter = HTMLSemanticPreservingSplitter(
|
||||||
|
headers_to_split_on=[("h1", "Header 1")],
|
||||||
|
max_chunk_size=10,
|
||||||
|
separators=[". "],
|
||||||
|
keep_separator=True,
|
||||||
|
)
|
||||||
|
documents = splitter.split_text(html_content)
|
||||||
|
|
||||||
|
expected = [
|
||||||
|
Document(
|
||||||
|
page_content="This is some text",
|
||||||
|
metadata={"Header 1": "Section 1"},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content=". This is some other text.",
|
||||||
|
metadata={"Header 1": "Section 1"},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert documents == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
|
||||||
|
def test_html_splitter_keep_separator_false() -> None:
|
||||||
|
"""Test HTML splitting with keep_separator=False"""
|
||||||
|
html_content = """
|
||||||
|
<h1>Section 1</h1>
|
||||||
|
<p>This is some text. This is some other text.</p>
|
||||||
|
"""
|
||||||
|
splitter = HTMLSemanticPreservingSplitter(
|
||||||
|
headers_to_split_on=[("h1", "Header 1")],
|
||||||
|
max_chunk_size=10,
|
||||||
|
separators=[". "],
|
||||||
|
keep_separator=False,
|
||||||
|
)
|
||||||
|
documents = splitter.split_text(html_content)
|
||||||
|
|
||||||
|
expected = [
|
||||||
|
Document(
|
||||||
|
page_content="This is some text",
|
||||||
|
metadata={"Header 1": "Section 1"},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content="This is some other text.",
|
||||||
|
metadata={"Header 1": "Section 1"},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert documents == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
|
||||||
|
def test_html_splitter_keep_separator_start() -> None:
|
||||||
|
"""Test HTML splitting with keep_separator="start" """
|
||||||
|
html_content = """
|
||||||
|
<h1>Section 1</h1>
|
||||||
|
<p>This is some text. This is some other text.</p>
|
||||||
|
"""
|
||||||
|
splitter = HTMLSemanticPreservingSplitter(
|
||||||
|
headers_to_split_on=[("h1", "Header 1")],
|
||||||
|
max_chunk_size=10,
|
||||||
|
separators=[". "],
|
||||||
|
keep_separator="start",
|
||||||
|
)
|
||||||
|
documents = splitter.split_text(html_content)
|
||||||
|
|
||||||
|
expected = [
|
||||||
|
Document(
|
||||||
|
page_content="This is some text",
|
||||||
|
metadata={"Header 1": "Section 1"},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content=". This is some other text.",
|
||||||
|
metadata={"Header 1": "Section 1"},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert documents == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
|
||||||
|
def test_html_splitter_keep_separator_end() -> None:
|
||||||
|
"""Test HTML splitting with keep_separator="end" """
|
||||||
|
html_content = """
|
||||||
|
<h1>Section 1</h1>
|
||||||
|
<p>This is some text. This is some other text.</p>
|
||||||
|
"""
|
||||||
|
splitter = HTMLSemanticPreservingSplitter(
|
||||||
|
headers_to_split_on=[("h1", "Header 1")],
|
||||||
|
max_chunk_size=10,
|
||||||
|
separators=[". "],
|
||||||
|
keep_separator="end",
|
||||||
|
)
|
||||||
|
documents = splitter.split_text(html_content)
|
||||||
|
|
||||||
|
expected = [
|
||||||
|
Document(
|
||||||
|
page_content="This is some text.",
|
||||||
|
metadata={"Header 1": "Section 1"},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content="This is some other text.",
|
||||||
|
metadata={"Header 1": "Section 1"},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert documents == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
|
||||||
|
def test_html_splitter_keep_separator_default() -> None:
|
||||||
|
"""Test HTML splitting with keep_separator not set"""
|
||||||
|
html_content = """
|
||||||
|
<h1>Section 1</h1>
|
||||||
|
<p>This is some text. This is some other text.</p>
|
||||||
|
"""
|
||||||
|
splitter = HTMLSemanticPreservingSplitter(
|
||||||
|
headers_to_split_on=[("h1", "Header 1")], max_chunk_size=10, separators=[". "]
|
||||||
|
)
|
||||||
|
documents = splitter.split_text(html_content)
|
||||||
|
|
||||||
|
expected = [
|
||||||
|
Document(
|
||||||
|
page_content="This is some text",
|
||||||
|
metadata={"Header 1": "Section 1"},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content=". This is some other text.",
|
||||||
|
metadata={"Header 1": "Section 1"},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert documents == expected
|
||||||
|
|
||||||
|
|
||||||
def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
|
def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
|
||||||
"""Test that regex lookahead separator is not re-inserted when merging."""
|
"""Test that regex lookahead separator is not re-inserted when merging."""
|
||||||
text = "SCE191 First chunk. SCE103 Second chunk."
|
text = "SCE191 First chunk. SCE103 Second chunk."
|
||||||
|
Loading…
Reference in New Issue
Block a user