text-splitters: Add keep_separator arg to HTMLSemanticPreservingSplitter (#31588)

### Description
Add a `keep_separator` argument to `HTMLSemanticPreservingSplitter` and pass its
value to the `RecursiveCharacterTextSplitter` instance it uses under the hood.
### Issue
Documents returned by `HTMLSemanticPreservingSplitter.split_text(text)`
place separators at the beginning of `page_content` by default. [See the third
and fourth documents in the example output from the how-to
guide](https://python.langchain.com/docs/how_to/split_html/#using-htmlsemanticpreservingsplitter):
```
[Document(metadata={'Header 1': 'Main Title'}, page_content='This is an introductory paragraph with some basic content.'),
 Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='This section introduces the topic'),
 Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='. Below is a list: First item Second item Third item with bold text and a link Subsection 1.1: Details This subsection provides additional details'),
 Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content=". Here's a table: Header 1 Header 2 Header 3 Row 1, Cell 1 Row 1, Cell 2 Row 1, Cell 3 Row 2, Cell 1 Row 2, Cell 2 Row 2, Cell 3"),
 Document(metadata={'Header 2': 'Section 2: Media Content'}, page_content='This section contains an image and a video: ![image:example_image_link.mp4](example_image_link.mp4) ![video:example_video_link.mp4](example_video_link.mp4)'),
 Document(metadata={'Header 2': 'Section 3: Code Example'}, page_content='This section contains a code block: <code:html> <div> <p>This is a paragraph inside a div.</p> </div> </code>'),
 Document(metadata={'Header 2': 'Conclusion'}, page_content='This is the conclusion of the document.')]
```
### Dependencies
None

@ttrumper3
This commit is contained in:
Tom-Trumper 2025-06-14 22:56:14 +01:00 committed by GitHub
parent 52e57cdc20
commit 532e6455e9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 152 additions and 1 deletions

View File

@ -10,10 +10,12 @@ from typing import (
Dict, Dict,
Iterable, Iterable,
List, List,
Literal,
Optional, Optional,
Sequence, Sequence,
Tuple, Tuple,
TypedDict, TypedDict,
Union,
cast, cast,
) )
@ -535,6 +537,8 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
preserve_parent_metadata (bool): Whether to pass through parent document preserve_parent_metadata (bool): Whether to pass through parent document
metadata to split documents when calling metadata to split documents when calling
``transform_documents/atransform_documents()``. ``transform_documents/atransform_documents()``.
keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
should be at the beginning of a chunk, at the end, or not at all.
Example: Example:
.. code-block:: python .. code-block:: python
@ -584,6 +588,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
allowlist_tags: Optional[List[str]] = None, allowlist_tags: Optional[List[str]] = None,
denylist_tags: Optional[List[str]] = None, denylist_tags: Optional[List[str]] = None,
preserve_parent_metadata: bool = False, preserve_parent_metadata: bool = False,
keep_separator: Union[bool, Literal["start", "end"]] = True,
): ):
"""Initialize splitter.""" """Initialize splitter."""
try: try:
@ -611,6 +616,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
self._external_metadata = external_metadata or {} self._external_metadata = external_metadata or {}
self._allowlist_tags = allowlist_tags self._allowlist_tags = allowlist_tags
self._preserve_parent_metadata = preserve_parent_metadata self._preserve_parent_metadata = preserve_parent_metadata
self._keep_separator = keep_separator
if allowlist_tags: if allowlist_tags:
self._allowlist_tags = list( self._allowlist_tags = list(
set(allowlist_tags + [header[0] for header in headers_to_split_on]) set(allowlist_tags + [header[0] for header in headers_to_split_on])
@ -625,12 +631,15 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
if separators: if separators:
self._recursive_splitter = RecursiveCharacterTextSplitter( self._recursive_splitter = RecursiveCharacterTextSplitter(
separators=separators, separators=separators,
keep_separator=keep_separator,
chunk_size=max_chunk_size, chunk_size=max_chunk_size,
chunk_overlap=chunk_overlap, chunk_overlap=chunk_overlap,
) )
else: else:
self._recursive_splitter = RecursiveCharacterTextSplitter( self._recursive_splitter = RecursiveCharacterTextSplitter(
chunk_size=max_chunk_size, chunk_overlap=chunk_overlap keep_separator=keep_separator,
chunk_size=max_chunk_size,
chunk_overlap=chunk_overlap,
) )
if self._stopword_removal: if self._stopword_removal:

View File

@ -3375,6 +3375,148 @@ def test_html_splitter_with_media_preservation() -> None:
assert documents == expected assert documents == expected
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_true() -> None:
    """Check that keep_separator=True leaves the separator at the chunk start."""
    html_content = """
<h1>Section 1</h1>
<p>This is some text. This is some other text.</p>
"""
    # max_chunk_size is far smaller than the paragraph, so the text is
    # expected to be broken at the single ". " separator.
    html_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=[("h1", "Header 1")],
        max_chunk_size=10,
        separators=[". "],
        keep_separator=True,
    )

    # With keep_separator=True the second chunk is expected to begin with ". ".
    expected_contents = ("This is some text", ". This is some other text.")
    expected = [
        Document(page_content=content, metadata={"Header 1": "Section 1"})
        for content in expected_contents
    ]

    assert html_splitter.split_text(html_content) == expected
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_false() -> None:
    """Check that keep_separator=False drops the separator from every chunk."""
    html_content = """
<h1>Section 1</h1>
<p>This is some text. This is some other text.</p>
"""
    # max_chunk_size is far smaller than the paragraph, so the text is
    # expected to be broken at the single ". " separator.
    html_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=[("h1", "Header 1")],
        max_chunk_size=10,
        separators=[". "],
        keep_separator=False,
    )

    # Neither chunk is expected to carry the ". " that separated them.
    expected_contents = ("This is some text", "This is some other text.")
    expected = [
        Document(page_content=content, metadata={"Header 1": "Section 1"})
        for content in expected_contents
    ]

    assert html_splitter.split_text(html_content) == expected
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_start() -> None:
    """Check that keep_separator="start" puts the separator at the chunk start."""
    html_content = """
<h1>Section 1</h1>
<p>This is some text. This is some other text.</p>
"""
    # max_chunk_size is far smaller than the paragraph, so the text is
    # expected to be broken at the single ". " separator.
    html_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=[("h1", "Header 1")],
        max_chunk_size=10,
        separators=[". "],
        keep_separator="start",
    )

    # The separator is expected to lead the chunk that follows it.
    expected_contents = ("This is some text", ". This is some other text.")
    expected = [
        Document(page_content=content, metadata={"Header 1": "Section 1"})
        for content in expected_contents
    ]

    assert html_splitter.split_text(html_content) == expected
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_end() -> None:
    """Check that keep_separator="end" keeps the separator on the preceding chunk."""
    html_content = """
<h1>Section 1</h1>
<p>This is some text. This is some other text.</p>
"""
    # max_chunk_size is far smaller than the paragraph, so the text is
    # expected to be broken at the single ". " separator.
    html_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=[("h1", "Header 1")],
        max_chunk_size=10,
        separators=[". "],
        keep_separator="end",
    )

    # The first chunk is expected to end with the "." of the separator.
    expected_contents = ("This is some text.", "This is some other text.")
    expected = [
        Document(page_content=content, metadata={"Header 1": "Section 1"})
        for content in expected_contents
    ]

    assert html_splitter.split_text(html_content) == expected
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_default() -> None:
    """Check the split output when keep_separator is not passed at all."""
    html_content = """
<h1>Section 1</h1>
<p>This is some text. This is some other text.</p>
"""
    # keep_separator is deliberately omitted so the constructor default applies.
    html_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=[("h1", "Header 1")],
        max_chunk_size=10,
        separators=[". "],
    )

    # The expected output has the separator leading the second chunk.
    expected_contents = ("This is some text", ". This is some other text.")
    expected = [
        Document(page_content=content, metadata={"Header 1": "Section 1"})
        for content in expected_contents
    ]

    assert html_splitter.split_text(html_content) == expected
def test_character_text_splitter_discard_regex_separator_on_merge() -> None: def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
"""Test that regex lookahead separator is not re-inserted when merging.""" """Test that regex lookahead separator is not re-inserted when merging."""
text = "SCE191 First chunk. SCE103 Second chunk." text = "SCE191 First chunk. SCE103 Second chunk."