text-splitters: Add keep_separator arg to HTMLSemanticPreservingSplitter (#31588)

### Description Add keep_separator arg to HTMLSemanticPreservingSplitter and pass value to instance of RecursiveCharacterTextSplitter used under the hood. ### Issue Documents returned by `HTMLSemanticPreservingSplitter.split_text(text)` are defaulted to use separators at beginning of page_content. [See third and fourth document in example output from how-to guide](https://python.langchain.com/docs/how_to/split_html/#using-htmlsemanticpreservingsplitter): ``` [Document(metadata={'Header 1': 'Main Title'}, page_content='This is an introductory paragraph with some basic content.'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='This section introduces the topic'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='. Below is a list: First item Second item Third item with bold text and a link Subsection 1.1: Details This subsection provides additional details'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content=". Here's a table: Header 1 Header 2 Header 3 Row 1, Cell 1 Row 1, Cell 2 Row 1, Cell 3 Row 2, Cell 1 Row 2, Cell 2 Row 2, Cell 3"), Document(metadata={'Header 2': 'Section 2: Media Content'}, page_content='This section contains an image and a video: ![image:example_image_link.mp4](example_image_link.mp4) ![video:example_video_link.mp4](example_video_link.mp4)'), Document(metadata={'Header 2': 'Section 3: Code Example'}, page_content='This section contains a code block: <code:html> <div> <p>This is a paragraph inside a div.</p> </div> </code>'), Document(metadata={'Header 2': 'Conclusion'}, page_content='This is the conclusion of the document.')] ``` ### Dependencies None @ttrumper3
2025-09-28 15:00:23 +00:00 · 2025-06-14 22:56:14 +01:00
parent 52e57cdc20
commit 532e6455e9
2 changed files with 152 additions and 1 deletions
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -10,10 +10,12 @@ from typing import (
    Dict,
    Iterable,
    List,
+    Literal,
    Optional,
    Sequence,
    Tuple,
    TypedDict,
+    Union,
    cast,
 )

@@ -535,6 +537,8 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
        preserve_parent_metadata (bool): Whether to pass through parent document
            metadata to split documents when calling
            ``transform_documents/atransform_documents()``.
+        keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
+            should be at the beginning of a chunk, at the end, or not at all.

    Example:
        .. code-block:: python
@@ -584,6 +588,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
        allowlist_tags: Optional[List[str]] = None,
        denylist_tags: Optional[List[str]] = None,
        preserve_parent_metadata: bool = False,
+        keep_separator: Union[bool, Literal["start", "end"]] = True,
    ):
        """Initialize splitter."""
        try:
@@ -611,6 +616,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
        self._external_metadata = external_metadata or {}
        self._allowlist_tags = allowlist_tags
        self._preserve_parent_metadata = preserve_parent_metadata
+        self._keep_separator = keep_separator
        if allowlist_tags:
            self._allowlist_tags = list(
                set(allowlist_tags + [header[0] for header in headers_to_split_on])
@@ -625,12 +631,15 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
        if separators:
            self._recursive_splitter = RecursiveCharacterTextSplitter(
                separators=separators,
+                keep_separator=keep_separator,
                chunk_size=max_chunk_size,
                chunk_overlap=chunk_overlap,
            )
        else:
            self._recursive_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=max_chunk_size, chunk_overlap=chunk_overlap
+                keep_separator=keep_separator,
+                chunk_size=max_chunk_size,
+                chunk_overlap=chunk_overlap,
            )

        if self._stopword_removal:
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -3375,6 +3375,148 @@ def test_html_splitter_with_media_preservation() -> None:
    assert documents == expected


+@pytest.mark.requires("bs4")
+def test_html_splitter_keep_separator_true() -> None:
+    """Test HTML splitting with keep_separator=True"""
+    html_content = """
+    <h1>Section 1</h1>
+    <p>This is some text. This is some other text.</p>
+    """
+    splitter = HTMLSemanticPreservingSplitter(
+        headers_to_split_on=[("h1", "Header 1")],
+        max_chunk_size=10,
+        separators=[". "],
+        keep_separator=True,
+    )
+    documents = splitter.split_text(html_content)
+
+    expected = [
+        Document(
+            page_content="This is some text",
+            metadata={"Header 1": "Section 1"},
+        ),
+        Document(
+            page_content=". This is some other text.",
+            metadata={"Header 1": "Section 1"},
+        ),
+    ]
+
+    assert documents == expected
+
+
+@pytest.mark.requires("bs4")
+def test_html_splitter_keep_separator_false() -> None:
+    """Test HTML splitting with keep_separator=False"""
+    html_content = """
+    <h1>Section 1</h1>
+    <p>This is some text. This is some other text.</p>
+    """
+    splitter = HTMLSemanticPreservingSplitter(
+        headers_to_split_on=[("h1", "Header 1")],
+        max_chunk_size=10,
+        separators=[". "],
+        keep_separator=False,
+    )
+    documents = splitter.split_text(html_content)
+
+    expected = [
+        Document(
+            page_content="This is some text",
+            metadata={"Header 1": "Section 1"},
+        ),
+        Document(
+            page_content="This is some other text.",
+            metadata={"Header 1": "Section 1"},
+        ),
+    ]
+
+    assert documents == expected
+
+
+@pytest.mark.requires("bs4")
+def test_html_splitter_keep_separator_start() -> None:
+    """Test HTML splitting with keep_separator="start" """
+    html_content = """
+    <h1>Section 1</h1>
+    <p>This is some text. This is some other text.</p>
+    """
+    splitter = HTMLSemanticPreservingSplitter(
+        headers_to_split_on=[("h1", "Header 1")],
+        max_chunk_size=10,
+        separators=[". "],
+        keep_separator="start",
+    )
+    documents = splitter.split_text(html_content)
+
+    expected = [
+        Document(
+            page_content="This is some text",
+            metadata={"Header 1": "Section 1"},
+        ),
+        Document(
+            page_content=". This is some other text.",
+            metadata={"Header 1": "Section 1"},
+        ),
+    ]
+
+    assert documents == expected
+
+
+@pytest.mark.requires("bs4")
+def test_html_splitter_keep_separator_end() -> None:
+    """Test HTML splitting with keep_separator="end" """
+    html_content = """
+    <h1>Section 1</h1>
+    <p>This is some text. This is some other text.</p>
+    """
+    splitter = HTMLSemanticPreservingSplitter(
+        headers_to_split_on=[("h1", "Header 1")],
+        max_chunk_size=10,
+        separators=[". "],
+        keep_separator="end",
+    )
+    documents = splitter.split_text(html_content)
+
+    expected = [
+        Document(
+            page_content="This is some text.",
+            metadata={"Header 1": "Section 1"},
+        ),
+        Document(
+            page_content="This is some other text.",
+            metadata={"Header 1": "Section 1"},
+        ),
+    ]
+
+    assert documents == expected
+
+
+@pytest.mark.requires("bs4")
+def test_html_splitter_keep_separator_default() -> None:
+    """Test HTML splitting with keep_separator not set"""
+    html_content = """
+    <h1>Section 1</h1>
+    <p>This is some text. This is some other text.</p>
+    """
+    splitter = HTMLSemanticPreservingSplitter(
+        headers_to_split_on=[("h1", "Header 1")], max_chunk_size=10, separators=[". "]
+    )
+    documents = splitter.split_text(html_content)
+
+    expected = [
+        Document(
+            page_content="This is some text",
+            metadata={"Header 1": "Section 1"},
+        ),
+        Document(
+            page_content=". This is some other text.",
+            metadata={"Header 1": "Section 1"},
+        ),
+    ]
+
+    assert documents == expected
+
+
 def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
    """Test that regex lookahead separator is not re-inserted when merging."""
    text = "SCE191 First chunk. SCE103 Second chunk."