text-splitters: Add keep_separator arg to HTMLSemanticPreservingSplitter (#31588)

### Description

Add a `keep_separator` argument to `HTMLSemanticPreservingSplitter` and pass its value to the `RecursiveCharacterTextSplitter` instance used under the hood.

### Issue

By default, documents returned by `HTMLSemanticPreservingSplitter.split_text(text)` keep separators at the beginning of `page_content`. [See the third and fourth documents in the example output from the how-to guide](https://python.langchain.com/docs/how_to/split_html/#using-htmlsemanticpreservingsplitter):

```
[Document(metadata={'Header 1': 'Main Title'}, page_content='This is an introductory paragraph with some basic content.'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='This section introduces the topic'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='. Below is a list: First item Second item Third item with bold text and a link Subsection 1.1: Details This subsection provides additional details'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content=". Here's a table: Header 1 Header 2 Header 3 Row 1, Cell 1 Row 1, Cell 2 Row 1, Cell 3 Row 2, Cell 1 Row 2, Cell 2 Row 2, Cell 3"), Document(metadata={'Header 2': 'Section 2: Media Content'}, page_content='This section contains an image and a video: ![image:example_image_link.mp4](example_image_link.mp4) ![video:example_video_link.mp4](example_video_link.mp4)'), Document(metadata={'Header 2': 'Section 3: Code Example'}, page_content='This section contains a code block: <code:html> <div> <p>This is a paragraph inside a div.</p> </div> </code>'), Document(metadata={'Header 2': 'Conclusion'}, page_content='This is the conclusion of the document.')]
```

### Dependencies

None

@ttrumper3
2025-09-09 06:53:59 +00:00 · 2025-06-14 22:56:14 +01:00
parent 52e57cdc20
commit 532e6455e9
2 changed files with 152 additions and 1 deletions
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -10,10 +10,12 @@ from typing import (
    Dict,
    Iterable,
    List,
+    Literal,
    Optional,
    Sequence,
    Tuple,
    TypedDict,
+    Union,
    cast,
 )

@@ -535,6 +537,8 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
        preserve_parent_metadata (bool): Whether to pass through parent document
            metadata to split documents when calling
            ``transform_documents/atransform_documents()``.
+        keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
+            should be at the beginning of a chunk, at the end, or not at all.

    Example:
        .. code-block:: python
@@ -584,6 +588,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
        allowlist_tags: Optional[List[str]] = None,
        denylist_tags: Optional[List[str]] = None,
        preserve_parent_metadata: bool = False,
+        keep_separator: Union[bool, Literal["start", "end"]] = True,
    ):
        """Initialize splitter."""
        try:
@@ -611,6 +616,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
        self._external_metadata = external_metadata or {}
        self._allowlist_tags = allowlist_tags
        self._preserve_parent_metadata = preserve_parent_metadata
+        self._keep_separator = keep_separator
        if allowlist_tags:
            self._allowlist_tags = list(
                set(allowlist_tags + [header[0] for header in headers_to_split_on])
@@ -625,12 +631,15 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
        if separators:
            self._recursive_splitter = RecursiveCharacterTextSplitter(
                separators=separators,
+                keep_separator=keep_separator,
                chunk_size=max_chunk_size,
                chunk_overlap=chunk_overlap,
            )
        else:
            self._recursive_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=max_chunk_size, chunk_overlap=chunk_overlap
+                keep_separator=keep_separator,
+                chunk_size=max_chunk_size,
+                chunk_overlap=chunk_overlap,
            )

        if self._stopword_removal: