diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 405055b03b5..a917f6a9c34 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -10,10 +10,12 @@ from typing import ( Dict, Iterable, List, + Literal, Optional, Sequence, Tuple, TypedDict, + Union, cast, ) @@ -535,6 +537,8 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer): preserve_parent_metadata (bool): Whether to pass through parent document metadata to split documents when calling ``transform_documents/atransform_documents()``. + keep_separator (Union[bool, Literal["start", "end"]]): Whether separators + should be at the beginning of a chunk, at the end, or not at all. Example: .. code-block:: python @@ -584,6 +588,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer): allowlist_tags: Optional[List[str]] = None, denylist_tags: Optional[List[str]] = None, preserve_parent_metadata: bool = False, + keep_separator: Union[bool, Literal["start", "end"]] = True, ): """Initialize splitter.""" try: @@ -611,6 +616,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer): self._external_metadata = external_metadata or {} self._allowlist_tags = allowlist_tags self._preserve_parent_metadata = preserve_parent_metadata + self._keep_separator = keep_separator if allowlist_tags: self._allowlist_tags = list( set(allowlist_tags + [header[0] for header in headers_to_split_on]) @@ -625,12 +631,15 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer): if separators: self._recursive_splitter = RecursiveCharacterTextSplitter( separators=separators, + keep_separator=keep_separator, chunk_size=max_chunk_size, chunk_overlap=chunk_overlap, ) else: self._recursive_splitter = RecursiveCharacterTextSplitter( - chunk_size=max_chunk_size, chunk_overlap=chunk_overlap + keep_separator=keep_separator, + chunk_size=max_chunk_size, + 
chunk_overlap=chunk_overlap, ) if self._stopword_removal: diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index bc94dca89e9..85c6caace15 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -3375,6 +3375,148 @@ def test_html_splitter_with_media_preservation() -> None: assert documents == expected +@pytest.mark.requires("bs4") +def test_html_splitter_keep_separator_true() -> None: + """Test HTML splitting with keep_separator=True""" + html_content = """ +

<h1>Section 1</h1>

+

<p>This is some text. This is some other text.</p>

+ """ + splitter = HTMLSemanticPreservingSplitter( + headers_to_split_on=[("h1", "Header 1")], + max_chunk_size=10, + separators=[". "], + keep_separator=True, + ) + documents = splitter.split_text(html_content) + + expected = [ + Document( + page_content="This is some text", + metadata={"Header 1": "Section 1"}, + ), + Document( + page_content=". This is some other text.", + metadata={"Header 1": "Section 1"}, + ), + ] + + assert documents == expected + + +@pytest.mark.requires("bs4") +def test_html_splitter_keep_separator_false() -> None: + """Test HTML splitting with keep_separator=False""" + html_content = """ +

<h1>Section 1</h1>

+

<p>This is some text. This is some other text.</p>

+ """ + splitter = HTMLSemanticPreservingSplitter( + headers_to_split_on=[("h1", "Header 1")], + max_chunk_size=10, + separators=[". "], + keep_separator=False, + ) + documents = splitter.split_text(html_content) + + expected = [ + Document( + page_content="This is some text", + metadata={"Header 1": "Section 1"}, + ), + Document( + page_content="This is some other text.", + metadata={"Header 1": "Section 1"}, + ), + ] + + assert documents == expected + + +@pytest.mark.requires("bs4") +def test_html_splitter_keep_separator_start() -> None: + """Test HTML splitting with keep_separator="start" """ + html_content = """ +

<h1>Section 1</h1>

+

<p>This is some text. This is some other text.</p>

+ """ + splitter = HTMLSemanticPreservingSplitter( + headers_to_split_on=[("h1", "Header 1")], + max_chunk_size=10, + separators=[". "], + keep_separator="start", + ) + documents = splitter.split_text(html_content) + + expected = [ + Document( + page_content="This is some text", + metadata={"Header 1": "Section 1"}, + ), + Document( + page_content=". This is some other text.", + metadata={"Header 1": "Section 1"}, + ), + ] + + assert documents == expected + + +@pytest.mark.requires("bs4") +def test_html_splitter_keep_separator_end() -> None: + """Test HTML splitting with keep_separator="end" """ + html_content = """ +

<h1>Section 1</h1>

+

<p>This is some text. This is some other text.</p>

+ """ + splitter = HTMLSemanticPreservingSplitter( + headers_to_split_on=[("h1", "Header 1")], + max_chunk_size=10, + separators=[". "], + keep_separator="end", + ) + documents = splitter.split_text(html_content) + + expected = [ + Document( + page_content="This is some text.", + metadata={"Header 1": "Section 1"}, + ), + Document( + page_content="This is some other text.", + metadata={"Header 1": "Section 1"}, + ), + ] + + assert documents == expected + + +@pytest.mark.requires("bs4") +def test_html_splitter_keep_separator_default() -> None: + """Test HTML splitting with keep_separator not set""" + html_content = """ +

<h1>Section 1</h1>

+

<p>This is some text. This is some other text.</p>

+ """ + splitter = HTMLSemanticPreservingSplitter( + headers_to_split_on=[("h1", "Header 1")], max_chunk_size=10, separators=[". "] + ) + documents = splitter.split_text(html_content) + + expected = [ + Document( + page_content="This is some text", + metadata={"Header 1": "Section 1"}, + ), + Document( + page_content=". This is some other text.", + metadata={"Header 1": "Section 1"}, + ), + ] + + assert documents == expected + + def test_character_text_splitter_discard_regex_separator_on_merge() -> None: """Test that regex lookahead separator is not re-inserted when merging.""" text = "SCE191 First chunk. SCE103 Second chunk."