mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-09 06:53:59 +00:00
text-splitters: Add keep_separator arg to HTMLSemanticPreservingSplitter (#31588)
### Description
Add a `keep_separator` argument to `HTMLSemanticPreservingSplitter` and pass its value to the `RecursiveCharacterTextSplitter` instance used under the hood.

### Issue
Documents returned by `HTMLSemanticPreservingSplitter.split_text(text)` place separators at the beginning of `page_content` by default. [See the third and fourth documents in the example output from the how-to guide](https://python.langchain.com/docs/how_to/split_html/#using-htmlsemanticpreservingsplitter):
```
[Document(metadata={'Header 1': 'Main Title'}, page_content='This is an introductory paragraph with some basic content.'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='This section introduces the topic'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='. Below is a list: First item Second item Third item with bold text and a link Subsection 1.1: Details This subsection provides additional details'), Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content=". Here's a table: Header 1 Header 2 Header 3 Row 1, Cell 1 Row 1, Cell 2 Row 1, Cell 3 Row 2, Cell 1 Row 2, Cell 2 Row 2, Cell 3"), Document(metadata={'Header 2': 'Section 2: Media Content'}, page_content='This section contains an image and a video:  '), Document(metadata={'Header 2': 'Section 3: Code Example'}, page_content='This section contains a code block: <code:html> <div> <p>This is a paragraph inside a div.</p> </div> </code>'), Document(metadata={'Header 2': 'Conclusion'}, page_content='This is the conclusion of the document.')]
```

### Dependencies
None

@ttrumper3
This commit is contained in:
@@ -10,10 +10,12 @@ from typing import (
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Literal,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
TypedDict,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
@@ -535,6 +537,8 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
preserve_parent_metadata (bool): Whether to pass through parent document
|
||||
metadata to split documents when calling
|
||||
``transform_documents/atransform_documents()``.
|
||||
keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
|
||||
should be at the beginning of a chunk, at the end, or not at all.
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
@@ -584,6 +588,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
allowlist_tags: Optional[List[str]] = None,
|
||||
denylist_tags: Optional[List[str]] = None,
|
||||
preserve_parent_metadata: bool = False,
|
||||
keep_separator: Union[bool, Literal["start", "end"]] = True,
|
||||
):
|
||||
"""Initialize splitter."""
|
||||
try:
|
||||
@@ -611,6 +616,7 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
self._external_metadata = external_metadata or {}
|
||||
self._allowlist_tags = allowlist_tags
|
||||
self._preserve_parent_metadata = preserve_parent_metadata
|
||||
self._keep_separator = keep_separator
|
||||
if allowlist_tags:
|
||||
self._allowlist_tags = list(
|
||||
set(allowlist_tags + [header[0] for header in headers_to_split_on])
|
||||
@@ -625,12 +631,15 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
||||
if separators:
|
||||
self._recursive_splitter = RecursiveCharacterTextSplitter(
|
||||
separators=separators,
|
||||
keep_separator=keep_separator,
|
||||
chunk_size=max_chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
)
|
||||
else:
|
||||
self._recursive_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=max_chunk_size, chunk_overlap=chunk_overlap
|
||||
keep_separator=keep_separator,
|
||||
chunk_size=max_chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
)
|
||||
|
||||
if self._stopword_removal:
|
||||
|
Reference in New Issue
Block a user