docs(text-splitters): fix some docstrings (#32767)

This commit is contained in:
Christophe Bornet
2025-08-31 20:46:11 +02:00
committed by GitHub
parent fcf7175392
commit e0a4af8d8b
4 changed files with 99 additions and 106 deletions

View File

@@ -53,23 +53,8 @@ class HTMLHeaderTextSplitter:
gracefully handles multiple levels of nested headers, creating a rich,
hierarchical representation of the content.
Args:
headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag,
header_name) pairs representing the headers that define splitting
boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
will split content by <h1> and <h2> tags, assigning their textual
content to the Document metadata.
return_each_element (bool): If True, every HTML element encountered
(including headers, paragraphs, etc.) is returned as a separate
Document. If False, content under the same header hierarchy is
aggregated into fewer Documents.
Returns:
List[Document]: A list of Document objects. Each Document contains
`page_content` holding the extracted text and `metadata` that maps
the header hierarchy to their corresponding titles.
Example:
.. code-block:: python
from langchain_text_splitters.html_header_text_splitter import (
@@ -123,10 +108,15 @@ class HTMLHeaderTextSplitter:
"""Initialize with headers to split on.
Args:
headers_to_split_on: A list of tuples where
each tuple contains a header tag and its corresponding value.
return_each_element: Whether to return each HTML
element as a separate Document. Defaults to False.
headers_to_split_on: A list of (header_tag,
header_name) pairs representing the headers that define splitting
boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
will split content by <h1> and <h2> tags, assigning their textual
content to the Document metadata.
return_each_element: If True, every HTML element encountered
(including headers, paragraphs, etc.) is returned as a separate
Document. If False, content under the same header hierarchy is
aggregated into fewer Documents.
"""
# Sort headers by their numeric level so that h1 < h2 < h3...
self.headers_to_split_on = sorted(
@@ -143,7 +133,9 @@ class HTMLHeaderTextSplitter:
text: The HTML text to split.
Returns:
A list of split Document objects.
A list of split Document objects. Each Document contains
``page_content`` holding the extracted text and ``metadata`` that maps
each header in the hierarchy to its corresponding title.
"""
return self.split_text_from_file(StringIO(text))
@@ -158,7 +150,9 @@ class HTMLHeaderTextSplitter:
**kwargs: Additional keyword arguments for the request.
Returns:
A list of split Document objects.
A list of split Document objects. Each Document contains
``page_content`` holding the extracted text and ``metadata`` that maps
each header in the hierarchy to its corresponding title.
Raises:
requests.RequestException: If the HTTP request fails.
@@ -179,7 +173,9 @@ class HTMLHeaderTextSplitter:
file: A file path or a file-like object containing HTML content.
Returns:
A list of split Document objects.
A list of split Document objects. Each Document contains
``page_content`` holding the extracted text and ``metadata`` that maps
each header in the hierarchy to its corresponding title.
"""
if isinstance(file, str):
with open(file, encoding="utf-8") as f:
@@ -384,10 +380,11 @@ class HTMLSectionSplitter:
Returns:
List[Dict[str, Optional[str]]]: A list of dictionaries representing
sections.
Each dictionary contains:
- 'header': The header text or a default title for the first section.
- 'content': The content under the header.
- 'tag_name': The name of the header tag (e.g., "h1", "h2").
Each dictionary contains:
* 'header': The header text or a default title for the first section.
* 'content': The content under the header.
* 'tag_name': The name of the header tag (e.g., "h1", "h2").
"""
try:
from bs4 import BeautifulSoup
@@ -508,40 +505,6 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
.. versionadded:: 0.3.5
Args:
headers_to_split_on (List[Tuple[str, str]]): HTML headers (e.g., "h1", "h2")
that define content sections.
max_chunk_size (int): Maximum size for each chunk, with allowance for
exceeding this limit to preserve semantics.
chunk_overlap (int): Number of characters to overlap between chunks to ensure
contextual continuity.
separators (List[str]): Delimiters used by RecursiveCharacterTextSplitter for
further splitting.
elements_to_preserve (List[str]): HTML tags (e.g., <table>, <ul>) to remain
intact during splitting.
preserve_links (bool): Converts <a> tags to Markdown links ([text](url)).
preserve_images (bool): Converts <img> tags to Markdown images (![alt](src)).
preserve_videos (bool): Converts <video> tags to Markdown
video links (![video](src)).
preserve_audio (bool): Converts <audio> tags to Markdown
audio links (![audio](src)).
custom_handlers (Dict[str, Callable[[Any], str]]): Optional custom handlers for
specific HTML tags, allowing tailored extraction or processing.
stopword_removal (bool): Optionally remove stopwords from the text.
stopword_lang (str): The language of stopwords to remove.
normalize_text (bool): Optionally normalize text
(e.g., lowercasing, removing punctuation).
external_metadata (Optional[Dict[str, str]]): Additional metadata to attach to
the Document objects.
allowlist_tags (Optional[List[str]]): Only these tags will be retained in
the HTML.
denylist_tags (Optional[List[str]]): These tags will be removed from the HTML.
preserve_parent_metadata (bool): Whether to pass through parent document
metadata to split documents when calling
``transform_documents/atransform_documents()``.
keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
should be at the beginning of a chunk, at the end, or not at all.
Example:
.. code-block:: python
@@ -593,7 +556,42 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
preserve_parent_metadata: bool = False,
keep_separator: Union[bool, Literal["start", "end"]] = True,
) -> None:
"""Initialize splitter."""
"""Initialize splitter.
Args:
headers_to_split_on: HTML headers (e.g., "h1", "h2")
that define content sections.
max_chunk_size: Maximum size for each chunk, with allowance for
exceeding this limit to preserve semantics.
chunk_overlap: Number of characters to overlap between chunks to ensure
contextual continuity.
separators: Delimiters used by RecursiveCharacterTextSplitter for
further splitting.
elements_to_preserve: HTML tags (e.g., <table>, <ul>) to remain
intact during splitting.
preserve_links: Converts <a> tags to Markdown links ([text](url)).
preserve_images: Converts <img> tags to Markdown images (![alt](src)).
preserve_videos: Converts <video> tags to Markdown
video links (![video](src)).
preserve_audio: Converts <audio> tags to Markdown
audio links (![audio](src)).
custom_handlers: Optional custom handlers for
specific HTML tags, allowing tailored extraction or processing.
stopword_removal: Optionally remove stopwords from the text.
stopword_lang: The language of stopwords to remove.
normalize_text: Optionally normalize text
(e.g., lowercasing, removing punctuation).
external_metadata: Additional metadata to attach to
the Document objects.
allowlist_tags: Only these tags will be retained in
the HTML.
denylist_tags: These tags will be removed from the HTML.
preserve_parent_metadata: Whether to pass through parent document
metadata to split documents when calling
``transform_documents()`` / ``atransform_documents()``.
keep_separator: Whether separators
should be at the beginning of a chunk, at the end, or not at all.
"""
try:
from bs4 import BeautifulSoup, Tag

View File

@@ -14,31 +14,27 @@ class RecursiveJsonSplitter:
JSON-formatted strings based on configurable maximum and minimum chunk sizes.
It supports nested JSON structures, optionally converts lists into dictionaries
for better chunking, and allows the creation of document objects for further use.
Attributes:
max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
min_chunk_size (int): The minimum size for each chunk, derived from
`max_chunk_size` if not explicitly provided.
"""
max_chunk_size: int = 2000
"""The maximum size for each chunk. Defaults to 2000."""
min_chunk_size: int = 1800
"""The minimum size for each chunk, derived from ``max_chunk_size`` if not
explicitly provided."""
def __init__(
self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
) -> None:
"""Initialize the chunk size configuration for text processing.
This constructor sets up the maximum and minimum chunk sizes, ensuring that
the `min_chunk_size` defaults to a value slightly smaller than the
`max_chunk_size` if not explicitly provided.
the ``min_chunk_size`` defaults to a value slightly smaller than the
``max_chunk_size`` if not explicitly provided.
Args:
max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
max_chunk_size: The maximum size for a chunk. Defaults to 2000.
min_chunk_size: The minimum size for a chunk. If None,
defaults to the maximum chunk size minus 200, with a lower bound of 50.
Attributes:
max_chunk_size (int): The configured maximum size for each chunk.
min_chunk_size (int): The configured minimum size for each chunk, derived
from `max_chunk_size` if not explicitly provided.
"""
super().__init__()
self.max_chunk_size = max_chunk_size

View File

@@ -9,13 +9,15 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
This splitter extends RecursiveCharacterTextSplitter to handle
React (JSX), Vue, and Svelte code by:
1. Detecting and extracting custom component tags from the text
2. Using those tags as additional separators along with standard JS syntax
The splitter combines:
- Custom component tags as separators (e.g. <Component, <div)
- JavaScript syntax elements (function, const, if, etc)
- Standard text splitting on newlines
* Custom component tags as separators (e.g. <Component, <div)
* JavaScript syntax elements (function, const, if, etc.)
* Standard text splitting on newlines
This allows chunks to break at natural boundaries in
React, Vue, and Svelte component code.
@@ -43,9 +45,10 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
"""Split text into chunks.
This method splits the text into chunks by:
- Extracting unique opening component tags using regex
- Creating separators list with extracted tags and JS separators
- Splitting the text using the separators by calling the parent class method
* Extracting unique opening component tags using regex
* Creating separators list with extracted tags and JS separators
* Splitting the text using the separators by calling the parent class method
Args:
text: String containing code to split

View File

@@ -289,32 +289,28 @@ class ExperimentalMarkdownSyntaxTextSplitter:
additional features.
Key Features:
- Retains the original whitespace and formatting of the Markdown text.
- Extracts headers, code blocks, and horizontal rules as metadata.
- Splits out code blocks and includes the language in the "Code" metadata key.
- Splits text on horizontal rules (`---`) as well.
- Defaults to sensible splitting behavior, which can be overridden using the
`headers_to_split_on` parameter.
Parameters:
----------
headers_to_split_on : List[Tuple[str, str]], optional
Headers to split on, defaulting to common Markdown headers if not specified.
return_each_line : bool, optional
When set to True, returns each line as a separate chunk. Default is False.
* Retains the original whitespace and formatting of the Markdown text.
* Extracts headers, code blocks, and horizontal rules as metadata.
* Splits out code blocks and includes the language in the "Code" metadata key.
* Splits text on horizontal rules (``---``) as well.
* Defaults to sensible splitting behavior, which can be overridden using the
``headers_to_split_on`` parameter.
Usage example:
--------------
>>> headers_to_split_on = [
>>> ("#", "Header 1"),
>>> ("##", "Header 2"),
>>> ]
>>> splitter = ExperimentalMarkdownSyntaxTextSplitter(
>>> headers_to_split_on=headers_to_split_on
>>> )
>>> chunks = splitter.split(text)
>>> for chunk in chunks:
>>> print(chunk)
Example:
.. code-block:: python
headers_to_split_on = [
("#", "Header 1"),
("##", "Header 2"),
]
splitter = ExperimentalMarkdownSyntaxTextSplitter(
headers_to_split_on=headers_to_split_on
)
chunks = splitter.split_text(text)
for chunk in chunks:
print(chunk)
This class is currently experimental and subject to change based on feedback and
further development.