mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-05 13:06:03 +00:00
docs(text-splitters): fix some docstrings (#32767)
This commit is contained in:
committed by
GitHub
parent
fcf7175392
commit
e0a4af8d8b
@@ -53,23 +53,8 @@ class HTMLHeaderTextSplitter:
|
|||||||
gracefully handles multiple levels of nested headers, creating a rich,
|
gracefully handles multiple levels of nested headers, creating a rich,
|
||||||
hierarchical representation of the content.
|
hierarchical representation of the content.
|
||||||
|
|
||||||
Args:
|
|
||||||
headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag,
|
|
||||||
header_name) pairs representing the headers that define splitting
|
|
||||||
boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
|
|
||||||
will split content by <h1> and <h2> tags, assigning their textual
|
|
||||||
content to the Document metadata.
|
|
||||||
return_each_element (bool): If True, every HTML element encountered
|
|
||||||
(including headers, paragraphs, etc.) is returned as a separate
|
|
||||||
Document. If False, content under the same header hierarchy is
|
|
||||||
aggregated into fewer Documents.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List[Document]: A list of Document objects. Each Document contains
|
|
||||||
`page_content` holding the extracted text and `metadata` that maps
|
|
||||||
the header hierarchy to their corresponding titles.
|
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
from langchain_text_splitters.html_header_text_splitter import (
|
from langchain_text_splitters.html_header_text_splitter import (
|
||||||
@@ -123,10 +108,15 @@ class HTMLHeaderTextSplitter:
|
|||||||
"""Initialize with headers to split on.
|
"""Initialize with headers to split on.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
headers_to_split_on: A list of tuples where
|
headers_to_split_on: A list of (header_tag,
|
||||||
each tuple contains a header tag and its corresponding value.
|
header_name) pairs representing the headers that define splitting
|
||||||
return_each_element: Whether to return each HTML
|
boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
|
||||||
element as a separate Document. Defaults to False.
|
will split content by <h1> and <h2> tags, assigning their textual
|
||||||
|
content to the Document metadata.
|
||||||
|
return_each_element: If True, every HTML element encountered
|
||||||
|
(including headers, paragraphs, etc.) is returned as a separate
|
||||||
|
Document. If False, content under the same header hierarchy is
|
||||||
|
aggregated into fewer Documents.
|
||||||
"""
|
"""
|
||||||
# Sort headers by their numeric level so that h1 < h2 < h3...
|
# Sort headers by their numeric level so that h1 < h2 < h3...
|
||||||
self.headers_to_split_on = sorted(
|
self.headers_to_split_on = sorted(
|
||||||
@@ -143,7 +133,9 @@ class HTMLHeaderTextSplitter:
|
|||||||
text: The HTML text to split.
|
text: The HTML text to split.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of split Document objects.
|
A list of split Document objects. Each Document contains
|
||||||
|
`page_content` holding the extracted text and `metadata` that maps
|
||||||
|
the header hierarchy to their corresponding titles.
|
||||||
"""
|
"""
|
||||||
return self.split_text_from_file(StringIO(text))
|
return self.split_text_from_file(StringIO(text))
|
||||||
|
|
||||||
@@ -158,7 +150,9 @@ class HTMLHeaderTextSplitter:
|
|||||||
**kwargs: Additional keyword arguments for the request.
|
**kwargs: Additional keyword arguments for the request.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of split Document objects.
|
A list of split Document objects. Each Document contains
|
||||||
|
`page_content` holding the extracted text and `metadata` that maps
|
||||||
|
the header hierarchy to their corresponding titles.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
requests.RequestException: If the HTTP request fails.
|
requests.RequestException: If the HTTP request fails.
|
||||||
@@ -179,7 +173,9 @@ class HTMLHeaderTextSplitter:
|
|||||||
file: A file path or a file-like object containing HTML content.
|
file: A file path or a file-like object containing HTML content.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of split Document objects.
|
A list of split Document objects. Each Document contains
|
||||||
|
`page_content` holding the extracted text and `metadata` that maps
|
||||||
|
the header hierarchy to their corresponding titles.
|
||||||
"""
|
"""
|
||||||
if isinstance(file, str):
|
if isinstance(file, str):
|
||||||
with open(file, encoding="utf-8") as f:
|
with open(file, encoding="utf-8") as f:
|
||||||
@@ -385,9 +381,10 @@ class HTMLSectionSplitter:
|
|||||||
List[Dict[str, Optional[str]]]: A list of dictionaries representing
|
List[Dict[str, Optional[str]]]: A list of dictionaries representing
|
||||||
sections.
|
sections.
|
||||||
Each dictionary contains:
|
Each dictionary contains:
|
||||||
- 'header': The header text or a default title for the first section.
|
|
||||||
- 'content': The content under the header.
|
* 'header': The header text or a default title for the first section.
|
||||||
- 'tag_name': The name of the header tag (e.g., "h1", "h2").
|
* 'content': The content under the header.
|
||||||
|
* 'tag_name': The name of the header tag (e.g., "h1", "h2").
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@@ -508,40 +505,6 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
|||||||
|
|
||||||
.. versionadded:: 0.3.5
|
.. versionadded:: 0.3.5
|
||||||
|
|
||||||
Args:
|
|
||||||
headers_to_split_on (List[Tuple[str, str]]): HTML headers (e.g., "h1", "h2")
|
|
||||||
that define content sections.
|
|
||||||
max_chunk_size (int): Maximum size for each chunk, with allowance for
|
|
||||||
exceeding this limit to preserve semantics.
|
|
||||||
chunk_overlap (int): Number of characters to overlap between chunks to ensure
|
|
||||||
contextual continuity.
|
|
||||||
separators (List[str]): Delimiters used by RecursiveCharacterTextSplitter for
|
|
||||||
further splitting.
|
|
||||||
elements_to_preserve (List[str]): HTML tags (e.g., <table>, <ul>) to remain
|
|
||||||
intact during splitting.
|
|
||||||
preserve_links (bool): Converts <a> tags to Markdown links ([text](url)).
|
|
||||||
preserve_images (bool): Converts <img> tags to Markdown images (![alt](src)).
|
|
||||||
preserve_videos (bool): Converts <video> tags to Markdown
|
|
||||||
video links (![video](src)).
|
|
||||||
preserve_audio (bool): Converts <audio> tags to Markdown
|
|
||||||
audio links (![audio](src)).
|
|
||||||
custom_handlers (Dict[str, Callable[[Any], str]]): Optional custom handlers for
|
|
||||||
specific HTML tags, allowing tailored extraction or processing.
|
|
||||||
stopword_removal (bool): Optionally remove stopwords from the text.
|
|
||||||
stopword_lang (str): The language of stopwords to remove.
|
|
||||||
normalize_text (bool): Optionally normalize text
|
|
||||||
(e.g., lowercasing, removing punctuation).
|
|
||||||
external_metadata (Optional[Dict[str, str]]): Additional metadata to attach to
|
|
||||||
the Document objects.
|
|
||||||
allowlist_tags (Optional[List[str]]): Only these tags will be retained in
|
|
||||||
the HTML.
|
|
||||||
denylist_tags (Optional[List[str]]): These tags will be removed from the HTML.
|
|
||||||
preserve_parent_metadata (bool): Whether to pass through parent document
|
|
||||||
metadata to split documents when calling
|
|
||||||
``transform_documents/atransform_documents()``.
|
|
||||||
keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
|
|
||||||
should be at the beginning of a chunk, at the end, or not at all.
|
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
@@ -593,7 +556,42 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
|
|||||||
preserve_parent_metadata: bool = False,
|
preserve_parent_metadata: bool = False,
|
||||||
keep_separator: Union[bool, Literal["start", "end"]] = True,
|
keep_separator: Union[bool, Literal["start", "end"]] = True,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize splitter."""
|
"""Initialize splitter.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
headers_to_split_on: HTML headers (e.g., "h1", "h2")
|
||||||
|
that define content sections.
|
||||||
|
max_chunk_size: Maximum size for each chunk, with allowance for
|
||||||
|
exceeding this limit to preserve semantics.
|
||||||
|
chunk_overlap: Number of characters to overlap between chunks to ensure
|
||||||
|
contextual continuity.
|
||||||
|
separators: Delimiters used by RecursiveCharacterTextSplitter for
|
||||||
|
further splitting.
|
||||||
|
elements_to_preserve: HTML tags (e.g., <table>, <ul>) to remain
|
||||||
|
intact during splitting.
|
||||||
|
preserve_links: Converts <a> tags to Markdown links ([text](url)).
|
||||||
|
preserve_images: Converts <img> tags to Markdown images (![alt](src)).
|
||||||
|
preserve_videos: Converts <video> tags to Markdown
|
||||||
|
video links (![video](src)).
|
||||||
|
preserve_audio: Converts <audio> tags to Markdown
|
||||||
|
audio links (![audio](src)).
|
||||||
|
custom_handlers: Optional custom handlers for
|
||||||
|
specific HTML tags, allowing tailored extraction or processing.
|
||||||
|
stopword_removal: Optionally remove stopwords from the text.
|
||||||
|
stopword_lang: The language of stopwords to remove.
|
||||||
|
normalize_text: Optionally normalize text
|
||||||
|
(e.g., lowercasing, removing punctuation).
|
||||||
|
external_metadata: Additional metadata to attach to
|
||||||
|
the Document objects.
|
||||||
|
allowlist_tags: Only these tags will be retained in
|
||||||
|
the HTML.
|
||||||
|
denylist_tags: These tags will be removed from the HTML.
|
||||||
|
preserve_parent_metadata: Whether to pass through parent document
|
||||||
|
metadata to split documents when calling
|
||||||
|
``transform_documents/atransform_documents()``.
|
||||||
|
keep_separator: Whether separators
|
||||||
|
should be at the beginning of a chunk, at the end, or not at all.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
@@ -14,31 +14,27 @@ class RecursiveJsonSplitter:
|
|||||||
JSON-formatted strings based on configurable maximum and minimum chunk sizes.
|
JSON-formatted strings based on configurable maximum and minimum chunk sizes.
|
||||||
It supports nested JSON structures, optionally converts lists into dictionaries
|
It supports nested JSON structures, optionally converts lists into dictionaries
|
||||||
for better chunking, and allows the creation of document objects for further use.
|
for better chunking, and allows the creation of document objects for further use.
|
||||||
|
|
||||||
Attributes:
|
|
||||||
max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
|
|
||||||
min_chunk_size (int): The minimum size for each chunk, derived from
|
|
||||||
`max_chunk_size` if not explicitly provided.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
max_chunk_size: int = 2000
|
||||||
|
"""The maximum size for each chunk. Defaults to 2000."""
|
||||||
|
min_chunk_size: int = 1800
|
||||||
|
"""The minimum size for each chunk, derived from ``max_chunk_size`` if not
|
||||||
|
explicitly provided."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
|
self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize the chunk size configuration for text processing.
|
"""Initialize the chunk size configuration for text processing.
|
||||||
|
|
||||||
This constructor sets up the maximum and minimum chunk sizes, ensuring that
|
This constructor sets up the maximum and minimum chunk sizes, ensuring that
|
||||||
the `min_chunk_size` defaults to a value slightly smaller than the
|
the ``min_chunk_size`` defaults to a value slightly smaller than the
|
||||||
`max_chunk_size` if not explicitly provided.
|
``max_chunk_size`` if not explicitly provided.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
|
max_chunk_size: The maximum size for a chunk. Defaults to 2000.
|
||||||
min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
|
min_chunk_size: The minimum size for a chunk. If None,
|
||||||
defaults to the maximum chunk size minus 200, with a lower bound of 50.
|
defaults to the maximum chunk size minus 200, with a lower bound of 50.
|
||||||
|
|
||||||
Attributes:
|
|
||||||
max_chunk_size (int): The configured maximum size for each chunk.
|
|
||||||
min_chunk_size (int): The configured minimum size for each chunk, derived
|
|
||||||
from `max_chunk_size` if not explicitly provided.
|
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_chunk_size = max_chunk_size
|
self.max_chunk_size = max_chunk_size
|
||||||
|
@@ -9,13 +9,15 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
|
|||||||
|
|
||||||
This splitter extends RecursiveCharacterTextSplitter to handle
|
This splitter extends RecursiveCharacterTextSplitter to handle
|
||||||
React (JSX), Vue, and Svelte code by:
|
React (JSX), Vue, and Svelte code by:
|
||||||
|
|
||||||
1. Detecting and extracting custom component tags from the text
|
1. Detecting and extracting custom component tags from the text
|
||||||
2. Using those tags as additional separators along with standard JS syntax
|
2. Using those tags as additional separators along with standard JS syntax
|
||||||
|
|
||||||
The splitter combines:
|
The splitter combines:
|
||||||
- Custom component tags as separators (e.g. <Component, <div)
|
|
||||||
- JavaScript syntax elements (function, const, if, etc)
|
* Custom component tags as separators (e.g. <Component, <div)
|
||||||
- Standard text splitting on newlines
|
* JavaScript syntax elements (function, const, if, etc)
|
||||||
|
* Standard text splitting on newlines
|
||||||
|
|
||||||
This allows chunks to break at natural boundaries in
|
This allows chunks to break at natural boundaries in
|
||||||
React, Vue, and Svelte component code.
|
React, Vue, and Svelte component code.
|
||||||
@@ -43,9 +45,10 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
|
|||||||
"""Split text into chunks.
|
"""Split text into chunks.
|
||||||
|
|
||||||
This method splits the text into chunks by:
|
This method splits the text into chunks by:
|
||||||
- Extracting unique opening component tags using regex
|
|
||||||
- Creating separators list with extracted tags and JS separators
|
* Extracting unique opening component tags using regex
|
||||||
- Splitting the text using the separators by calling the parent class method
|
* Creating separators list with extracted tags and JS separators
|
||||||
|
* Splitting the text using the separators by calling the parent class method
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: String containing code to split
|
text: String containing code to split
|
||||||
|
@@ -289,32 +289,28 @@ class ExperimentalMarkdownSyntaxTextSplitter:
|
|||||||
additional features.
|
additional features.
|
||||||
|
|
||||||
Key Features:
|
Key Features:
|
||||||
- Retains the original whitespace and formatting of the Markdown text.
|
|
||||||
- Extracts headers, code blocks, and horizontal rules as metadata.
|
|
||||||
- Splits out code blocks and includes the language in the "Code" metadata key.
|
|
||||||
- Splits text on horizontal rules (`---`) as well.
|
|
||||||
- Defaults to sensible splitting behavior, which can be overridden using the
|
|
||||||
`headers_to_split_on` parameter.
|
|
||||||
|
|
||||||
Parameters:
|
* Retains the original whitespace and formatting of the Markdown text.
|
||||||
----------
|
* Extracts headers, code blocks, and horizontal rules as metadata.
|
||||||
headers_to_split_on : List[Tuple[str, str]], optional
|
* Splits out code blocks and includes the language in the "Code" metadata key.
|
||||||
Headers to split on, defaulting to common Markdown headers if not specified.
|
* Splits text on horizontal rules (`---`) as well.
|
||||||
return_each_line : bool, optional
|
* Defaults to sensible splitting behavior, which can be overridden using the
|
||||||
When set to True, returns each line as a separate chunk. Default is False.
|
``headers_to_split_on`` parameter.
|
||||||
|
|
||||||
Usage example:
|
Example:
|
||||||
--------------
|
|
||||||
>>> headers_to_split_on = [
|
.. code-block:: python
|
||||||
>>> ("#", "Header 1"),
|
|
||||||
>>> ("##", "Header 2"),
|
headers_to_split_on = [
|
||||||
>>> ]
|
("#", "Header 1"),
|
||||||
>>> splitter = ExperimentalMarkdownSyntaxTextSplitter(
|
("##", "Header 2"),
|
||||||
>>> headers_to_split_on=headers_to_split_on
|
]
|
||||||
>>> )
|
splitter = ExperimentalMarkdownSyntaxTextSplitter(
|
||||||
>>> chunks = splitter.split(text)
|
headers_to_split_on=headers_to_split_on
|
||||||
>>> for chunk in chunks:
|
)
|
||||||
>>> print(chunk)
|
chunks = splitter.split(text)
|
||||||
|
for chunk in chunks:
|
||||||
|
print(chunk)
|
||||||
|
|
||||||
This class is currently experimental and subject to change based on feedback and
|
This class is currently experimental and subject to change based on feedback and
|
||||||
further development.
|
further development.
|
||||||
|
Reference in New Issue
Block a user