docs(text-splitters): fix some docstrings (#32767)

2025-09-04 12:39:32 +00:00 · 2025-08-31 20:46:11 +02:00
parent fcf7175392
commit e0a4af8d8b
4 changed files with 99 additions and 106 deletions
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -53,23 +53,8 @@ class HTMLHeaderTextSplitter:
    gracefully handles multiple levels of nested headers, creating a rich,
    hierarchical representation of the content.

-    Args:
-        headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag,
-            header_name) pairs representing the headers that define splitting
-            boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
-            will split content by <h1> and <h2> tags, assigning their textual
-            content to the Document metadata.
-        return_each_element (bool): If True, every HTML element encountered
-            (including headers, paragraphs, etc.) is returned as a separate
-            Document. If False, content under the same header hierarchy is
-            aggregated into fewer Documents.
-
-    Returns:
-        List[Document]: A list of Document objects. Each Document contains
-        `page_content` holding the extracted text and `metadata` that maps
-        the header hierarchy to their corresponding titles.
-
    Example:
+
        .. code-block:: python

            from langchain_text_splitters.html_header_text_splitter import (
@@ -123,10 +108,15 @@ class HTMLHeaderTextSplitter:
        """Initialize with headers to split on.

        Args:
-            headers_to_split_on: A list of tuples where
-                each tuple contains a header tag and its corresponding value.
-            return_each_element: Whether to return each HTML
-                element as a separate Document. Defaults to False.
+            headers_to_split_on: A list of ``(header_tag, header_name)`` pairs
+                representing the headers that define splitting boundaries. For
+                example, ``[("h1", "Header 1"), ("h2", "Header 2")]`` will split
+                content by ``<h1>`` and ``<h2>`` tags, assigning each header's
+                textual content to the Document metadata.
+            return_each_element: If True, every HTML element encountered
+                (including headers, paragraphs, etc.) is returned as a separate
+                Document. If False, content under the same header hierarchy is
+                aggregated into fewer Documents.
        """
        # Sort headers by their numeric level so that h1 < h2 < h3...
        self.headers_to_split_on = sorted(
@@ -143,7 +133,9 @@ class HTMLHeaderTextSplitter:
            text: The HTML text to split.

        Returns:
-            A list of split Document objects.
+            A list of split Document objects. Each Document contains
+            ``page_content`` holding the extracted text and ``metadata`` that maps
+            each header in the hierarchy to its corresponding title.
        """
        return self.split_text_from_file(StringIO(text))

@@ -158,7 +150,9 @@ class HTMLHeaderTextSplitter:
            **kwargs: Additional keyword arguments for the request.

        Returns:
-            A list of split Document objects.
+            A list of split Document objects. Each Document contains
+            ``page_content`` holding the extracted text and ``metadata`` that maps
+            each header in the hierarchy to its corresponding title.

        Raises:
            requests.RequestException: If the HTTP request fails.
@@ -179,7 +173,9 @@ class HTMLHeaderTextSplitter:
            file: A file path or a file-like object containing HTML content.

        Returns:
-            A list of split Document objects.
+            A list of split Document objects. Each Document contains
+            ``page_content`` holding the extracted text and ``metadata`` that maps
+            each header in the hierarchy to its corresponding title.
        """
        if isinstance(file, str):
            with open(file, encoding="utf-8") as f:
@@ -384,10 +380,11 @@ class HTMLSectionSplitter:
        Returns:
            List[Dict[str, Optional[str]]]: A list of dictionaries representing
            sections.
-                Each dictionary contains:
-                - 'header': The header text or a default title for the first section.
-                - 'content': The content under the header.
-                - 'tag_name': The name of the header tag (e.g., "h1", "h2").
+            Each dictionary contains:
+
+            * 'header': The header text or a default title for the first section.
+            * 'content': The content under the header.
+            * 'tag_name': The name of the header tag (e.g., "h1", "h2").
        """
        try:
            from bs4 import BeautifulSoup
@@ -508,40 +505,6 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):

    .. versionadded: 0.3.5

-    Args:
-        headers_to_split_on (List[Tuple[str, str]]): HTML headers (e.g., "h1", "h2")
-            that define content sections.
-        max_chunk_size (int): Maximum size for each chunk, with allowance for
-            exceeding this limit to preserve semantics.
-        chunk_overlap (int): Number of characters to overlap between chunks to ensure
-            contextual continuity.
-        separators (List[str]): Delimiters used by RecursiveCharacterTextSplitter for
-            further splitting.
-        elements_to_preserve (List[str]): HTML tags (e.g., <table>, <ul>) to remain
-            intact during splitting.
-        preserve_links (bool): Converts <a> tags to Markdown links ([text](url)).
-        preserve_images (bool): Converts <img> tags to Markdown images (![alt](src)).
-        preserve_videos (bool): Converts <video> tags to Markdown
-        video links (![video](src)).
-        preserve_audio (bool): Converts <audio> tags to Markdown
-        audio links (![audio](src)).
-        custom_handlers (Dict[str, Callable[[Any], str]]): Optional custom handlers for
-            specific HTML tags, allowing tailored extraction or processing.
-        stopword_removal (bool): Optionally remove stopwords from the text.
-        stopword_lang (str): The language of stopwords to remove.
-        normalize_text (bool): Optionally normalize text
-            (e.g., lowercasing, removing punctuation).
-        external_metadata (Optional[Dict[str, str]]): Additional metadata to attach to
-            the Document objects.
-        allowlist_tags (Optional[List[str]]): Only these tags will be retained in
-            the HTML.
-        denylist_tags (Optional[List[str]]): These tags will be removed from the HTML.
-        preserve_parent_metadata (bool): Whether to pass through parent document
-            metadata to split documents when calling
-            ``transform_documents/atransform_documents()``.
-        keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
-            should be at the beginning of a chunk, at the end, or not at all.
-
    Example:
        .. code-block:: python

@@ -593,7 +556,42 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
        preserve_parent_metadata: bool = False,
        keep_separator: Union[bool, Literal["start", "end"]] = True,
    ) -> None:
-        """Initialize splitter."""
+        """Initialize splitter.
+
+        Args:
+            headers_to_split_on: HTML headers (e.g., "h1", "h2")
+                that define content sections.
+            max_chunk_size: Maximum size for each chunk, with allowance for
+                exceeding this limit to preserve semantics.
+            chunk_overlap: Number of characters to overlap between chunks to ensure
+                contextual continuity.
+            separators: Delimiters used by RecursiveCharacterTextSplitter for
+                further splitting.
+            elements_to_preserve: HTML tags (e.g., <table>, <ul>) to remain
+                intact during splitting.
+            preserve_links: Converts <a> tags to Markdown links ([text](url)).
+            preserve_images: Converts <img> tags to Markdown images (![alt](src)).
+            preserve_videos: Converts <video> tags to Markdown
+                video links (![video](src)).
+            preserve_audio: Converts <audio> tags to Markdown
+                audio links (![audio](src)).
+            custom_handlers: Optional custom handlers for
+                specific HTML tags, allowing tailored extraction or processing.
+            stopword_removal: Optionally remove stopwords from the text.
+            stopword_lang: The language of stopwords to remove.
+            normalize_text: Optionally normalize text
+                (e.g., lowercasing, removing punctuation).
+            external_metadata: Additional metadata to attach to
+                the Document objects.
+            allowlist_tags: Only these tags will be retained in
+                the HTML.
+            denylist_tags: These tags will be removed from the HTML.
+            preserve_parent_metadata: Whether to pass through parent document
+                metadata to split documents when calling
+                ``transform_documents/atransform_documents()``.
+            keep_separator: Whether separators
+                should be at the beginning of a chunk, at the end, or not at all.
+        """
        try:
            from bs4 import BeautifulSoup, Tag

--- a/libs/text-splitters/langchain_text_splitters/json.py
+++ b/libs/text-splitters/langchain_text_splitters/json.py
@@ -14,31 +14,27 @@ class RecursiveJsonSplitter:
    JSON-formatted strings based on configurable maximum and minimum chunk sizes.
    It supports nested JSON structures, optionally converts lists into dictionaries
    for better chunking, and allows the creation of document objects for further use.
-
-    Attributes:
-        max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
-        min_chunk_size (int): The minimum size for each chunk, derived from
-            `max_chunk_size` if not explicitly provided.
    """

+    max_chunk_size: int = 2000
+    """The maximum size for each chunk. Defaults to 2000."""
+    min_chunk_size: int = 1800
+    """The minimum size for each chunk, derived from ``max_chunk_size`` if not
+    explicitly provided."""
+
    def __init__(
        self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
    ) -> None:
        """Initialize the chunk size configuration for text processing.

        This constructor sets up the maximum and minimum chunk sizes, ensuring that
-        the `min_chunk_size` defaults to a value slightly smaller than the
-        `max_chunk_size` if not explicitly provided.
+        the ``min_chunk_size`` defaults to a value slightly smaller than the
+        ``max_chunk_size`` if not explicitly provided.

        Args:
-            max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
-            min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
+            max_chunk_size: The maximum size for a chunk. Defaults to 2000.
+            min_chunk_size: The minimum size for a chunk. If None,
                defaults to the maximum chunk size minus 200, with a lower bound of 50.
-
-        Attributes:
-            max_chunk_size (int): The configured maximum size for each chunk.
-            min_chunk_size (int): The configured minimum size for each chunk, derived
-                from `max_chunk_size` if not explicitly provided.
        """
        super().__init__()
        self.max_chunk_size = max_chunk_size
--- a/libs/text-splitters/langchain_text_splitters/jsx.py
+++ b/libs/text-splitters/langchain_text_splitters/jsx.py
@@ -9,13 +9,15 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):

    This splitter extends RecursiveCharacterTextSplitter to handle
    React (JSX), Vue, and Svelte code by:
+
    1. Detecting and extracting custom component tags from the text
    2. Using those tags as additional separators along with standard JS syntax

    The splitter combines:
-    - Custom component tags as separators (e.g. <Component, <div)
-    - JavaScript syntax elements (function, const, if, etc)
-    - Standard text splitting on newlines
+
+    * Custom component tags as separators (e.g. <Component, <div)
+    * JavaScript syntax elements (function, const, if, etc)
+    * Standard text splitting on newlines

    This allows chunks to break at natural boundaries in
    React, Vue, and Svelte component code.
@@ -43,9 +45,10 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
        """Split text into chunks.

        This method splits the text into chunks by:
-        - Extracting unique opening component tags using regex
-        - Creating separators list with extracted tags and JS separators
-        - Splitting the text using the separators by calling the parent class method
+
+        * Extracting unique opening component tags using regex
+        * Creating separators list with extracted tags and JS separators
+        * Splitting the text using the separators by calling the parent class method

        Args:
            text: String containing code to split
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@@ -289,32 +289,28 @@ class ExperimentalMarkdownSyntaxTextSplitter:
    additional features.

    Key Features:
-    - Retains the original whitespace and formatting of the Markdown text.
-    - Extracts headers, code blocks, and horizontal rules as metadata.
-    - Splits out code blocks and includes the language in the "Code" metadata key.
-    - Splits text on horizontal rules (`---`) as well.
-    - Defaults to sensible splitting behavior, which can be overridden using the
-      `headers_to_split_on` parameter.

-    Parameters:
-    ----------
-    headers_to_split_on : List[Tuple[str, str]], optional
-        Headers to split on, defaulting to common Markdown headers if not specified.
-    return_each_line : bool, optional
-        When set to True, returns each line as a separate chunk. Default is False.
+    * Retains the original whitespace and formatting of the Markdown text.
+    * Extracts headers, code blocks, and horizontal rules as metadata.
+    * Splits out code blocks and includes the language in the "Code" metadata key.
+    * Splits text on horizontal rules (``---``) as well.
+    * Defaults to sensible splitting behavior, which can be overridden using the
+      ``headers_to_split_on`` parameter.

-    Usage example:
-    --------------
-    >>> headers_to_split_on = [
-    >>>     ("#", "Header 1"),
-    >>>     ("##", "Header 2"),
-    >>> ]
-    >>> splitter = ExperimentalMarkdownSyntaxTextSplitter(
-    >>>     headers_to_split_on=headers_to_split_on
-    >>> )
-    >>> chunks = splitter.split(text)
-    >>> for chunk in chunks:
-    >>>     print(chunk)
+    Example:
+
+        .. code-block:: python
+
+            headers_to_split_on = [
+                ("#", "Header 1"),
+                ("##", "Header 2"),
+            ]
+            splitter = ExperimentalMarkdownSyntaxTextSplitter(
+                headers_to_split_on=headers_to_split_on
+            )
+            chunks = splitter.split_text(text)
+            for chunk in chunks:
+                print(chunk)

    This class is currently experimental and subject to change based on feedback and
    further development.