docs(text-splitters): fix some docstrings (#32767)

2025-09-05 13:06:03 +00:00 · 2025-08-31 20:46:11 +02:00
parent fcf7175392
commit e0a4af8d8b
4 changed files with 99 additions and 106 deletions
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -53,23 +53,8 @@ class HTMLHeaderTextSplitter:
    gracefully handles multiple levels of nested headers, creating a rich,
    hierarchical representation of the content.
    Args:
        headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag,
            header_name) pairs representing the headers that define splitting
            boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
            will split content by <h1> and <h2> tags, assigning their textual
            content to the Document metadata.
        return_each_element (bool): If True, every HTML element encountered
            (including headers, paragraphs, etc.) is returned as a separate
            Document. If False, content under the same header hierarchy is
            aggregated into fewer Documents.
    Returns:
        List[Document]: A list of Document objects. Each Document contains
        `page_content` holding the extracted text and `metadata` that maps
        the header hierarchy to their corresponding titles.
    Example:
        .. code-block:: python
            from langchain_text_splitters.html import (
@@ -123,10 +108,15 @@ class HTMLHeaderTextSplitter:
        """Initialize with headers to split on.
        Args:
-            headers_to_split_on: A list of tuples where
+            headers_to_split_on: A list of (header_tag,
-                each tuple contains a header tag and its corresponding value.
+                header_name) pairs representing the headers that define splitting
-            return_each_element: Whether to return each HTML
+                boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
-                element as a separate Document. Defaults to False.
+                will split content by <h1> and <h2> tags, assigning their textual
                content to the Document metadata.
            return_each_element: If True, every HTML element encountered
                (including headers, paragraphs, etc.) is returned as a separate
                Document. If False, content under the same header hierarchy is
                aggregated into fewer Documents.
        """
        # Sort headers by their numeric level so that h1 < h2 < h3...
        self.headers_to_split_on = sorted(
@@ -143,7 +133,9 @@ class HTMLHeaderTextSplitter:
            text: The HTML text to split.
        Returns:
-            A list of split Document objects.
+            A list of split Document objects. Each Document contains
            `page_content` holding the extracted text and `metadata` that maps
            the header hierarchy to their corresponding titles.
        """
        return self.split_text_from_file(StringIO(text))
@@ -158,7 +150,9 @@ class HTMLHeaderTextSplitter:
            **kwargs: Additional keyword arguments for the request.
        Returns:
-            A list of split Document objects.
+            A list of split Document objects. Each Document contains
            `page_content` holding the extracted text and `metadata` that maps
            the header hierarchy to their corresponding titles.
        Raises:
            requests.RequestException: If the HTTP request fails.
@@ -179,7 +173,9 @@ class HTMLHeaderTextSplitter:
            file: A file path or a file-like object containing HTML content.
        Returns:
-            A list of split Document objects.
+            A list of split Document objects. Each Document contains
            `page_content` holding the extracted text and `metadata` that maps
            the header hierarchy to their corresponding titles.
        """
        if isinstance(file, str):
            with open(file, encoding="utf-8") as f:
@@ -385,9 +381,10 @@ class HTMLSectionSplitter:
            List[Dict[str, Optional[str]]]: A list of dictionaries representing
            sections.
            Each dictionary contains:
-                - 'header': The header text or a default title for the first section.
+
-                - 'content': The content under the header.
+            * 'header': The header text or a default title for the first section.
-                - 'tag_name': The name of the header tag (e.g., "h1", "h2").
+            * 'content': The content under the header.
            * 'tag_name': The name of the header tag (e.g., "h1", "h2").
        """
        try:
            from bs4 import BeautifulSoup
@@ -508,40 +505,6 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
    .. versionadded:: 0.3.5
    Args:
        headers_to_split_on (List[Tuple[str, str]]): HTML headers (e.g., "h1", "h2")
            that define content sections.
        max_chunk_size (int): Maximum size for each chunk, with allowance for
            exceeding this limit to preserve semantics.
        chunk_overlap (int): Number of characters to overlap between chunks to ensure
            contextual continuity.
        separators (List[str]): Delimiters used by RecursiveCharacterTextSplitter for
            further splitting.
        elements_to_preserve (List[str]): HTML tags (e.g., <table>, <ul>) to remain
            intact during splitting.
        preserve_links (bool): Converts <a> tags to Markdown links ([text](url)).
        preserve_images (bool): Converts <img> tags to Markdown images (![alt](src)).
        preserve_videos (bool): Converts <video> tags to Markdown
            video links (![video](src)).
        preserve_audio (bool): Converts <audio> tags to Markdown
            audio links (![audio](src)).
        custom_handlers (Dict[str, Callable[[Any], str]]): Optional custom handlers for
            specific HTML tags, allowing tailored extraction or processing.
        stopword_removal (bool): Optionally remove stopwords from the text.
        stopword_lang (str): The language of stopwords to remove.
        normalize_text (bool): Optionally normalize text
            (e.g., lowercasing, removing punctuation).
        external_metadata (Optional[Dict[str, str]]): Additional metadata to attach to
            the Document objects.
        allowlist_tags (Optional[List[str]]): Only these tags will be retained in
            the HTML.
        denylist_tags (Optional[List[str]]): These tags will be removed from the HTML.
        preserve_parent_metadata (bool): Whether to pass through parent document
            metadata to split documents when calling
            ``transform_documents/atransform_documents()``.
        keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
            should be at the beginning of a chunk, at the end, or not at all.
    Example:
        .. code-block:: python
@@ -593,7 +556,42 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
        preserve_parent_metadata: bool = False,
        keep_separator: Union[bool, Literal["start", "end"]] = True,
    ) -> None:
-        """Initialize splitter."""
+        """Initialize splitter.
        Args:
            headers_to_split_on: HTML headers (e.g., "h1", "h2")
                that define content sections.
            max_chunk_size: Maximum size for each chunk, with allowance for
                exceeding this limit to preserve semantics.
            chunk_overlap: Number of characters to overlap between chunks to ensure
                contextual continuity.
            separators: Delimiters used by RecursiveCharacterTextSplitter for
                further splitting.
            elements_to_preserve: HTML tags (e.g., <table>, <ul>) to remain
                intact during splitting.
            preserve_links: Converts <a> tags to Markdown links ([text](url)).
            preserve_images: Converts <img> tags to Markdown images (![alt](src)).
            preserve_videos: Converts <video> tags to Markdown
                video links (![video](src)).
            preserve_audio: Converts <audio> tags to Markdown
                audio links (![audio](src)).
            custom_handlers: Optional custom handlers for
                specific HTML tags, allowing tailored extraction or processing.
            stopword_removal: Optionally remove stopwords from the text.
            stopword_lang: The language of stopwords to remove.
            normalize_text: Optionally normalize text
                (e.g., lowercasing, removing punctuation).
            external_metadata: Additional metadata to attach to
                the Document objects.
            allowlist_tags: Only these tags will be retained in
                the HTML.
            denylist_tags: These tags will be removed from the HTML.
            preserve_parent_metadata: Whether to pass through parent document
                metadata to split documents when calling
                ``transform_documents/atransform_documents()``.
            keep_separator: Whether separators
                should be at the beginning of a chunk, at the end, or not at all.
        """
        try:
            from bs4 import BeautifulSoup, Tag
--- a/libs/text-splitters/langchain_text_splitters/json.py
+++ b/libs/text-splitters/langchain_text_splitters/json.py
@@ -14,31 +14,27 @@ class RecursiveJsonSplitter:
    JSON-formatted strings based on configurable maximum and minimum chunk sizes.
    It supports nested JSON structures, optionally converts lists into dictionaries
    for better chunking, and allows the creation of document objects for further use.
    Attributes:
        max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
        min_chunk_size (int): The minimum size for each chunk, derived from
            `max_chunk_size` if not explicitly provided.
    """
    max_chunk_size: int = 2000
    """The maximum size for each chunk. Defaults to 2000."""
    min_chunk_size: int = 1800
    """The minimum size for each chunk, derived from ``max_chunk_size`` if not
    explicitly provided."""
    def __init__(
        self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
    ) -> None:
        """Initialize the chunk size configuration for text processing.
        This constructor sets up the maximum and minimum chunk sizes, ensuring that
-        the `min_chunk_size` defaults to a value slightly smaller than the
+        the ``min_chunk_size`` defaults to a value slightly smaller than the
-        `max_chunk_size` if not explicitly provided.
+        ``max_chunk_size`` if not explicitly provided.
        Args:
-            max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
+            max_chunk_size: The maximum size for a chunk. Defaults to 2000.
-            min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
+            min_chunk_size: The minimum size for a chunk. If None,
                defaults to the maximum chunk size minus 200, with a lower bound of 50.
        Attributes:
            max_chunk_size (int): The configured maximum size for each chunk.
            min_chunk_size (int): The configured minimum size for each chunk, derived
                from `max_chunk_size` if not explicitly provided.
        """
        super().__init__()
        self.max_chunk_size = max_chunk_size
--- a/libs/text-splitters/langchain_text_splitters/jsx.py
+++ b/libs/text-splitters/langchain_text_splitters/jsx.py
@@ -9,13 +9,15 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
    This splitter extends RecursiveCharacterTextSplitter to handle
    React (JSX), Vue, and Svelte code by:
    1. Detecting and extracting custom component tags from the text
    2. Using those tags as additional separators along with standard JS syntax
    The splitter combines:
-    - Custom component tags as separators (e.g. <Component, <div)
+
-    - JavaScript syntax elements (function, const, if, etc)
+    * Custom component tags as separators (e.g. <Component, <div)
-    - Standard text splitting on newlines
+    * JavaScript syntax elements (function, const, if, etc)
    * Standard text splitting on newlines
    This allows chunks to break at natural boundaries in
    React, Vue, and Svelte component code.
@@ -43,9 +45,10 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
        """Split text into chunks.
        This method splits the text into chunks by:
-        - Extracting unique opening component tags using regex
+
-        - Creating separators list with extracted tags and JS separators
+        * Extracting unique opening component tags using regex
-        - Splitting the text using the separators by calling the parent class method
+        * Creating separators list with extracted tags and JS separators
        * Splitting the text using the separators by calling the parent class method
        Args:
            text: String containing code to split
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@@ -289,32 +289,28 @@ class ExperimentalMarkdownSyntaxTextSplitter:
    additional features.
    Key Features:
    - Retains the original whitespace and formatting of the Markdown text.
    - Extracts headers, code blocks, and horizontal rules as metadata.
    - Splits out code blocks and includes the language in the "Code" metadata key.
    - Splits text on horizontal rules (`---`) as well.
    - Defaults to sensible splitting behavior, which can be overridden using the
      `headers_to_split_on` parameter.
-    Parameters:
+    * Retains the original whitespace and formatting of the Markdown text.
-    ----------
+    * Extracts headers, code blocks, and horizontal rules as metadata.
-    headers_to_split_on : List[Tuple[str, str]], optional
+    * Splits out code blocks and includes the language in the "Code" metadata key.
-        Headers to split on, defaulting to common Markdown headers if not specified.
+    * Splits text on horizontal rules (``---``) as well.
-    return_each_line : bool, optional
+    * Defaults to sensible splitting behavior, which can be overridden using the
-        When set to True, returns each line as a separate chunk. Default is False.
+      ``headers_to_split_on`` parameter.
-    Usage example:
+    Example:
-    --------------
+
-    >>> headers_to_split_on = [
+        .. code-block:: python
-    >>>     ("#", "Header 1"),
+
-    >>>     ("##", "Header 2"),
+            headers_to_split_on = [
-    >>> ]
+                ("#", "Header 1"),
-    >>> splitter = ExperimentalMarkdownSyntaxTextSplitter(
+                ("##", "Header 2"),
-    >>>     headers_to_split_on=headers_to_split_on
+            ]
-    >>> )
+            splitter = ExperimentalMarkdownSyntaxTextSplitter(
-    >>> chunks = splitter.split(text)
+                headers_to_split_on=headers_to_split_on
-    >>> for chunk in chunks:
+            )
-    >>>     print(chunk)
+            chunks = splitter.split(text)
            for chunk in chunks:
                print(chunk)
    This class is currently experimental and subject to change based on feedback and
    further development.