text-splitters: add pydocstyle linting (#28127)
Following #23188, this turns on Google-style docstring checking by enabling `pydocstyle` linting in the `text-splitters` package. Each resulting lint error was addressed in one of three ways: resolved, suppressed, or ignored; missing docstrings were added. Fixes one of the checklist items from #25154, similar to #25939 in the `core` package. Ran `make format`, `make lint`, and `make test` from the root of the `text-splitters` package to ensure no issues were found. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
b53f07bfb9
commit
90f162efb6
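For context (editor's illustration, not part of this commit): under ruff's `pydocstyle` rules with `convention = "google"`, a passing docstring puts the one-line summary on the opening quotes and separates each section with a blank line, e.g.

.. code-block:: python

    def split_text(text: str, chunk_size: int = 100) -> list[str]:
        """Split text into chunks of at most `chunk_size` characters.

        Args:
            text (str): The input text to split.
            chunk_size (int): Maximum size of each chunk. Defaults to 100.

        Returns:
            list[str]: The resulting chunks.
        """
        return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]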
@@ -1,6 +1,5 @@
 """**Text Splitters** are classes for splitting text.

-
 **Class hierarchy:**

 .. code-block::
@@ -249,6 +249,21 @@ class TokenTextSplitter(TextSplitter):
         self._disallowed_special = disallowed_special

     def split_text(self, text: str) -> List[str]:
+        """Splits the input text into smaller chunks based on tokenization.
+
+        This method uses a custom tokenizer configuration to encode the input text
+        into tokens, processes the tokens in chunks of a specified size with overlap,
+        and decodes them back into text chunks. The splitting is performed using the
+        `split_text_on_tokens` function.
+
+        Args:
+            text (str): The input text to be split into smaller chunks.
+
+        Returns:
+            List[str]: A list of text chunks, where each chunk is derived from a portion
+                of the input text based on the tokenization and chunking rules.
+        """
+
         def _encode(_text: str) -> List[int]:
             return self._tokenizer.encode(
                 _text,
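A minimal usage sketch for the method documented above (editor's illustration, not part of the diff; assumes the `tiktoken` dependency of `TokenTextSplitter` is installed):

.. code-block:: python

    from langchain_text_splitters import TokenTextSplitter

    # Chunks of at most 10 tokens, with 2 tokens of overlap between chunks.
    splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=2)
    chunks = splitter.split_text("LangChain text splitters break long documents into chunks.")
    print(chunks)  # -> list[str]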
@@ -115,17 +115,45 @@ class RecursiveCharacterTextSplitter(TextSplitter):
         return final_chunks

     def split_text(self, text: str) -> List[str]:
+        """Split the input text into smaller chunks based on predefined separators.
+
+        Args:
+            text (str): The input text to be split.
+
+        Returns:
+            List[str]: A list of text chunks obtained after splitting.
+        """
         return self._split_text(text, self._separators)

     @classmethod
     def from_language(
         cls, language: Language, **kwargs: Any
     ) -> RecursiveCharacterTextSplitter:
+        """Return an instance of this class based on a specific language.
+
+        This method initializes the text splitter with language-specific separators.
+
+        Args:
+            language (Language): The language to configure the text splitter for.
+            **kwargs (Any): Additional keyword arguments to customize the splitter.
+
+        Returns:
+            RecursiveCharacterTextSplitter: An instance of the text splitter configured
+                for the specified language.
+        """
         separators = cls.get_separators_for_language(language)
         return cls(separators=separators, is_separator_regex=True, **kwargs)

     @staticmethod
     def get_separators_for_language(language: Language) -> List[str]:
+        """Retrieve a list of separators specific to the given language.
+
+        Args:
+            language (Language): The language for which to get the separators.
+
+        Returns:
+            List[str]: A list of separators appropriate for the specified language.
+        """
         if language == Language.C or language == Language.CPP:
             return [
                 # Split along class definitions
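A usage sketch for the language-aware constructor above (editor's illustration, not part of the diff):

.. code-block:: python

    from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

    # Build a splitter preconfigured with Python-specific separators.
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON, chunk_size=60, chunk_overlap=0
    )
    print(RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON))
    print(splitter.split_text("def f():\n    pass\n\nclass C:\n    pass"))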
@@ -21,8 +21,8 @@ class ElementType(TypedDict):


 class HTMLHeaderTextSplitter:
-    """
-    Splitting HTML files based on specified headers.
+    """Splitting HTML files based on specified headers.
+
     Requires lxml package.
     """

@@ -46,7 +46,7 @@ class HTMLHeaderTextSplitter:
     def aggregate_elements_to_chunks(
         self, elements: List[ElementType]
     ) -> List[Document]:
-        """Combine elements with common metadata into chunks
+        """Combine elements with common metadata into chunks.

         Args:
             elements: HTML element content with associated identifying info and metadata
@@ -72,7 +72,7 @@ class HTMLHeaderTextSplitter:
         ]

     def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
-        """Split HTML from web URL
+        """Split HTML from web URL.

         Args:
             url: web URL
@@ -83,7 +83,7 @@ class HTMLHeaderTextSplitter:
         return self.split_text_from_file(BytesIO(r.content))

     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string
+        """Split HTML text string.

         Args:
             text: HTML text
@@ -91,7 +91,7 @@ class HTMLHeaderTextSplitter:
         return self.split_text_from_file(StringIO(text))

     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file
+        """Split HTML file.

         Args:
             file: HTML file
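A usage sketch covering the `HTMLHeaderTextSplitter` entry points above (editor's illustration, not part of the diff; assumes `lxml` is installed, per the class docstring):

.. code-block:: python

    from langchain_text_splitters import HTMLHeaderTextSplitter

    splitter = HTMLHeaderTextSplitter(
        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
    )
    html = "<html><body><h1>Intro</h1><p>Hello.</p><h2>Details</h2><p>More.</p></body></html>"
    docs = splitter.split_text(html)  # matched headers land in each Document's metadata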
@@ -166,8 +166,8 @@ class HTMLHeaderTextSplitter:


 class HTMLSectionSplitter:
-    """
-    Splitting HTML files based on specified tag and font sizes.
+    """Splitting HTML files based on specified tag and font sizes.
+
     Requires lxml package.
     """

@@ -186,6 +186,8 @@ class HTMLSectionSplitter:
             xslt_path: path to xslt file for document transformation.
                 Uses a default if not passed.
                 Needed for html contents that using different format and layouts.
+            **kwargs (Any): Additional optional arguments for customizations.
+
         """
         self.headers_to_split_on = dict(headers_to_split_on)

@@ -210,7 +212,7 @@ class HTMLSectionSplitter:
         return text_splitter.split_documents(results)

     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string
+        """Split HTML text string.

         Args:
             text: HTML text
@@ -236,6 +238,23 @@ class HTMLSectionSplitter:
         return documents

     def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
+        """Split an HTML document into sections based on specified header tags.
+
+        This method uses BeautifulSoup to parse the HTML content and divides it into
+        sections based on headers defined in `headers_to_split_on`. Each section
+        contains the header text, content under the header, and the tag name.
+
+        Args:
+            html_doc (str): The HTML document to be split into sections.
+
+        Returns:
+            List[Dict[str, Optional[str]]]: A list of dictionaries representing
+                sections.
+                Each dictionary contains:
+                    - 'header': The header text or a default title for the first section.
+                    - 'content': The content under the header.
+                    - 'tag_name': The name of the header tag (e.g., "h1", "h2").
+        """
         try:
             from bs4 import BeautifulSoup, PageElement  # type: ignore[import-untyped]
         except ImportError as e:
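A sketch of the section output described in the new docstring (editor's illustration, not part of the diff; assumes `beautifulsoup4` is installed):

.. code-block:: python

    from langchain_text_splitters import HTMLSectionSplitter

    splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
    sections = splitter.split_html_by_headers(
        "<html><body><h1>Intro</h1><p>Hello.</p></body></html>"
    )
    # Each section is a dict with 'header', 'content', and 'tag_name' keys.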
@@ -259,7 +278,7 @@ class HTMLSectionSplitter:
                     section_content: List = []
                 else:
                     current_header = header_element.text.strip()
-                    current_header_tag = header_element.name
+                    current_header_tag = header_element.name  # type: ignore[attr-defined]
                     section_content = []
             for element in header_element.next_elements:
                 if i + 1 < len(headers) and element == headers[i + 1]:
@@ -280,6 +299,18 @@ class HTMLSectionSplitter:
         return sections

     def convert_possible_tags_to_header(self, html_content: str) -> str:
+        """Convert specific HTML tags to headers using an XSLT transformation.
+
+        This method uses an XSLT file to transform the HTML content, converting
+        certain tags into headers for easier parsing. If no XSLT path is provided,
+        the HTML content is returned unchanged.
+
+        Args:
+            html_content (str): The HTML content to be transformed.
+
+        Returns:
+            str: The transformed HTML content as a string.
+        """
         if self.xslt_path is None:
             return html_content

@@ -299,7 +330,7 @@ class HTMLSectionSplitter:
         return str(result)

     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file
+        """Split HTML file.

         Args:
             file: HTML file
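And the higher-level `split_text` path for the same class (editor's illustration, not part of the diff; assumes `lxml` and `beautifulsoup4` are installed):

.. code-block:: python

    from langchain_text_splitters import HTMLSectionSplitter

    splitter = HTMLSectionSplitter(
        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
    )
    docs = splitter.split_text("<html><body><h1>Intro</h1><p>Hello.</p></body></html>")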
@@ -8,9 +8,38 @@ from langchain_core.documents import Document


 class RecursiveJsonSplitter:
+    """Splits JSON data into smaller, structured chunks while preserving hierarchy.
+
+    This class provides methods to split JSON data into smaller dictionaries or
+    JSON-formatted strings based on configurable maximum and minimum chunk sizes.
+    It supports nested JSON structures, optionally converts lists into dictionaries
+    for better chunking, and allows the creation of document objects for further use.
+
+    Attributes:
+        max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
+        min_chunk_size (int): The minimum size for each chunk, derived from
+            `max_chunk_size` if not explicitly provided.
+    """
+
     def __init__(
         self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
     ):
+        """Initialize the chunk size configuration for text processing.
+
+        This constructor sets up the maximum and minimum chunk sizes, ensuring that
+        the `min_chunk_size` defaults to a value slightly smaller than the
+        `max_chunk_size` if not explicitly provided.
+
+        Args:
+            max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
+            min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
+                defaults to the maximum chunk size minus 200, with a lower bound of 50.
+
+        Attributes:
+            max_chunk_size (int): The configured maximum size for each chunk.
+            min_chunk_size (int): The configured minimum size for each chunk, derived
+                from `max_chunk_size` if not explicitly provided.
+        """
         super().__init__()
         self.max_chunk_size = max_chunk_size
         self.min_chunk_size = (
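A usage sketch for the class documented above (editor's illustration, not part of the diff):

.. code-block:: python

    from langchain_text_splitters import RecursiveJsonSplitter

    splitter = RecursiveJsonSplitter(max_chunk_size=300)  # min defaults to max - 200
    data = {"site": {"home": "intro text", "about": "company text"}}
    chunks = splitter.split_json(json_data=data)  # list of dicts, each within the size limits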
@@ -51,9 +80,7 @@ class RecursiveJsonSplitter:
         current_path: Optional[List[str]] = None,
         chunks: Optional[List[Dict]] = None,
     ) -> List[Dict]:
-        """
-        Split json into maximum size dictionaries while preserving structure.
-        """
+        """Split json into maximum size dictionaries while preserving structure."""
         current_path = current_path or []
         chunks = chunks if chunks is not None else [{}]
         if isinstance(data, dict):
@@ -83,8 +110,7 @@ class RecursiveJsonSplitter:
         json_data: Dict[str, Any],
         convert_lists: bool = False,
     ) -> List[Dict]:
-        """Splits JSON into a list of JSON chunks"""
-
+        """Splits JSON into a list of JSON chunks."""
         if convert_lists:
             chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
         else:
@@ -101,8 +127,7 @@ class RecursiveJsonSplitter:
         convert_lists: bool = False,
         ensure_ascii: bool = True,
     ) -> List[str]:
-        """Splits JSON into a list of JSON formatted strings"""
-
+        """Splits JSON into a list of JSON formatted strings."""
         chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)

         # Convert to string
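The string-producing variant sits next to the dict-producing one (editor's illustration, not part of the diff):

.. code-block:: python

    from langchain_text_splitters import RecursiveJsonSplitter

    splitter = RecursiveJsonSplitter(max_chunk_size=300)
    data = {"a": {"b": "one", "c": "two"}}
    texts = splitter.split_text(json_data=data)  # list of JSON-formatted strings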
@@ -45,7 +45,8 @@ class MarkdownHeaderTextSplitter:
         self.strip_headers = strip_headers

     def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
-        """Combine lines with common metadata into chunks
+        """Combine lines with common metadata into chunks.
+
         Args:
             lines: Line of text / associated header metadata
         """
@@ -87,10 +88,11 @@ class MarkdownHeaderTextSplitter:
         ]

     def split_text(self, text: str) -> List[Document]:
-        """Split markdown file
+        """Split markdown file.
+
         Args:
-            text: Markdown file"""
+            text: Markdown file
+        """
         # Split the input text by newline character ("\n").
         lines = text.split("\n")
         # Final output
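A usage sketch for the markdown splitter above (editor's illustration, not part of the diff):

.. code-block:: python

    from langchain_text_splitters import MarkdownHeaderTextSplitter

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]
    )
    docs = splitter.split_text("# Title\n\nIntro text.\n\n## Section\n\nBody text.")
    # Header values are recorded in each Document's metadata.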
@@ -225,8 +227,7 @@ class HeaderType(TypedDict):


 class ExperimentalMarkdownSyntaxTextSplitter:
-    """
-    An experimental text splitter for handling Markdown syntax.
+    """An experimental text splitter for handling Markdown syntax.

     This splitter aims to retain the exact whitespace of the original text while
     extracting structured metadata, such as headers. It is a re-implementation of the
@@ -280,6 +281,22 @@ class ExperimentalMarkdownSyntaxTextSplitter:
         return_each_line: bool = False,
         strip_headers: bool = True,
     ):
+        """Initialize the text splitter with header splitting and formatting options.
+
+        This constructor sets up the required configuration for splitting text into
+        chunks based on specified headers and formatting preferences.
+
+        Args:
+            headers_to_split_on (Union[List[Tuple[str, str]], None]):
+                A list of tuples, where each tuple contains a header tag (e.g., "h1")
+                and its corresponding metadata key. If None, default headers are used.
+            return_each_line (bool):
+                Whether to return each line as an individual chunk.
+                Defaults to False, which aggregates lines into larger chunks.
+            strip_headers (bool):
+                Whether to exclude headers from the resulting chunks.
+                Defaults to True.
+        """
         self.chunks: List[Document] = []
         self.current_chunk = Document(page_content="")
         self.current_header_stack: List[Tuple[int, str]] = []
@@ -292,6 +309,21 @@ class ExperimentalMarkdownSyntaxTextSplitter:
         self.return_each_line = return_each_line

     def split_text(self, text: str) -> List[Document]:
+        """Split the input text into structured chunks.
+
+        This method processes the input text line by line, identifying and handling
+        specific patterns such as headers, code blocks, and horizontal rules to
+        split it into structured chunks based on headers, code blocks, and
+        horizontal rules.
+
+        Args:
+            text (str): The input text to be split into chunks.
+
+        Returns:
+            List[Document]: A list of `Document` objects representing the structured
+                chunks of the input text. If `return_each_line` is enabled, each line
+                is returned as a separate `Document`.
+        """
         raw_lines = text.splitlines(keepends=True)

         while raw_lines:
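A usage sketch for the experimental splitter (editor's illustration, not part of the diff):

.. code-block:: python

    from langchain_text_splitters import ExperimentalMarkdownSyntaxTextSplitter

    splitter = ExperimentalMarkdownSyntaxTextSplitter(
        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")],
        strip_headers=False,  # keep header lines in the chunk text
    )
    docs = splitter.split_text("# Title\n\nSome text\n\n## Section\n\nMore text\n")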
@@ -51,6 +51,20 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
         )

     def split_text(self, text: str) -> List[str]:
+        """Splits the input text into smaller components by splitting text on tokens.
+
+        This method encodes the input text using a private `_encode` method, then
+        strips the start and stop token IDs from the encoded result. It returns the
+        processed segments as a list of strings.
+
+        Args:
+            text (str): The input text to be split.
+
+        Returns:
+            List[str]: A list of string components derived from the input text after
+                encoding and processing.
+        """
+
         def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
             return self._encode(text)[1:-1]

@@ -64,6 +78,17 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
         return split_text_on_tokens(text=text, tokenizer=tokenizer)

     def count_tokens(self, *, text: str) -> int:
+        """Counts the number of tokens in the given text.
+
+        This method encodes the input text using a private `_encode` method and
+        calculates the total number of tokens in the encoded result.
+
+        Args:
+            text (str): The input text for which the token count is calculated.
+
+        Returns:
+            int: The number of tokens in the encoded text.
+        """
         return len(self._encode(text))

     _max_length_equal_32_bit_integer: int = 2**32
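A usage sketch for the two methods above (editor's illustration, not part of the diff; assumes `sentence-transformers` is installed and the default model can be downloaded):

.. code-block:: python

    from langchain_text_splitters import SentenceTransformersTokenTextSplitter

    splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=64, chunk_overlap=0)
    print(splitter.count_tokens(text="Lorem ipsum dolor sit amet."))  # includes start/stop tokens
    print(splitter.split_text("Lorem ipsum dolor sit amet."))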
@@ -8,7 +8,6 @@ from langchain_text_splitters.base import TextSplitter
 class SpacyTextSplitter(TextSplitter):
     """Splitting text using Spacy package.

-
     Per default, Spacy's `en_core_web_sm` model is used and
     its default max_length is 1000000 (it is the length of maximum character
     this model takes which can be increased for large files). For a faster, but
@@ -26,7 +26,20 @@ python = ">=3.9,<4.0"
 langchain-core = "^0.3.15"

 [tool.ruff.lint]
-select = [ "E", "F", "I", "T201",]
+select = [
+    "E",  # pycodestyle
+    "F",  # Pyflakes
+    "I",  # isort
+    "T201",  # print
+    "D",  # pydocstyle
+]
+ignore = ["D100"]  # ignore missing module docstring
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+"tests/**" = ["D"]  # ignore docstring checks for tests

 [tool.coverage.run]
 omit = [ "tests/*",]