text-splitters: add pydocstyle linting (#28127)

As seen in #23188, turned on Google-style docstring checks by enabling
`pydocstyle` linting in the `text-splitters` package. Each resulting
linting error was handled case by case: some rules were ignored, some
violations were suppressed, others were resolved, and missing docstrings
were added where needed.

Fixes one of the checklist items from #25154, similar to #25939 in the
`core` package. Ran `make format`, `make lint`, and `make test` from the
root of the `text-splitters` package to confirm no issues were found.
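
For illustration, a `pydocstyle` finding under this setup can either be suppressed inline or ignored by rule when it is not worth fixing (a minimal sketch; the function and rule code below are examples, not code from this PR). Resolved errors correspond to the docstrings added throughout the diff below.

```python
# Suppress a single violation in place (D103 = missing docstring in a
# public function); the rest of the file is still checked.
def legacy_helper():  # noqa: D103
    ...
```

Whole rules can also be ignored package-wide, as this PR does for `D100` (missing module docstring) in `pyproject.toml`.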

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
Ankit Dangi 2024-12-08 22:01:03 -08:00 committed by GitHub
parent b53f07bfb9
commit 90f162efb6
9 changed files with 194 additions and 27 deletions


@ -1,6 +1,5 @@
"""**Text Splitters** are classes for splitting text.
**Class hierarchy:**
.. code-block::


@ -249,6 +249,21 @@ class TokenTextSplitter(TextSplitter):
self._disallowed_special = disallowed_special
def split_text(self, text: str) -> List[str]:
"""Splits the input text into smaller chunks based on tokenization.
This method uses a custom tokenizer configuration to encode the input text
into tokens, processes the tokens in chunks of a specified size with overlap,
and decodes them back into text chunks. The splitting is performed using the
`split_text_on_tokens` function.
Args:
text (str): The input text to be split into smaller chunks.
Returns:
List[str]: A list of text chunks, where each chunk is derived from a portion
of the input text based on the tokenization and chunking rules.
"""
def _encode(_text: str) -> List[int]:
return self._tokenizer.encode(
_text,

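For context, a minimal usage sketch of the `split_text` method documented above (chunk sizes are illustrative; `TokenTextSplitter` tokenizes with `tiktoken` by default, so that package must be installed):

```python
from langchain_text_splitters import TokenTextSplitter

# Encode the text, walk the tokens in overlapping windows, decode each window.
splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=16)
chunks = splitter.split_text("A long document to be split on token boundaries ...")
```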

@ -115,17 +115,45 @@ class RecursiveCharacterTextSplitter(TextSplitter):
return final_chunks
def split_text(self, text: str) -> List[str]:
"""Split the input text into smaller chunks based on predefined separators.
Args:
text (str): The input text to be split.
Returns:
List[str]: A list of text chunks obtained after splitting.
"""
return self._split_text(text, self._separators)
@classmethod
def from_language(
cls, language: Language, **kwargs: Any
) -> RecursiveCharacterTextSplitter:
"""Return an instance of this class based on a specific language.
This method initializes the text splitter with language-specific separators.
Args:
language (Language): The language to configure the text splitter for.
**kwargs (Any): Additional keyword arguments to customize the splitter.
Returns:
RecursiveCharacterTextSplitter: An instance of the text splitter configured
for the specified language.
"""
separators = cls.get_separators_for_language(language)
return cls(separators=separators, is_separator_regex=True, **kwargs)
@staticmethod
def get_separators_for_language(language: Language) -> List[str]:
"""Retrieve a list of separators specific to the given language.
Args:
language (Language): The language for which to get the separators.
Returns:
List[str]: A list of separators appropriate for the specified language.
"""
if language == Language.C or language == Language.CPP:
return [
# Split along class definitions

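A short usage sketch of `from_language` and the language-specific separators documented above (the language and sizes are illustrative):

```python
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# Configure separators for Python source: class/def boundaries, blank lines, etc.
splitter = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON, chunk_size=500, chunk_overlap=50
)
chunks = splitter.split_text("class Foo:\n    def bar(self):\n        return 1\n")
```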

@ -21,8 +21,8 @@ class ElementType(TypedDict):
class HTMLHeaderTextSplitter:
"""
Splitting HTML files based on specified headers.
"""Splitting HTML files based on specified headers.
Requires lxml package.
"""
@ -46,7 +46,7 @@ class HTMLHeaderTextSplitter:
def aggregate_elements_to_chunks(
self, elements: List[ElementType]
) -> List[Document]:
"""Combine elements with common metadata into chunks
"""Combine elements with common metadata into chunks.
Args:
elements: HTML element content with associated identifying info and metadata
@ -72,7 +72,7 @@ class HTMLHeaderTextSplitter:
]
def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
"""Split HTML from web URL
"""Split HTML from web URL.
Args:
url: web URL
@ -83,7 +83,7 @@ class HTMLHeaderTextSplitter:
return self.split_text_from_file(BytesIO(r.content))
def split_text(self, text: str) -> List[Document]:
"""Split HTML text string
"""Split HTML text string.
Args:
text: HTML text
@ -91,7 +91,7 @@ class HTMLHeaderTextSplitter:
return self.split_text_from_file(StringIO(text))
def split_text_from_file(self, file: Any) -> List[Document]:
"""Split HTML file
"""Split HTML file.
Args:
file: HTML file
@ -166,8 +166,8 @@ class HTMLHeaderTextSplitter:
class HTMLSectionSplitter:
"""
Splitting HTML files based on specified tag and font sizes.
"""Splitting HTML files based on specified tag and font sizes.
Requires lxml package.
"""
@ -186,6 +186,8 @@ class HTMLSectionSplitter:
xslt_path: path to xslt file for document transformation.
Uses a default if not passed.
Needed for html contents that using different format and layouts.
**kwargs (Any): Additional optional arguments for customizations.
"""
self.headers_to_split_on = dict(headers_to_split_on)
@ -210,7 +212,7 @@ class HTMLSectionSplitter:
return text_splitter.split_documents(results)
def split_text(self, text: str) -> List[Document]:
"""Split HTML text string
"""Split HTML text string.
Args:
text: HTML text
@ -236,6 +238,23 @@ class HTMLSectionSplitter:
return documents
def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
"""Split an HTML document into sections based on specified header tags.
This method uses BeautifulSoup to parse the HTML content and divides it into
sections based on headers defined in `headers_to_split_on`. Each section
contains the header text, content under the header, and the tag name.
Args:
html_doc (str): The HTML document to be split into sections.
Returns:
List[Dict[str, Optional[str]]]: A list of dictionaries representing
sections.
Each dictionary contains:
- 'header': The header text or a default title for the first section.
- 'content': The content under the header.
- 'tag_name': The name of the header tag (e.g., "h1", "h2").
"""
try:
from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped]
except ImportError as e:
@ -259,7 +278,7 @@ class HTMLSectionSplitter:
section_content: List = []
else:
current_header = header_element.text.strip()
current_header_tag = header_element.name
current_header_tag = header_element.name # type: ignore[attr-defined]
section_content = []
for element in header_element.next_elements:
if i + 1 < len(headers) and element == headers[i + 1]:
@ -280,6 +299,18 @@ class HTMLSectionSplitter:
return sections
def convert_possible_tags_to_header(self, html_content: str) -> str:
"""Convert specific HTML tags to headers using an XSLT transformation.
This method uses an XSLT file to transform the HTML content, converting
certain tags into headers for easier parsing. If no XSLT path is provided,
the HTML content is returned unchanged.
Args:
html_content (str): The HTML content to be transformed.
Returns:
str: The transformed HTML content as a string.
"""
if self.xslt_path is None:
return html_content
@ -299,7 +330,7 @@ class HTMLSectionSplitter:
return str(result)
def split_text_from_file(self, file: Any) -> List[Document]:
"""Split HTML file
"""Split HTML file.
Args:
file: HTML file

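A minimal usage sketch of `HTMLHeaderTextSplitter` as documented in the hunks above (the header mapping and HTML are illustrative; per the class docstring, the `lxml` package is required):

```python
from langchain_text_splitters import HTMLHeaderTextSplitter

headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
docs = splitter.split_text("<html><body><h1>Intro</h1><p>Hello world.</p></body></html>")
```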

@ -8,9 +8,38 @@ from langchain_core.documents import Document
class RecursiveJsonSplitter:
"""Splits JSON data into smaller, structured chunks while preserving hierarchy.
This class provides methods to split JSON data into smaller dictionaries or
JSON-formatted strings based on configurable maximum and minimum chunk sizes.
It supports nested JSON structures, optionally converts lists into dictionaries
for better chunking, and allows the creation of document objects for further use.
Attributes:
max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
min_chunk_size (int): The minimum size for each chunk, derived from
`max_chunk_size` if not explicitly provided.
"""
def __init__(
self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
):
"""Initialize the chunk size configuration for text processing.
This constructor sets up the maximum and minimum chunk sizes, ensuring that
the `min_chunk_size` defaults to a value slightly smaller than the
`max_chunk_size` if not explicitly provided.
Args:
max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
defaults to the maximum chunk size minus 200, with a lower bound of 50.
Attributes:
max_chunk_size (int): The configured maximum size for each chunk.
min_chunk_size (int): The configured minimum size for each chunk, derived
from `max_chunk_size` if not explicitly provided.
"""
super().__init__()
self.max_chunk_size = max_chunk_size
self.min_chunk_size = (
@ -51,9 +80,7 @@ class RecursiveJsonSplitter:
current_path: Optional[List[str]] = None,
chunks: Optional[List[Dict]] = None,
) -> List[Dict]:
"""
Split json into maximum size dictionaries while preserving structure.
"""
"""Split json into maximum size dictionaries while preserving structure."""
current_path = current_path or []
chunks = chunks if chunks is not None else [{}]
if isinstance(data, dict):
@ -83,8 +110,7 @@ class RecursiveJsonSplitter:
json_data: Dict[str, Any],
convert_lists: bool = False,
) -> List[Dict]:
"""Splits JSON into a list of JSON chunks"""
"""Splits JSON into a list of JSON chunks."""
if convert_lists:
chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
else:
@ -101,8 +127,7 @@ class RecursiveJsonSplitter:
convert_lists: bool = False,
ensure_ascii: bool = True,
) -> List[str]:
"""Splits JSON into a list of JSON formatted strings"""
"""Splits JSON into a list of JSON formatted strings."""
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
# Convert to string

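For context, a usage sketch of `RecursiveJsonSplitter` matching the docstrings above (the chunk size and data are illustrative):

```python
from langchain_text_splitters import RecursiveJsonSplitter

splitter = RecursiveJsonSplitter(max_chunk_size=300)

# Dictionaries that respect max_chunk_size while preserving nesting.
json_chunks = splitter.split_json(
    json_data={"a": {"b": 1, "c": [1, 2, 3]}}, convert_lists=True
)

# The same chunks serialized as JSON-formatted strings.
texts = splitter.split_text(
    json_data={"a": {"b": 1, "c": [1, 2, 3]}}, convert_lists=True
)
```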

@ -45,7 +45,8 @@ class MarkdownHeaderTextSplitter:
self.strip_headers = strip_headers
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
"""Combine lines with common metadata into chunks
"""Combine lines with common metadata into chunks.
Args:
lines: Line of text / associated header metadata
"""
@ -87,10 +88,11 @@ class MarkdownHeaderTextSplitter:
]
def split_text(self, text: str) -> List[Document]:
"""Split markdown file
Args:
text: Markdown file"""
"""Split markdown file.
Args:
text: Markdown file
"""
# Split the input text by newline character ("\n").
lines = text.split("\n")
# Final output
@ -225,8 +227,7 @@ class HeaderType(TypedDict):
class ExperimentalMarkdownSyntaxTextSplitter:
"""
An experimental text splitter for handling Markdown syntax.
"""An experimental text splitter for handling Markdown syntax.
This splitter aims to retain the exact whitespace of the original text while
extracting structured metadata, such as headers. It is a re-implementation of the
@ -280,6 +281,22 @@ class ExperimentalMarkdownSyntaxTextSplitter:
return_each_line: bool = False,
strip_headers: bool = True,
):
"""Initialize the text splitter with header splitting and formatting options.
This constructor sets up the required configuration for splitting text into
chunks based on specified headers and formatting preferences.
Args:
headers_to_split_on (Union[List[Tuple[str, str]], None]):
A list of tuples, where each tuple contains a header tag (e.g., "h1")
and its corresponding metadata key. If None, default headers are used.
return_each_line (bool):
Whether to return each line as an individual chunk.
Defaults to False, which aggregates lines into larger chunks.
strip_headers (bool):
Whether to exclude headers from the resulting chunks.
Defaults to True.
"""
self.chunks: List[Document] = []
self.current_chunk = Document(page_content="")
self.current_header_stack: List[Tuple[int, str]] = []
@ -292,6 +309,21 @@ class ExperimentalMarkdownSyntaxTextSplitter:
self.return_each_line = return_each_line
def split_text(self, text: str) -> List[Document]:
"""Split the input text into structured chunks.
This method processes the input text line by line, identifying and handling
specific patterns such as headers, code blocks, and horizontal rules to
split it into structured chunks based on headers, code blocks, and
horizontal rules.
Args:
text (str): The input text to be split into chunks.
Returns:
List[Document]: A list of `Document` objects representing the structured
chunks of the input text. If `return_each_line` is enabled, each line
is returned as a separate `Document`.
"""
raw_lines = text.splitlines(keepends=True)
while raw_lines:

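A usage sketch of `MarkdownHeaderTextSplitter` with the options referenced in the docstrings above (the header mapping and text are illustrative):

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)
docs = splitter.split_text("# Title\n\nIntro text.\n\n## Section\n\nBody text.\n")
```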

@ -51,6 +51,20 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
)
def split_text(self, text: str) -> List[str]:
"""Splits the input text into smaller components by splitting text on tokens.
This method encodes the input text using a private `_encode` method, then
strips the start and stop token IDs from the encoded result. It returns the
processed segments as a list of strings.
Args:
text (str): The input text to be split.
Returns:
List[str]: A list of string components derived from the input text after
encoding and processing.
"""
def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
return self._encode(text)[1:-1]
@ -64,6 +78,17 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
return split_text_on_tokens(text=text, tokenizer=tokenizer)
def count_tokens(self, *, text: str) -> int:
"""Counts the number of tokens in the given text.
This method encodes the input text using a private `_encode` method and
calculates the total number of tokens in the encoded result.
Args:
text (str): The input text for which the token count is calculated.
Returns:
int: The number of tokens in the encoded text.
"""
return len(self._encode(text))
_max_length_equal_32_bit_integer: int = 2**32

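A usage sketch of the two methods documented above (parameters are illustrative; the splitter loads its default `sentence-transformers/all-mpnet-base-v2` model, so `sentence-transformers` must be installed):

```python
from langchain_text_splitters import SentenceTransformersTokenTextSplitter

splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=64, chunk_overlap=8)
n_tokens = splitter.count_tokens(text="How many model tokens is this sentence?")
chunks = splitter.split_text("A longer passage to split on model token boundaries ...")
```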

@ -8,7 +8,6 @@ from langchain_text_splitters.base import TextSplitter
class SpacyTextSplitter(TextSplitter):
"""Splitting text using Spacy package.
Per default, Spacy's `en_core_web_sm` model is used and
its default max_length is 1000000 (it is the length of maximum character
this model takes which can be increased for large files). For a faster, but

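For context, a minimal usage sketch of `SpacyTextSplitter` (assumes `spacy` and its default `en_core_web_sm` pipeline are installed; the chunk size is illustrative):

```python
from langchain_text_splitters import SpacyTextSplitter

# Splits on spaCy sentence boundaries, then packs sentences into chunks.
splitter = SpacyTextSplitter(chunk_size=1000)
chunks = splitter.split_text("First sentence. Second sentence. Third sentence.")
```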

@ -26,7 +26,20 @@ python = ">=3.9,<4.0"
langchain-core = "^0.3.15"
[tool.ruff.lint]
select = [ "E", "F", "I", "T201",]
select = [
"E", # pycodestyle
"F", # Pyflakes
"I", # isort
"T201", # print
"D", # pydocstyle
]
ignore = ["D100"] # ignore missing module docstring
[tool.ruff.lint.pydocstyle]
convention = "google"
[tool.ruff.lint.per-file-ignores]
"tests/**" = ["D"] # ignore docstring checks for tests
[tool.coverage.run]
omit = [ "tests/*",]
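
Under the `google` convention selected above, a compliant docstring looks roughly like this (a sketch; the function is hypothetical, not part of the package):

```python
def count_words(text: str) -> int:
    """Count whitespace-separated words in a string.

    Args:
        text: The input string.

    Returns:
        The number of words found.
    """
    return len(text.split())
```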