text-splitters: add pydocstyle linting (#28127)

As seen in #23188, this change turns on Google-style docstring checks by
enabling `pydocstyle` linting in the `text-splitters` package. Each resulting
linting error was addressed in one of several ways: it was ignored, resolved,
or suppressed, and missing docstrings were added.

Fixes one of the checklist items from #25154, similar to #25939 in the
`core` package. Ran `make format`, `make lint`, and `make test` from the root
of the `text-splitters` package to ensure no issues were found.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Ankit Dangi
2024-12-08 22:01:03 -08:00
committed by GitHub
parent b53f07bfb9
commit 90f162efb6
9 changed files with 194 additions and 27 deletions

View File

@@ -8,9 +8,38 @@ from langchain_core.documents import Document
class RecursiveJsonSplitter:
"""Splits JSON data into smaller, structured chunks while preserving hierarchy.
This class provides methods to split JSON data into smaller dictionaries or
JSON-formatted strings based on configurable maximum and minimum chunk sizes.
It supports nested JSON structures, optionally converts lists into dictionaries
for better chunking, and allows the creation of document objects for further use.
Attributes:
max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
min_chunk_size (int): The minimum size for each chunk, derived from
`max_chunk_size` if not explicitly provided.
"""
def __init__(
self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
):
"""Initialize the chunk size configuration for text processing.
This constructor sets up the maximum and minimum chunk sizes, ensuring that
the `min_chunk_size` defaults to a value slightly smaller than the
`max_chunk_size` if not explicitly provided.
Args:
max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
defaults to the maximum chunk size minus 200, with a lower bound of 50.
Attributes:
max_chunk_size (int): The configured maximum size for each chunk.
min_chunk_size (int): The configured minimum size for each chunk, derived
from `max_chunk_size` if not explicitly provided.
"""
super().__init__()
self.max_chunk_size = max_chunk_size
self.min_chunk_size = (
@@ -51,9 +80,7 @@ class RecursiveJsonSplitter:
current_path: Optional[List[str]] = None,
chunks: Optional[List[Dict]] = None,
) -> List[Dict]:
"""
Split json into maximum size dictionaries while preserving structure.
"""
"""Split json into maximum size dictionaries while preserving structure."""
current_path = current_path or []
chunks = chunks if chunks is not None else [{}]
if isinstance(data, dict):
@@ -83,8 +110,7 @@ class RecursiveJsonSplitter:
json_data: Dict[str, Any],
convert_lists: bool = False,
) -> List[Dict]:
"""Splits JSON into a list of JSON chunks"""
"""Splits JSON into a list of JSON chunks."""
if convert_lists:
chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
else:
@@ -101,8 +127,7 @@ class RecursiveJsonSplitter:
convert_lists: bool = False,
ensure_ascii: bool = True,
) -> List[str]:
"""Splits JSON into a list of JSON formatted strings"""
"""Splits JSON into a list of JSON formatted strings."""
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
# Convert to string