mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-10 15:33:11 +00:00
text-splitters: add pydocstyle linting (#28127)
As seen in #23188, turned on Google-style docstrings by enabling `pydocstyle` linting in the `text-splitters` package. Each resulting linting error was addressed in one of several ways: ignored, resolved, or suppressed, and missing docstrings were added. Fixes one of the checklist items from #25154, similar to #25939 in the `core` package. Ran `make format`, `make lint` and `make test` from the root of the `text-splitters` package to ensure no issues were found. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -8,9 +8,38 @@ from langchain_core.documents import Document
|
||||
|
||||
|
||||
class RecursiveJsonSplitter:
|
||||
"""Splits JSON data into smaller, structured chunks while preserving hierarchy.
|
||||
|
||||
This class provides methods to split JSON data into smaller dictionaries or
|
||||
JSON-formatted strings based on configurable maximum and minimum chunk sizes.
|
||||
It supports nested JSON structures, optionally converts lists into dictionaries
|
||||
for better chunking, and allows the creation of document objects for further use.
|
||||
|
||||
Attributes:
|
||||
max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
|
||||
min_chunk_size (int): The minimum size for each chunk, derived from
|
||||
`max_chunk_size` if not explicitly provided.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
|
||||
):
|
||||
"""Initialize the chunk size configuration for text processing.
|
||||
|
||||
This constructor sets up the maximum and minimum chunk sizes, ensuring that
|
||||
the `min_chunk_size` defaults to a value slightly smaller than the
|
||||
`max_chunk_size` if not explicitly provided.
|
||||
|
||||
Args:
|
||||
max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
|
||||
min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
|
||||
defaults to the maximum chunk size minus 200, with a lower bound of 50.
|
||||
|
||||
Attributes:
|
||||
max_chunk_size (int): The configured maximum size for each chunk.
|
||||
min_chunk_size (int): The configured minimum size for each chunk, derived
|
||||
from `max_chunk_size` if not explicitly provided.
|
||||
"""
|
||||
super().__init__()
|
||||
self.max_chunk_size = max_chunk_size
|
||||
self.min_chunk_size = (
|
||||
@@ -51,9 +80,7 @@ class RecursiveJsonSplitter:
|
||||
current_path: Optional[List[str]] = None,
|
||||
chunks: Optional[List[Dict]] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Split json into maximum size dictionaries while preserving structure.
|
||||
"""
|
||||
"""Split json into maximum size dictionaries while preserving structure."""
|
||||
current_path = current_path or []
|
||||
chunks = chunks if chunks is not None else [{}]
|
||||
if isinstance(data, dict):
|
||||
@@ -83,8 +110,7 @@ class RecursiveJsonSplitter:
|
||||
json_data: Dict[str, Any],
|
||||
convert_lists: bool = False,
|
||||
) -> List[Dict]:
|
||||
"""Splits JSON into a list of JSON chunks"""
|
||||
|
||||
"""Splits JSON into a list of JSON chunks."""
|
||||
if convert_lists:
|
||||
chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
|
||||
else:
|
||||
@@ -101,8 +127,7 @@ class RecursiveJsonSplitter:
|
||||
convert_lists: bool = False,
|
||||
ensure_ascii: bool = True,
|
||||
) -> List[str]:
|
||||
"""Splits JSON into a list of JSON formatted strings"""
|
||||
|
||||
"""Splits JSON into a list of JSON formatted strings."""
|
||||
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
|
||||
|
||||
# Convert to string
|
||||
|
Reference in New Issue
Block a user