mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-11 07:50:47 +00:00
text-splitters: add pydocstyle linting (#28127)
As seen in #23188, this turns on Google-style docstring checks by enabling `pydocstyle` linting in the `text-splitters` package. Each resulting linting error was handled in one of four ways: ignored, resolved, suppressed, or fixed by adding the missing docstring. Fixes one of the checklist items from #25154, similar to #25939 in the `core` package. Ran `make format`, `make lint` and `make test` from the root of the `text-splitters` package to ensure no issues were found. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -51,6 +51,20 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
|
||||
)
|
||||
|
||||
def split_text(self, text: str) -> List[str]:
|
||||
"""Splits the input text into smaller components by splitting text on tokens.
|
||||
|
||||
This method encodes the input text using a private `_encode` method, then
|
||||
strips the start and stop token IDs from the encoded result. It returns the
|
||||
processed segments as a list of strings.
|
||||
|
||||
Args:
|
||||
text (str): The input text to be split.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of string components derived from the input text after
|
||||
encoding and processing.
|
||||
"""
|
||||
|
||||
def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
|
||||
return self._encode(text)[1:-1]
|
||||
|
||||
@@ -64,6 +78,17 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
|
||||
return split_text_on_tokens(text=text, tokenizer=tokenizer)
|
||||
|
||||
def count_tokens(self, *, text: str) -> int:
    """Return how many token IDs the encoder produces for the given text.

    The text is handed to the private `_encode` method and the length of
    the sequence it returns is reported as-is; nothing is trimmed from the
    encoded result before counting.

    Args:
        text (str): The input text to measure. Must be passed by keyword.

    Returns:
        int: The length of the sequence returned by `_encode` for `text`.
    """
    token_ids = self._encode(text)
    return len(token_ids)
|
||||
|
||||
# 2**32 == 4294967296, i.e. one more than the largest unsigned 32-bit value.
# NOTE(review): presumably used as an effectively unbounded max length when
# encoding — its use is not visible here; confirm against the caller.
_max_length_equal_32_bit_integer: int = 2**32
|
||||
|
Reference in New Issue
Block a user