mirror of
https://github.com/hwchase17/langchain.git
synced 2025-04-29 20:35:43 +00:00
As seen in #23188, turned on Google-style docstrings by enabling `pydocstyle` linting in the `text-splitters` package. Each resulting linting error was handled in one of several ways: ignored, resolved, or suppressed; missing docstrings were also added. Fixes one of the checklist items from #25154, similar to #25939 in the `core` package. Ran `make format`, `make lint` and `make test` from the root of the `text-splitters` package to ensure no issues were found. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
61 lines
1.9 KiB
Python
61 lines
1.9 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import Any, List
|
|
|
|
from langchain_text_splitters.base import TextSplitter
|
|
|
|
|
|
class SpacyTextSplitter(TextSplitter):
    """Text splitter that finds sentence boundaries with the spaCy package.

    By default spaCy's `en_core_web_sm` model is used, with a default
    `max_length` of 1000000 (the maximum number of characters the model
    accepts; it can be increased for large files). For faster, but
    potentially less accurate, splitting use `pipeline='sentencizer'`.
    """

    def __init__(
        self,
        separator: str = "\n\n",
        pipeline: str = "en_core_web_sm",
        max_length: int = 1_000_000,
        *,
        strip_whitespace: bool = True,
        **kwargs: Any,
    ) -> None:
        """Set up the splitter and build its spaCy pipeline.

        Args:
            separator: String handed to `_merge_splits` together with the
                sentence pieces when chunks are assembled.
            pipeline: Pipeline name forwarded to
                `_make_spacy_pipeline_for_splitting`.
            max_length: Forwarded to `_make_spacy_pipeline_for_splitting`
                as the pipeline's `max_length`.
            strip_whitespace: If true, each sentence's `text` is used;
                otherwise its `text_with_ws` is used.
            **kwargs: Forwarded unchanged to the parent class initializer.
        """
        super().__init__(**kwargs)
        # Plain configuration first; these overwrite anything the parent
        # initializer may have stored under the same attribute names.
        self._separator = separator
        self._strip_whitespace = strip_whitespace
        self._tokenizer = _make_spacy_pipeline_for_splitting(
            pipeline, max_length=max_length
        )

    def split_text(self, text: str) -> List[str]:
        """Split `text` into sentences and merge them into chunks.

        Args:
            text: The text to split.

        Returns:
            The chunks produced by `_merge_splits` from the sentence pieces.
        """
        doc = self._tokenizer(text)
        # Both variants stay lazy generators; only the attribute read from
        # each sentence span differs.
        if self._strip_whitespace:
            sentences = (sent.text for sent in doc.sents)
        else:
            sentences = (sent.text_with_ws for sent in doc.sents)
        return self._merge_splits(sentences, self._separator)
|
|
|
|
|
|
def _make_spacy_pipeline_for_splitting(
    pipeline: str, *, max_length: int = 1_000_000
) -> Any:  # avoid importing spacy
    """Build the spaCy pipeline used to split text into sentences.

    Args:
        pipeline: Either the literal `"sentencizer"`, which builds a blank
            `English` pipeline and adds spaCy's `sentencizer` component to
            it, or a model name that is passed to `spacy.load` (with the
            `ner` and `tagger` components excluded).
        max_length: Value assigned to the pipeline's `max_length` attribute.

    Returns:
        The configured spaCy pipeline object.

    Raises:
        ImportError: If the `spacy` package cannot be imported. The original
            import failure is attached as the exception's cause.
    """
    try:
        import spacy
    except ImportError as err:
        # Chain the original failure so the real import problem stays
        # visible in the traceback.
        raise ImportError(
            "Spacy is not installed, please install it with `pip install spacy`."
        ) from err

    if pipeline == "sentencizer":
        from spacy.lang.en import English

        sentencizer: Any = English()
        sentencizer.add_pipe("sentencizer")
    else:
        sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])

    # Apply the caller's limit to whichever pipeline was built, so that
    # `max_length` is honoured for the "sentencizer" pipeline as well.
    sentencizer.max_length = max_length
    return sentencizer
|