langchain/libs/text-splitters/langchain_text_splitters/spacy.py

from __future__ import annotations

from typing import Any, List

from langchain_text_splitters.base import TextSplitter


class SpacyTextSplitter(TextSplitter):
"""Splitting text using Spacy package.
Per default, Spacy's `en_core_web_sm` model is used and
its default max_length is 1000000 (it is the length of maximum character
this model takes which can be increased for large files). For a faster, but
potentially less accurate splitting, you can use `pipeline='sentencizer'`.
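
    Example:
        A minimal usage sketch; it assumes ``spacy`` is installed and the
        ``en_core_web_sm`` model has been downloaded, e.g. via
        ``python -m spacy download en_core_web_sm``:

        .. code-block:: python

            from langchain_text_splitters import SpacyTextSplitter

            splitter = SpacyTextSplitter(chunk_size=1000, chunk_overlap=0)
            chunks = splitter.split_text("One sentence. Another sentence.")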
"""
def __init__(
self,
separator: str = "\n\n",
pipeline: str = "en_core_web_sm",
max_length: int = 1_000_000,
*,
strip_whitespace: bool = True,
**kwargs: Any,
) -> None:
"""Initialize the spacy text splitter."""
super().__init__(**kwargs)
self._tokenizer = _make_spacy_pipeline_for_splitting(
pipeline, max_length=max_length
)
self._separator = separator
        self._strip_whitespace = strip_whitespace

    def split_text(self, text: str) -> List[str]:
"""Split incoming text and return chunks."""
splits = (
s.text if self._strip_whitespace else s.text_with_ws
for s in self._tokenizer(text).sents
)
        return self._merge_splits(splits, self._separator)


def _make_spacy_pipeline_for_splitting(
pipeline: str, *, max_length: int = 1_000_000
) -> Any: # avoid importing spacy
try:
import spacy
except ImportError:
raise ImportError(
"Spacy is not installed, please install it with `pip install spacy`."
)
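    # The rule-based "sentencizer" runs on a blank English pipeline: it is
    # faster than a statistical model and needs no model download, at the
    # cost of potentially less accurate sentence boundaries.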
if pipeline == "sentencizer":
        from spacy.lang.en import English

        sentencizer: Any = English()
sentencizer.add_pipe("sentencizer")
else:
sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
sentencizer.max_length = max_length
return sentencizer
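

# Minimal usage sketch: assumes ``spacy`` is installed; the rule-based
# "sentencizer" pipeline needs no model download.
if __name__ == "__main__":
    splitter = SpacyTextSplitter(
        pipeline="sentencizer", chunk_size=100, chunk_overlap=0
    )
    sample = "spaCy splits text into sentences. They are then merged into chunks."
    for chunk in splitter.split_text(sample):
        print(chunk)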