text-splitters[minor], langchain[minor], community[patch], templates, docs: langchain-text-splitters 0.0.1 (#18346)
libs/text-splitters/langchain_text_splitters/spacy.py (new file, 55 lines)
@@ -0,0 +1,55 @@
from __future__ import annotations

from typing import Any, List

from langchain_text_splitters.base import TextSplitter


class SpacyTextSplitter(TextSplitter):
    """Splitting text using the spaCy package.

    By default, spaCy's `en_core_web_sm` model is used, with its default
    `max_length` of 1,000,000 (the maximum number of characters the model
    accepts, which can be increased for large files). For faster, but
    potentially less accurate, splitting you can use `pipeline='sentencizer'`.
    """

    def __init__(
        self,
        separator: str = "\n\n",
        pipeline: str = "en_core_web_sm",
        max_length: int = 1_000_000,
        **kwargs: Any,
    ) -> None:
        """Initialize the spacy text splitter."""
        super().__init__(**kwargs)
        self._tokenizer = _make_spacy_pipeline_for_splitting(
            pipeline, max_length=max_length
        )
        self._separator = separator

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        splits = (s.text for s in self._tokenizer(text).sents)
        return self._merge_splits(splits, self._separator)

def _make_spacy_pipeline_for_splitting(
    pipeline: str, *, max_length: int = 1_000_000
) -> Any:  # avoid importing spacy at module level
    try:
        import spacy
    except ImportError:
        raise ImportError(
            "Spacy is not installed, please install it with `pip install spacy`."
        )
    if pipeline == "sentencizer":
        # Lightweight rule-based sentence segmentation; no trained model needed.
        from spacy.lang.en import English

        sentencizer: Any = English()
        sentencizer.add_pipe("sentencizer")
    else:
        # Load the named pipeline, dropping components not needed for sentence
        # boundary detection to speed up processing.
        sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
    sentencizer.max_length = max_length
    return sentencizer
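A minimal usage sketch of the new splitter, assuming spaCy and the `en_core_web_sm` model are installed (`pip install spacy && python -m spacy download en_core_web_sm`); the `chunk_size` and `chunk_overlap` parameters come from the `TextSplitter` base class, and `long_text` here is just a placeholder input:

from langchain_text_splitters.spacy import SpacyTextSplitter

long_text = " ".join(["This is a sentence."] * 500)  # any large input string

# Default pipeline: segments sentences with en_core_web_sm, then merges them
# into chunks via the base TextSplitter machinery.
splitter = SpacyTextSplitter(chunk_size=1000, chunk_overlap=0)
chunks = splitter.split_text(long_text)

# Faster, rule-based alternative that skips loading a trained model:
fast_splitter = SpacyTextSplitter(pipeline="sentencizer")
fast_chunks = fast_splitter.split_text(long_text)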