max length attribute for spacy splitter for large docs (#13875)

For large documents the spaCy splitter doesn't work; it throws the error shown in the screenshot below.
The reason is that spaCy's default `max_length` is 1,000,000 characters and there was no option to
increase it, so I added a `max_length` parameter in this PR.


![image](https://github.com/langchain-ai/langchain/assets/73680423/613625c3-0e21-4834-9aad-2a73cf56eecc)
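
With this change, the limit can be raised when splitting very large documents. A minimal usage sketch (the file path and the 5,000,000 limit below are illustrative, not part of this PR; the import path matches the `text_splitter` module this PR touches):

```python
from langchain.text_splitter import SpacyTextSplitter

# Hypothetical input; any document longer than 1,000,000 characters
# previously hit spaCy's max_length error shown in the screenshot.
with open("large_report.txt", encoding="utf-8") as f:
    text = f.read()

# Raise spaCy's character limit so the pipeline accepts the full document.
splitter = SpacyTextSplitter(max_length=5_000_000)
chunks = splitter.split_text(text)
```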

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>

@@ -58,7 +58,9 @@ logger = logging.getLogger(__name__)
 TS = TypeVar("TS", bound="TextSplitter")
-def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing spacy
+def _make_spacy_pipeline_for_splitting(
+    pipeline: str, *, max_length: int = 1_000_000
+) -> Any:  # avoid importing spacy
     try:
         import spacy
     except ImportError:
@@ -72,6 +74,7 @@ def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing
         sentencizer.add_pipe("sentencizer")
     else:
         sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
+        sentencizer.max_length = max_length
     return sentencizer
@@ -1380,16 +1383,24 @@ class SpacyTextSplitter(TextSplitter):
     """Splitting text using Spacy package.
-    Per default, Spacy's `en_core_web_sm` model is used. For a faster, but
+    Per default, Spacy's `en_core_web_sm` model is used and
+    its default max_length is 1000000 (it is the length of maximum character
+    this model takes which can be increased for large files). For a faster, but
     potentially less accurate splitting, you can use `pipeline='sentencizer'`.
     """
     def __init__(
-        self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any
+        self,
+        separator: str = "\n\n",
+        pipeline: str = "en_core_web_sm",
+        max_length: int = 1_000_000,
+        **kwargs: Any,
     ) -> None:
         """Initialize the spacy text splitter."""
         super().__init__(**kwargs)
-        self._tokenizer = _make_spacy_pipeline_for_splitting(pipeline)
+        self._tokenizer = _make_spacy_pipeline_for_splitting(
+            pipeline, max_length=max_length
+        )
         self._separator = separator
     def split_text(self, text: str) -> List[str]:
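
For context, spaCy guards against very long inputs because its pipeline components can use a lot of memory, and processing a text longer than `nlp.max_length` raises a `ValueError` (error code E088). A rough sketch of the underlying spaCy behavior this PR works around; the text length and raised limit here are illustrative:

```python
import spacy

nlp = spacy.load("en_core_web_sm", exclude=["ner", "tagger"])
long_text = "word " * 400_000  # ~2,000,000 characters, above the 1,000,000 default

try:
    nlp(long_text)  # raises ValueError [E088] with the default max_length
except ValueError as err:
    print(err)

nlp.max_length = 5_000_000  # this is what the new max_length parameter sets internally
doc = nlp(long_text)        # now succeeds
```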