From 74045bf5c01630ff0df3717280b8df3a5f4805ce Mon Sep 17 00:00:00 2001
From: Kunal <73680423+kunal8164705@users.noreply.github.com>
Date: Wed, 29 Nov 2023 09:00:26 +0530
Subject: [PATCH] max length attribute for spacy splitter for large docs (#13875)

For large documents the spacy splitter does not work: it throws the error
shown in the screenshot below. The reason is that its default max_length is
1,000,000 characters and there is no option to increase it, so this PR adds
one.

![image](https://github.com/langchain-ai/langchain/assets/73680423/613625c3-0e21-4834-9aad-2a73cf56eecc)

---------

Co-authored-by: Bagatur
---
 libs/langchain/langchain/text_splitter.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py
index 6000cd5ebec..c0ce1c74fac 100644
--- a/libs/langchain/langchain/text_splitter.py
+++ b/libs/langchain/langchain/text_splitter.py
@@ -58,7 +58,9 @@ logger = logging.getLogger(__name__)
 TS = TypeVar("TS", bound="TextSplitter")
 
 
-def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing spacy
+def _make_spacy_pipeline_for_splitting(
+    pipeline: str, *, max_length: int = 1_000_000
+) -> Any:  # avoid importing spacy
     try:
         import spacy
     except ImportError:
@@ -72,6 +74,7 @@ def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing
         sentencizer.add_pipe("sentencizer")
     else:
         sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
+    sentencizer.max_length = max_length
     return sentencizer
 
 
@@ -1380,16 +1383,24 @@
 class SpacyTextSplitter(TextSplitter):
     """Splitting text using Spacy package.
 
-    Per default, Spacy's `en_core_web_sm` model is used. For a faster, but
+    Per default, Spacy's `en_core_web_sm` model is used and its default
+    max_length is 1,000,000 (the maximum number of characters the model
+    will process, which can be raised for large files). For a faster, but
     potentially less accurate splitting, you can use `pipeline='sentencizer'`.
     """
 
     def __init__(
-        self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any
+        self,
+        separator: str = "\n\n",
+        pipeline: str = "en_core_web_sm",
+        max_length: int = 1_000_000,
+        **kwargs: Any,
     ) -> None:
         """Initialize the spacy text splitter."""
         super().__init__(**kwargs)
-        self._tokenizer = _make_spacy_pipeline_for_splitting(pipeline)
+        self._tokenizer = _make_spacy_pipeline_for_splitting(
+            pipeline, max_length=max_length
+        )
         self._separator = separator
 
     def split_text(self, text: str) -> List[str]:
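
As a usage sketch (not part of the patch itself), this is how the new `max_length` parameter would be exercised once the change lands; the `2_000_000` limit, chunk sizes, and sample text below are illustrative values, not defaults:

```python
from langchain.text_splitter import SpacyTextSplitter

# Requires `pip install spacy` and `python -m spacy download en_core_web_sm`.
# A document longer than spacy's default 1_000_000-character limit previously
# raised the error shown above; raising max_length lets the pipeline accept it.
long_text = "Some sentence. " * 100_000  # ~1.5M characters, illustrative only

splitter = SpacyTextSplitter(
    pipeline="en_core_web_sm",
    max_length=2_000_000,  # above the 1_000_000 default this PR exposes
    chunk_size=1000,       # TextSplitter kwargs are forwarded via **kwargs
    chunk_overlap=100,
)
chunks = splitter.split_text(long_text)
print(f"{len(chunks)} chunks")
```

Note that `chunk_size` and `chunk_overlap` come from the `TextSplitter` base class and are passed through unchanged; only `max_length` is new in this PR.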