From 74045bf5c01630ff0df3717280b8df3a5f4805ce Mon Sep 17 00:00:00 2001
From: Kunal <73680423+kunal8164705@users.noreply.github.com>
Date: Wed, 29 Nov 2023 09:00:26 +0530
Subject: [PATCH] max length attribute for spacy splitter for large docs (#13875)

For large documents the spacy splitter does not work: it throws the error
shown in the screenshot below. The reason is that its default max_length is
1,000,000 characters and there is no option to increase it, so this PR adds
one.

![image](https://github.com/langchain-ai/langchain/assets/73680423/613625c3-0e21-4834-9aad-2a73cf56eecc)

---------

Co-authored-by: Bagatur
---
 libs/langchain/langchain/text_splitter.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py
index 6000cd5ebec..c0ce1c74fac 100644
--- a/libs/langchain/langchain/text_splitter.py
+++ b/libs/langchain/langchain/text_splitter.py
@@ -58,7 +58,9 @@ logger = logging.getLogger(__name__)
 TS = TypeVar("TS", bound="TextSplitter")
 
 
-def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing spacy
+def _make_spacy_pipeline_for_splitting(
+    pipeline: str, *, max_length: int = 1_000_000
+) -> Any:  # avoid importing spacy
     try:
         import spacy
     except ImportError:
@@ -72,6 +74,7 @@ def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing
         sentencizer.add_pipe("sentencizer")
     else:
         sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
+    sentencizer.max_length = max_length
     return sentencizer
 
 
@@ -1380,16 +1383,24 @@
 class SpacyTextSplitter(TextSplitter):
     """Splitting text using Spacy package.
 
-    Per default, Spacy's `en_core_web_sm` model is used. For a faster, but
+    Per default, Spacy's `en_core_web_sm` model is used and its default
+    max_length is 1,000,000 (the maximum number of characters the model
+    will process, which can be raised for large files). For a faster, but
     potentially less accurate splitting, you can use `pipeline='sentencizer'`.
     """
 
     def __init__(
-        self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any
+        self,
+        separator: str = "\n\n",
+        pipeline: str = "en_core_web_sm",
+        max_length: int = 1_000_000,
+        **kwargs: Any,
     ) -> None:
         """Initialize the spacy text splitter."""
         super().__init__(**kwargs)
-        self._tokenizer = _make_spacy_pipeline_for_splitting(pipeline)
+        self._tokenizer = _make_spacy_pipeline_for_splitting(
+            pipeline, max_length=max_length
+        )
         self._separator = separator
 
     def split_text(self, text: str) -> List[str]:
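
As a usage sketch (not part of the patch itself), this is how the new `max_length` parameter would be exercised once the change lands; the `2_000_000` limit, chunk sizes, and sample text below are illustrative values, not defaults:

```python
from langchain.text_splitter import SpacyTextSplitter

# Requires `pip install spacy` and `python -m spacy download en_core_web_sm`.
# A document longer than spacy's default 1_000_000-character limit previously
# raised the error shown above; raising max_length lets the pipeline accept it.
long_text = "Some sentence. " * 100_000  # ~1.5M characters, illustrative only

splitter = SpacyTextSplitter(
    pipeline="en_core_web_sm",
    max_length=2_000_000,  # above the 1_000_000 default this PR exposes
    chunk_size=1000,       # TextSplitter kwargs are forwarded via **kwargs
    chunk_overlap=100,
)
chunks = splitter.split_text(long_text)
print(f"{len(chunks)} chunks")
```

Note that `chunk_size` and `chunk_overlap` come from the `TextSplitter` base class and are passed through unchanged; only `max_length` is new in this PR.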