max length attribute for spacy splitter for large docs (#13875)

For large documents the spaCy splitter doesn't work; it throws the error shown in the screenshot below.
The reason is that spaCy's default `max_length` is 1,000,000 characters and there was no option to
increase it, so I added a `max_length` parameter in this PR.


![image](https://github.com/langchain-ai/langchain/assets/73680423/613625c3-0e21-4834-9aad-2a73cf56eecc)
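
With this change, the limit can be raised when splitting very large documents. A minimal usage sketch (the file path and the 5,000,000 limit below are illustrative, not part of this PR; the import path matches the `text_splitter` module this PR touches):

```python
from langchain.text_splitter import SpacyTextSplitter

# Hypothetical input; any document longer than 1,000,000 characters
# previously hit spaCy's max_length error shown in the screenshot.
with open("large_report.txt", encoding="utf-8") as f:
    text = f.read()

# Raise spaCy's character limit so the pipeline accepts the full document.
splitter = SpacyTextSplitter(max_length=5_000_000)
chunks = splitter.split_text(text)
```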

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>

@@ -58,7 +58,9 @@ logger = logging.getLogger(__name__)
 TS = TypeVar("TS", bound="TextSplitter")
-def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing spacy
+def _make_spacy_pipeline_for_splitting(
+    pipeline: str, *, max_length: int = 1_000_000
+) -> Any:  # avoid importing spacy
     try:
         import spacy
     except ImportError:
@@ -72,6 +74,7 @@ def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing
         sentencizer.add_pipe("sentencizer")
     else:
         sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
+        sentencizer.max_length = max_length
     return sentencizer
@@ -1380,16 +1383,24 @@ class SpacyTextSplitter(TextSplitter):
     """Splitting text using Spacy package.
-    Per default, Spacy's `en_core_web_sm` model is used. For a faster, but
+    Per default, Spacy's `en_core_web_sm` model is used and
+    its default max_length is 1000000 (it is the length of maximum character
+    this model takes which can be increased for large files). For a faster, but
     potentially less accurate splitting, you can use `pipeline='sentencizer'`.
     """
     def __init__(
-        self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any
+        self,
+        separator: str = "\n\n",
+        pipeline: str = "en_core_web_sm",
+        max_length: int = 1_000_000,
+        **kwargs: Any,
     ) -> None:
         """Initialize the spacy text splitter."""
         super().__init__(**kwargs)
-        self._tokenizer = _make_spacy_pipeline_for_splitting(pipeline)
+        self._tokenizer = _make_spacy_pipeline_for_splitting(
+            pipeline, max_length=max_length
+        )
         self._separator = separator
     def split_text(self, text: str) -> List[str]:
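
For context, spaCy guards against very long inputs because its pipeline components can use a lot of memory, and processing a text longer than `nlp.max_length` raises a `ValueError` (error code E088). A rough sketch of the underlying spaCy behavior this PR works around; the text length and raised limit here are illustrative:

```python
import spacy

nlp = spacy.load("en_core_web_sm", exclude=["ner", "tagger"])
long_text = "word " * 400_000  # ~2,000,000 characters, above the 1,000,000 default

try:
    nlp(long_text)  # raises ValueError [E088] with the default max_length
except ValueError as err:
    print(err)

nlp.max_length = 5_000_000  # this is what the new max_length parameter sets internally
doc = nlp(long_text)        # now succeeds
```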