max length attribute for spacy splitter for large docs (#13875)
For large documents the spaCy splitter doesn't work: it throws an error (shown in the screenshot below) because spaCy's default max_length is 1,000,000 characters and there was no option to increase it. This PR adds a `max_length` parameter for that.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
commit 74045bf5c0
parent 0bc7c1b5b4
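For reference, a minimal sketch of the failure mode this fixes (assuming spaCy and the `en_core_web_sm` model are installed; the text and lengths are illustrative, not from the PR):

```python
import spacy

nlp = spacy.load("en_core_web_sm", exclude=["ner", "tagger"])
big_text = "A short sentence. " * 100_000  # ~1.8M characters

try:
    nlp(big_text)  # longer than nlp.max_length (1,000,000 by default)
except ValueError as err:
    print(err)  # spaCy's "[E088] Text of length ... exceeds maximum ..." error

# Raising the limit is what the new parameter does internally:
nlp.max_length = len(big_text) + 1
doc = nlp(big_text)  # succeeds, given enough memory
```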
@@ -58,7 +58,9 @@ logger = logging.getLogger(__name__)
 TS = TypeVar("TS", bound="TextSplitter")
 
 
-def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing spacy
+def _make_spacy_pipeline_for_splitting(
+    pipeline: str, *, max_length: int = 1_000_000
+) -> Any:  # avoid importing spacy
     try:
         import spacy
     except ImportError:
@@ -72,6 +74,7 @@ def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing
         sentencizer.add_pipe("sentencizer")
     else:
         sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
+        sentencizer.max_length = max_length
     return sentencizer
 
 
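Called directly, the patched helper now accepts the keyword. Illustrative only: this is a private function, and the `langchain.text_splitter` import path is assumed; real code should go through `SpacyTextSplitter` below.

```python
from langchain.text_splitter import _make_spacy_pipeline_for_splitting

# Build the splitting pipeline with a raised character cap.
nlp = _make_spacy_pipeline_for_splitting("en_core_web_sm", max_length=3_000_000)
assert nlp.max_length == 3_000_000
```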
@@ -1380,16 +1383,24 @@ class SpacyTextSplitter(TextSplitter):
     """Splitting text using Spacy package.
 
 
-    Per default, Spacy's `en_core_web_sm` model is used. For a faster, but
+    Per default, Spacy's `en_core_web_sm` model is used and
+    its default max_length is 1000000 (the maximum number of characters the
+    model can take, which can be increased for large files). For a faster, but
     potentially less accurate splitting, you can use `pipeline='sentencizer'`.
     """
 
     def __init__(
-        self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any
+        self,
+        separator: str = "\n\n",
+        pipeline: str = "en_core_web_sm",
+        max_length: int = 1_000_000,
+        **kwargs: Any,
     ) -> None:
         """Initialize the spacy text splitter."""
         super().__init__(**kwargs)
-        self._tokenizer = _make_spacy_pipeline_for_splitting(pipeline)
+        self._tokenizer = _make_spacy_pipeline_for_splitting(
+            pipeline, max_length=max_length
+        )
         self._separator = separator
 
     def split_text(self, text: str) -> List[str]:
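With the change in place, callers can lift spaCy's cap straight from the splitter. A usage sketch (the file name and sizes are illustrative; `chunk_size` and `chunk_overlap` come from the `TextSplitter` base class):

```python
from langchain.text_splitter import SpacyTextSplitter

# Previously any text over 1,000,000 characters hit spaCy's hard limit;
# max_length now raises that cap per splitter instance.
splitter = SpacyTextSplitter(
    pipeline="en_core_web_sm",
    max_length=5_000_000,  # accept documents up to ~5M characters
    chunk_size=1000,
    chunk_overlap=100,
)

with open("big_report.txt") as f:  # hypothetical large document
    chunks = splitter.split_text(f.read())
print(f"split into {len(chunks)} chunks")
```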