Mirror of https://github.com/hwchase17/langchain.git, synced 2025-09-07 22:11:51 +00:00
text-splitters[patch]: fix some import-untyped errors (#31030)
commit eab8484a80
parent 672339f3c6
committed by GitHub
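The commit title refers to mypy's `import-untyped` diagnostic, reported when an import resolves to a package that is installed but ships no type stubs or `py.typed` marker. The recurring fix below is to drop the flagged `from pkg.sub import name` form (and its `# type: ignore[import-untyped]` comment) in favor of importing the package and using dotted attribute access, or importing from the submodule that actually defines the symbol. A minimal sketch of the pattern, using the nltk case from this commit:

# Before: the from-import needed an ignore comment.
# from nltk.corpus import stopwords  # type: ignore[import-untyped]
# words = set(stopwords.words("english"))

# After: import the top-level package and resolve attributes at use time.
import nltk

nltk.download("stopwords")
words = set(nltk.corpus.stopwords.words("english"))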
@@ -150,7 +150,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
     def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
         """Text splitter that uses HuggingFace tokenizer to count length."""
         try:
-            from transformers import PreTrainedTokenizerBase
+            from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
             if not isinstance(tokenizer, PreTrainedTokenizerBase):
                 raise ValueError(
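The classmethod patched above gates on `isinstance(tokenizer, PreTrainedTokenizerBase)`, so any Hugging Face tokenizer works. A usage sketch (the model name is illustrative):

from langchain_text_splitters import CharacterTextSplitter
from transformers import AutoTokenizer

# Any PreTrainedTokenizerBase subclass passes the isinstance check above.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=100, chunk_overlap=0
)
chunks = splitter.split_text("Some long document text ...")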
@@ -636,10 +636,9 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
         if self._stopword_removal:
             try:
                 import nltk
-                from nltk.corpus import stopwords  # type: ignore[import-untyped]
 
                 nltk.download("stopwords")
-                self._stopwords = set(stopwords.words(self._stopword_lang))
+                self._stopwords = set(nltk.corpus.stopwords.words(self._stopword_lang))
             except ImportError:
                 raise ImportError(
                     "Could not import nltk. Please install it with 'pip install nltk'."
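For reference, a rough usage sketch that exercises the patched branch; the `stopword_removal` and `stopword_lang` parameter names are inferred from the attributes above:

from langchain_text_splitters import HTMLSemanticPreservingSplitter

# stopword_removal=True triggers the nltk import and download shown above;
# stopword_lang selects which stopword list is loaded (names inferred).
splitter = HTMLSemanticPreservingSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
    stopword_removal=True,
    stopword_lang="english",
)
docs = splitter.split_text("<h1>Title</h1><p>The quick brown fox jumps.</p>")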
@@ -20,7 +20,7 @@ class KonlpyTextSplitter(TextSplitter):
         super().__init__(**kwargs)
         self._separator = separator
         try:
-            from konlpy.tag import Kkma
+            import konlpy
         except ImportError:
             raise ImportError(
                 """
@@ -28,7 +28,7 @@ class KonlpyTextSplitter(TextSplitter):
                 `pip install konlpy`
                 """
             )
-        self.kkma = Kkma()
+        self.kkma = konlpy.tag.Kkma()
 
     def split_text(self, text: str) -> List[str]:
         """Split incoming text and return chunks."""
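A usage sketch for the Korean splitter (requires `pip install konlpy`, and konlpy's Kkma backend additionally needs a Java runtime):

from langchain_text_splitters import KonlpyTextSplitter

# Kkma, now reached as konlpy.tag.Kkma, segments Korean text into sentences.
splitter = KonlpyTextSplitter()
chunks = splitter.split_text("안녕하세요. 오늘 날씨가 좋네요.")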
@@ -24,14 +24,12 @@ class NLTKTextSplitter(TextSplitter):
         if self._use_span_tokenize and self._separator != "":
             raise ValueError("When use_span_tokenize is True, separator should be ''")
         try:
+            import nltk
+
             if self._use_span_tokenize:
-                from nltk.tokenize import _get_punkt_tokenizer
-
-                self._tokenizer = _get_punkt_tokenizer(self._language)
+                self._tokenizer = nltk.tokenize._get_punkt_tokenizer(self._language)
             else:
-                from nltk.tokenize import sent_tokenize
-
-                self._tokenizer = sent_tokenize
+                self._tokenizer = nltk.tokenize.sent_tokenize
         except ImportError:
             raise ImportError(
                 "NLTK is not installed, please install it with `pip install nltk`."
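A usage sketch covering the two code paths above; the constructor itself enforces `separator=""` when `use_span_tokenize=True`, and the punkt data may need a one-time `nltk.download("punkt")`:

from langchain_text_splitters import NLTKTextSplitter

# use_span_tokenize=True routes through nltk.tokenize._get_punkt_tokenizer;
# the default False path uses nltk.tokenize.sent_tokenize instead.
splitter = NLTKTextSplitter(
    separator="", use_span_tokenize=True, chunk_size=200, chunk_overlap=0
)
chunks = splitter.split_text("First sentence. Second sentence. Third sentence.")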
@@ -50,9 +50,7 @@ def _make_spacy_pipeline_for_splitting(
             "Spacy is not installed, please install it with `pip install spacy`."
         )
     if pipeline == "sentencizer":
-        from spacy.lang.en import English
-
-        sentencizer: Any = English()
+        sentencizer: Any = spacy.lang.en.English()
         sentencizer.add_pipe("sentencizer")
     else:
         sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
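And a sketch for the spacy path; `pipeline="sentencizer"` takes the `spacy.lang.en.English()` branch above rather than loading a full pretrained pipeline:

from langchain_text_splitters import SpacyTextSplitter

# The lightweight sentencizer avoids downloading a model such as en_core_web_sm.
splitter = SpacyTextSplitter(pipeline="sentencizer")
chunks = splitter.split_text("First sentence. Second sentence.")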