text-splitters[patch]: fix some import-untyped errors (#31030)

commit eab8484a80 (parent 672339f3c6)
Author: Christophe Bornet
Date: 2025-05-15 17:34:22 +02:00
Committed by: GitHub
7 changed files with 172 additions and 36 deletions
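Background for review (not part of the diff): mypy reports `import-untyped` when code imports from an installed package that ships neither stubs nor a `py.typed` marker. The hunks below apply two fixes: import the one concrete submodule mypy can analyze directly, or import only the top-level package and reach names by attribute access, which concentrates the untyped import in a single statement so one per-module mypy override (or one `# type: ignore[import-untyped]`) covers it. A minimal sketch, with `somepkg` a hypothetical untyped package whose `__init__` imports its `sub` submodule:

# Before: every such site is flagged:
#   error: Skipping analyzing "somepkg.sub": module is installed, but missing
#   library stubs or py.typed marker  [import-untyped]
from somepkg.sub import Thing

thing = Thing()

# After: one top-level import; the attribute chain resolves at runtime
# because somepkg/__init__.py imports `sub`, and mypy treats it as Any.
import somepkg

thing = somepkg.sub.Thing()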


@@ -150,7 +150,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
     def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
         """Text splitter that uses HuggingFace tokenizer to count length."""
         try:
-            from transformers import PreTrainedTokenizerBase
+            from transformers.tokenization_utils_base import PreTrainedTokenizerBase

             if not isinstance(tokenizer, PreTrainedTokenizerBase):
                 raise ValueError(
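For context, the guarded class check this hunk touches can be sketched standalone; `make_hf_length_function` is a hypothetical name, and the deep import mirrors the hunk (the submodule is where the class is defined, so mypy can resolve it without going through transformers' lazy top-level `__init__`):

from typing import Any, Callable

def make_hf_length_function(tokenizer: Any) -> Callable[[str], int]:
    # Deep import, as in the hunk: target the defining submodule rather
    # than the package root.
    from transformers.tokenization_utils_base import PreTrainedTokenizerBase

    if not isinstance(tokenizer, PreTrainedTokenizerBase):
        raise ValueError("Tokenizer must be a PreTrainedTokenizerBase instance")

    # Measure text length in tokens, not characters.
    return lambda text: len(tokenizer.encode(text))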


@@ -636,10 +636,9 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
         if self._stopword_removal:
             try:
                 import nltk
-                from nltk.corpus import stopwords  # type: ignore[import-untyped]

                 nltk.download("stopwords")
-                self._stopwords = set(stopwords.words(self._stopword_lang))
+                self._stopwords = set(nltk.corpus.stopwords.words(self._stopword_lang))
             except ImportError:
                 raise ImportError(
                     "Could not import nltk. Please install it with 'pip install nltk'."


@@ -20,7 +20,7 @@ class KonlpyTextSplitter(TextSplitter):
         super().__init__(**kwargs)
         self._separator = separator
         try:
-            from konlpy.tag import Kkma
+            import konlpy
         except ImportError:
             raise ImportError(
                 """
@@ -28,7 +28,7 @@ class KonlpyTextSplitter(TextSplitter):
                 `pip install konlpy`
                 """
             )
-        self.kkma = Kkma()
+        self.kkma = konlpy.tag.Kkma()

     def split_text(self, text: str) -> List[str]:
         """Split incoming text and return chunks."""


@@ -24,14 +24,12 @@ class NLTKTextSplitter(TextSplitter):
         if self._use_span_tokenize and self._separator != "":
             raise ValueError("When use_span_tokenize is True, separator should be ''")
         try:
+            import nltk
+
             if self._use_span_tokenize:
-                from nltk.tokenize import _get_punkt_tokenizer
-
-                self._tokenizer = _get_punkt_tokenizer(self._language)
+                self._tokenizer = nltk.tokenize._get_punkt_tokenizer(self._language)
             else:
-                from nltk.tokenize import sent_tokenize
-
-                self._tokenizer = sent_tokenize
+                self._tokenizer = nltk.tokenize.sent_tokenize
         except ImportError:
             raise ImportError(
                 "NLTK is not installed, please install it with `pip install nltk`."


@@ -50,9 +50,7 @@ def _make_spacy_pipeline_for_splitting(
             "Spacy is not installed, please install it with `pip install spacy`."
         )
     if pipeline == "sentencizer":
-        from spacy.lang.en import English
-
-        sentencizer: Any = English()
+        sentencizer: Any = spacy.lang.en.English()
         sentencizer.add_pipe("sentencizer")
     else:
         sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
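A sketch of what the sentencizer branch builds. It uses `spacy.blank("en")`, spaCy's documented shorthand for the same blank English pipeline the hunk constructs via `spacy.lang.en.English()`; only the construction spelling differs:

import spacy

# Blank English pipeline plus the rule-based sentencizer component: cheap
# sentence splitting with no trained model download.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

doc = nlp("First sentence. Second sentence.")
print([s.text for s in doc.sents])
# -> ['First sentence.', 'Second sentence.']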