mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-13 16:36:06 +00:00
adding language as parameter to NLTK text splitter (#10229)
- Description: Adds a `language` parameter to NLTKTextSplitter, which previously used only English. This makes the NLTK splitter usable for other languages. The change is simple: `language` is added as a parameter to NLTKTextSplitter (defaulting to "english") and is then passed to NLTK's `sent_tokenize`. - Issue: N/A - Dependencies: N/A --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
b3a8fc7cb1
commit
ddd07001f3
@ -1081,7 +1081,9 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|||||||
class NLTKTextSplitter(TextSplitter):
|
class NLTKTextSplitter(TextSplitter):
|
||||||
"""Splitting text using NLTK package."""
|
"""Splitting text using NLTK package."""
|
||||||
|
|
||||||
def __init__(self, separator: str = "\n\n", **kwargs: Any) -> None:
|
def __init__(
|
||||||
|
self, separator: str = "\n\n", language: str = "english", **kwargs: Any
|
||||||
|
) -> None:
|
||||||
"""Initialize the NLTK splitter."""
|
"""Initialize the NLTK splitter."""
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
try:
|
try:
|
||||||
@ -1093,11 +1095,12 @@ class NLTKTextSplitter(TextSplitter):
|
|||||||
"NLTK is not installed, please install it with `pip install nltk`."
|
"NLTK is not installed, please install it with `pip install nltk`."
|
||||||
)
|
)
|
||||||
self._separator = separator
|
self._separator = separator
|
||||||
|
self._language = language
|
||||||
|
|
||||||
def split_text(self, text: str) -> List[str]:
|
def split_text(self, text: str) -> List[str]:
|
||||||
"""Split incoming text and return chunks."""
|
"""Split incoming text and return chunks."""
|
||||||
# First we naively split the large input into a bunch of smaller ones.
|
# First we naively split the large input into a bunch of smaller ones.
|
||||||
splits = self._tokenizer(text)
|
splits = self._tokenizer(text, language=self._language)
|
||||||
return self._merge_splits(splits, self._separator)
|
return self._merge_splits(splits, self._separator)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user