Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-22 23:00:00 +00:00

Implements NLTK and Spacy-based TextSplitters (#103)

This PR is for Issue #88.

- [x] `make format`
- [x] `make lint`
- [x] `make tests`

This commit is contained in:
parent 28282ad099
commit 3ee6e332dd
README.md
@@ -53,6 +53,8 @@ The following use cases require specific installs and api keys:

 - _FAISS_:
     - Install requirements with `pip install faiss` for Python 3.7 and `pip install faiss-cpu` for Python 3.10+.
 
+If you are using the `NLTKTextSplitter` or the `SpacyTextSplitter`, you will also need to install the appropriate models. For example, if you want to use the `SpacyTextSplitter`, you will need to install the `en_core_web_sm` model with `python -m spacy download en_core_web_sm`. Similarly, if you want to use the `NLTKTextSplitter`, you will need to install the `punkt` model with `python -m nltk.downloader punkt`.
+
 ## 🚀 What can I do with this
 
 This project was largely inspired by a few projects seen on Twitter for which we thought it would make sense to have more explicit tooling. A lot of the initial functionality was done in an attempt to recreate those. Those are:
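For reference, a minimal usage sketch of the two new splitters, assuming the models above are installed (the sample text mirrors the integration tests added in this PR):

```python
from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter

text = "This is sentence one. And this is sentence two."

# Sentence-level splitting with NLTK (requires the punkt model).
nltk_splitter = NLTKTextSplitter()
print(nltk_splitter.split_text(text))

# Sentence-level splitting with Spacy (requires the en_core_web_sm pipeline).
spacy_splitter = SpacyTextSplitter(pipeline="en_core_web_sm")
print(spacy_splitter.split_text(text))
```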
langchain/text_splitter.py
@@ -1,23 +1,13 @@
 """Functionality for splitting text."""
 from abc import abstractmethod
-from typing import List
+from typing import Iterable, List
 
 
 class TextSplitter:
     """Interface for splitting text into chunks."""
 
-    @abstractmethod
-    def split_text(self, text: str) -> List[str]:
-        """Split text into multiple components."""
-
-
-class CharacterTextSplitter(TextSplitter):
-    """Implementation of splitting text that looks at characters."""
-
-    def __init__(
-        self, separator: str = "\n\n", chunk_size: int = 4000, chunk_overlap: int = 200
-    ):
-        """Initialize with parameters."""
+    def __init__(self, separator: str, chunk_size: int, chunk_overlap: int):
+        """Create a new TextSplitter."""
         if chunk_overlap > chunk_size:
             raise ValueError(
                 f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
@@ -27,10 +17,11 @@ class CharacterTextSplitter(TextSplitter):
         self._chunk_size = chunk_size
         self._chunk_overlap = chunk_overlap
 
+    @abstractmethod
     def split_text(self, text: str) -> List[str]:
-        """Split incoming text and return chunks."""
-        # First we naively split the large input into a bunch of smaller ones.
-        splits = text.split(self._separator)
+        """Split text into multiple components."""
+
+    def _merge_splits(self, splits: Iterable[str]) -> List[str]:
         # We now want to combine these smaller pieces into medium size
         # chunks to send to the LLM.
         docs = []
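The refactor turns `TextSplitter` into a small template: a subclass produces raw splits, and the shared `_merge_splits` packs them into chunks of roughly `chunk_size` characters, carrying over up to `chunk_overlap` characters between chunks. A hypothetical minimal subclass (not part of this PR) illustrating the contract:

```python
from typing import List

from langchain.text_splitter import TextSplitter


class LineTextSplitter(TextSplitter):
    """Hypothetical splitter that treats each line as one unit."""

    def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 200):
        # The base class stores the separator and validates the chunk sizes.
        super().__init__("\n", chunk_size, chunk_overlap)

    def split_text(self, text: str) -> List[str]:
        # Produce the raw pieces, then let the base class merge them.
        return self._merge_splits(text.split("\n"))
```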
@@ -46,3 +37,72 @@ class CharacterTextSplitter(TextSplitter):
             total += len(d)
         docs.append(self._separator.join(current_doc))
         return docs
+
+
+class CharacterTextSplitter(TextSplitter):
+    """Implementation of splitting text that looks at characters."""
+
+    def __init__(
+        self, separator: str = "\n\n", chunk_size: int = 4000, chunk_overlap: int = 200
+    ):
+        """Create a new CharacterTextSplitter."""
+        super(CharacterTextSplitter, self).__init__(
+            separator, chunk_size, chunk_overlap
+        )
+        self._separator = separator
+
+    def split_text(self, text: str) -> List[str]:
+        """Split incoming text and return chunks."""
+        # First we naively split the large input into a bunch of smaller ones.
+        splits = text.split(self._separator)
+        return self._merge_splits(splits)
+
+
+class NLTKTextSplitter(TextSplitter):
+    """Implementation of splitting text that looks at sentences using NLTK."""
+
+    def __init__(
+        self, separator: str = "\n\n", chunk_size: int = 4000, chunk_overlap: int = 200
+    ):
+        """Initialize the NLTK splitter."""
+        super(NLTKTextSplitter, self).__init__(separator, chunk_size, chunk_overlap)
+        try:
+            from nltk.tokenize import sent_tokenize
+
+            self._tokenizer = sent_tokenize
+        except ImportError:
+            raise ImportError(
+                "NLTK is not installed, please install it with `pip install nltk`."
+            )
+
+    def split_text(self, text: str) -> List[str]:
+        """Split incoming text and return chunks."""
+        # First we naively split the large input into a bunch of smaller ones.
+        splits = self._tokenizer(text)
+        return self._merge_splits(splits)
+
+
+class SpacyTextSplitter(TextSplitter):
+    """Implementation of splitting text that looks at sentences using Spacy."""
+
+    def __init__(
+        self,
+        separator: str = "\n\n",
+        pipeline: str = "en_core_web_sm",
+        chunk_size: int = 4000,
+        chunk_overlap: int = 200,
+    ):
+        """Initialize the spacy text splitter."""
+        super(SpacyTextSplitter, self).__init__(separator, chunk_size, chunk_overlap)
+        try:
+            import spacy
+        except ImportError:
+            raise ImportError(
+                "Spacy is not installed, please install it with `pip install spacy`."
+            )
+        self._tokenizer = spacy.load(pipeline)
+
+    def split_text(self, text: str) -> List[str]:
+        """Split incoming text and return chunks."""
+        splits = (str(s) for s in self._tokenizer(text).sents)
+        return self._merge_splits(splits)
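A quick sketch of how the shared chunking parameters behave (toy sizes chosen for illustration; the defaults are 4000 and 200):

```python
from langchain.text_splitter import CharacterTextSplitter

# An overlap larger than the chunk size is rejected by the base class.
try:
    CharacterTextSplitter(chunk_size=2, chunk_overlap=4)
except ValueError as err:
    print(err)

# Splits are merged until a chunk reaches roughly chunk_size characters,
# with up to chunk_overlap characters carried into the next chunk.
splitter = CharacterTextSplitter(separator=" ", chunk_size=10, chunk_overlap=3)
print(splitter.split_text("one two three four five six"))
```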
@@ -10,5 +10,7 @@ wikipedia
 huggingface_hub
 faiss-cpu
 sentence_transformers
+spacy
+nltk
 # For development
 jupyter
tests/integration_tests/test_nlp_text_splitters.py (new file, +38 lines)
@@ -0,0 +1,38 @@
+"""
+Test text splitting functionality using NLTK and Spacy based sentence splitters.
+"""
+import pytest
+
+from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter
+
+
+def test_nltk_text_splitting_args() -> None:
+    """Test invalid arguments."""
+    with pytest.raises(ValueError):
+        NLTKTextSplitter(chunk_size=2, chunk_overlap=4)
+
+
+def test_spacy_text_splitting_args() -> None:
+    """Test invalid arguments."""
+    with pytest.raises(ValueError):
+        SpacyTextSplitter(chunk_size=2, chunk_overlap=4)
+
+
+def test_nltk_text_splitter() -> None:
+    """Test splitting by sentence using NLTK."""
+    text = "This is sentence one. And this is sentence two."
+    separator = "|||"
+    splitter = NLTKTextSplitter(separator=separator)
+    output = splitter.split_text(text)
+    expected_output = [f"This is sentence one.{separator}And this is sentence two."]
+    assert output == expected_output
+
+
+def test_spacy_text_splitter() -> None:
+    """Test splitting by sentence using Spacy."""
+    text = "This is sentence one. And this is sentence two."
+    separator = "|||"
+    splitter = SpacyTextSplitter(separator=separator)
+    output = splitter.split_text(text)
+    expected_output = [f"This is sentence one.{separator}And this is sentence two."]
+    assert output == expected_output
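Assuming the repository's standard pytest setup, the new integration tests can be run directly with `pytest tests/integration_tests/test_nlp_text_splitters.py` (the `punkt` and `en_core_web_sm` models must be installed first).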