Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-22 23:00:00 +00:00

Implements NLTK and Spacy-based TextSplitters (#103)

This PR is for Issue #88.

- [x] `make format`
- [x] `make lint`
- [x] `make tests`

This commit is contained in:
parent 28282ad099
commit 3ee6e332dd
README.md
@@ -53,6 +53,8 @@ The following use cases require specific installs and api keys:

 - _FAISS_:
     - Install requirements with `pip install faiss` for Python 3.7 and `pip install faiss-cpu` for Python 3.10+.
 
+If you are using the `NLTKTextSplitter` or the `SpacyTextSplitter`, you will also need to install the appropriate models. For example, if you want to use the `SpacyTextSplitter`, you will need to install the `en_core_web_sm` model with `python -m spacy download en_core_web_sm`. Similarly, if you want to use the `NLTKTextSplitter`, you will need to install the `punkt` model with `python -m nltk.downloader punkt`.
+
 ## 🚀 What can I do with this
 
 This project was largely inspired by a few projects seen on Twitter for which we thought it would make sense to have more explicit tooling. A lot of the initial functionality was done in an attempt to recreate those. Those are:
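For reference, a minimal usage sketch of the two new splitters, assuming the models above are installed (the sample text mirrors the integration tests added in this PR):

```python
from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter

text = "This is sentence one. And this is sentence two."

# Sentence-level splitting with NLTK (requires the punkt model).
nltk_splitter = NLTKTextSplitter()
print(nltk_splitter.split_text(text))

# Sentence-level splitting with Spacy (requires the en_core_web_sm pipeline).
spacy_splitter = SpacyTextSplitter(pipeline="en_core_web_sm")
print(spacy_splitter.split_text(text))
```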
langchain/text_splitter.py
@@ -1,23 +1,13 @@
 """Functionality for splitting text."""
 from abc import abstractmethod
-from typing import List
+from typing import Iterable, List
 
 
 class TextSplitter:
     """Interface for splitting text into chunks."""
 
-    @abstractmethod
-    def split_text(self, text: str) -> List[str]:
-        """Split text into multiple components."""
-
-
-class CharacterTextSplitter(TextSplitter):
-    """Implementation of splitting text that looks at characters."""
-
-    def __init__(
-        self, separator: str = "\n\n", chunk_size: int = 4000, chunk_overlap: int = 200
-    ):
-        """Initialize with parameters."""
+    def __init__(self, separator: str, chunk_size: int, chunk_overlap: int):
+        """Create a new TextSplitter."""
         if chunk_overlap > chunk_size:
             raise ValueError(
                 f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
@@ -27,10 +17,11 @@ class CharacterTextSplitter(TextSplitter):
         self._chunk_size = chunk_size
         self._chunk_overlap = chunk_overlap
 
+    @abstractmethod
     def split_text(self, text: str) -> List[str]:
-        """Split incoming text and return chunks."""
-        # First we naively split the large input into a bunch of smaller ones.
-        splits = text.split(self._separator)
+        """Split text into multiple components."""
+
+    def _merge_splits(self, splits: Iterable[str]) -> List[str]:
         # We now want to combine these smaller pieces into medium size
         # chunks to send to the LLM.
         docs = []
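The refactor turns `TextSplitter` into a small template: a subclass produces raw splits, and the shared `_merge_splits` packs them into chunks of roughly `chunk_size` characters, carrying over up to `chunk_overlap` characters between chunks. A hypothetical minimal subclass (not part of this PR) illustrating the contract:

```python
from typing import List

from langchain.text_splitter import TextSplitter


class LineTextSplitter(TextSplitter):
    """Hypothetical splitter that treats each line as one unit."""

    def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 200):
        # The base class stores the separator and validates the chunk sizes.
        super().__init__("\n", chunk_size, chunk_overlap)

    def split_text(self, text: str) -> List[str]:
        # Produce the raw pieces, then let the base class merge them.
        return self._merge_splits(text.split("\n"))
```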
@@ -46,3 +37,72 @@ class CharacterTextSplitter(TextSplitter):
             total += len(d)
         docs.append(self._separator.join(current_doc))
         return docs
+
+
+class CharacterTextSplitter(TextSplitter):
+    """Implementation of splitting text that looks at characters."""
+
+    def __init__(
+        self, separator: str = "\n\n", chunk_size: int = 4000, chunk_overlap: int = 200
+    ):
+        """Create a new CharacterTextSplitter."""
+        super(CharacterTextSplitter, self).__init__(
+            separator, chunk_size, chunk_overlap
+        )
+        self._separator = separator
+
+    def split_text(self, text: str) -> List[str]:
+        """Split incoming text and return chunks."""
+        # First we naively split the large input into a bunch of smaller ones.
+        splits = text.split(self._separator)
+        return self._merge_splits(splits)
+
+
+class NLTKTextSplitter(TextSplitter):
+    """Implementation of splitting text that looks at sentences using NLTK."""
+
+    def __init__(
+        self, separator: str = "\n\n", chunk_size: int = 4000, chunk_overlap: int = 200
+    ):
+        """Initialize the NLTK splitter."""
+        super(NLTKTextSplitter, self).__init__(separator, chunk_size, chunk_overlap)
+        try:
+            from nltk.tokenize import sent_tokenize
+
+            self._tokenizer = sent_tokenize
+        except ImportError:
+            raise ImportError(
+                "NLTK is not installed, please install it with `pip install nltk`."
+            )
+
+    def split_text(self, text: str) -> List[str]:
+        """Split incoming text and return chunks."""
+        # First we naively split the large input into a bunch of smaller ones.
+        splits = self._tokenizer(text)
+        return self._merge_splits(splits)
+
+
+class SpacyTextSplitter(TextSplitter):
+    """Implementation of splitting text that looks at sentences using Spacy."""
+
+    def __init__(
+        self,
+        separator: str = "\n\n",
+        pipeline: str = "en_core_web_sm",
+        chunk_size: int = 4000,
+        chunk_overlap: int = 200,
+    ):
+        """Initialize the spacy text splitter."""
+        super(SpacyTextSplitter, self).__init__(separator, chunk_size, chunk_overlap)
+        try:
+            import spacy
+        except ImportError:
+            raise ImportError(
+                "Spacy is not installed, please install it with `pip install spacy`."
+            )
+        self._tokenizer = spacy.load(pipeline)
+
+    def split_text(self, text: str) -> List[str]:
+        """Split incoming text and return chunks."""
+        splits = (str(s) for s in self._tokenizer(text).sents)
+        return self._merge_splits(splits)
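A quick sketch of how the shared chunking parameters behave (toy sizes chosen for illustration; the defaults are 4000 and 200):

```python
from langchain.text_splitter import CharacterTextSplitter

# An overlap larger than the chunk size is rejected by the base class.
try:
    CharacterTextSplitter(chunk_size=2, chunk_overlap=4)
except ValueError as err:
    print(err)

# Splits are merged until a chunk reaches roughly chunk_size characters,
# with up to chunk_overlap characters carried into the next chunk.
splitter = CharacterTextSplitter(separator=" ", chunk_size=10, chunk_overlap=3)
print(splitter.split_text("one two three four five six"))
```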
@@ -10,5 +10,7 @@ wikipedia
 huggingface_hub
 faiss-cpu
 sentence_transformers
+spacy
+nltk
 # For development
 jupyter
tests/integration_tests/test_nlp_text_splitters.py (new file, +38 lines)
@@ -0,0 +1,38 @@
+"""
+Test text splitting functionality using NLTK and Spacy based sentence splitters.
+"""
+import pytest
+
+from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter
+
+
+def test_nltk_text_splitting_args() -> None:
+    """Test invalid arguments."""
+    with pytest.raises(ValueError):
+        NLTKTextSplitter(chunk_size=2, chunk_overlap=4)
+
+
+def test_spacy_text_splitting_args() -> None:
+    """Test invalid arguments."""
+    with pytest.raises(ValueError):
+        SpacyTextSplitter(chunk_size=2, chunk_overlap=4)
+
+
+def test_nltk_text_splitter() -> None:
+    """Test splitting by sentence using NLTK."""
+    text = "This is sentence one. And this is sentence two."
+    separator = "|||"
+    splitter = NLTKTextSplitter(separator=separator)
+    output = splitter.split_text(text)
+    expected_output = [f"This is sentence one.{separator}And this is sentence two."]
+    assert output == expected_output
+
+
+def test_spacy_text_splitter() -> None:
+    """Test splitting by sentence using Spacy."""
+    text = "This is sentence one. And this is sentence two."
+    separator = "|||"
+    splitter = SpacyTextSplitter(separator=separator)
+    output = splitter.split_text(text)
+    expected_output = [f"This is sentence one.{separator}And this is sentence two."]
+    assert output == expected_output
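Assuming the repository's standard pytest setup, the new integration tests can be run directly with `pytest tests/integration_tests/test_nlp_text_splitters.py` (the `punkt` and `en_core_web_sm` models must be installed first).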