Implements NLTK and Spacy-based TextSplitters (#103)

This PR addresses Issue #88.

- [x] `make format`
- [x] `make lint`
- [x] `make tests`
This commit is contained in:
Delip Rao
2022-11-09 23:45:30 -05:00
committed by GitHub
parent 28282ad099
commit 3ee6e332dd
4 changed files with 118 additions and 16 deletions

View File

@@ -0,0 +1,38 @@
"""
Test text splitting functionality using NLTK and Spacy based sentence splitters.
"""
import pytest
from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter
def test_nltk_text_splitting_args() -> None:
    """Expect a ValueError when chunk_overlap (4) exceeds chunk_size (2)."""
    invalid_kwargs = {"chunk_size": 2, "chunk_overlap": 4}
    with pytest.raises(ValueError):
        NLTKTextSplitter(**invalid_kwargs)
def test_spacy_text_splitting_args() -> None:
    """Expect a ValueError when chunk_overlap (4) exceeds chunk_size (2)."""
    invalid_kwargs = {"chunk_size": 2, "chunk_overlap": 4}
    with pytest.raises(ValueError):
        SpacyTextSplitter(**invalid_kwargs)
def test_nltk_text_splitter() -> None:
    """Check the NLTK splitter's output for a two-sentence input.

    With a custom separator and default chunking arguments, the test expects
    a single chunk in which the two sentences are joined by that separator.
    """
    sep = "|||"
    text = "This is sentence one. And this is sentence two."
    # Expected result: one chunk, sentences glued together with the separator.
    expected_chunks = [sep.join(["This is sentence one.", "And this is sentence two."])]
    chunks = NLTKTextSplitter(separator=sep).split_text(text)
    assert chunks == expected_chunks
def test_spacy_text_splitter() -> None:
    """Check the Spacy splitter's output for a two-sentence input.

    With a custom separator and default chunking arguments, the test expects
    a single chunk in which the two sentences are joined by that separator.
    """
    sep = "|||"
    text = "This is sentence one. And this is sentence two."
    # Expected result: one chunk, sentences glued together with the separator.
    expected_chunks = [sep.join(["This is sentence one.", "And this is sentence two."])]
    chunks = SpacyTextSplitter(separator=sep).split_text(text)
    assert chunks == expected_chunks