mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-02 13:55:42 +00:00
Previously, regardless of whether or not strip_whitespace was set to true or false, the strip text method in the SpacyTextSplitter class used `sent.text` to get the sentence. I modified this to include a ternary such that if strip_whitespace is false, it uses `sent.text_with_ws` I also modified the project.toml to include the spacy pipeline package and to lock the numpy version, as higher versions break spacy. - **Issue:** N/a - **Dependencies:** None
53 lines
2.0 KiB
Python
53 lines
2.0 KiB
Python
"""Test text splitting functionality using NLTK and Spacy based sentence splitters."""
|
|
|
|
import pytest
|
|
|
|
from langchain_text_splitters.nltk import NLTKTextSplitter
|
|
from langchain_text_splitters.spacy import SpacyTextSplitter
|
|
|
|
|
|
def test_nltk_text_splitting_args() -> None:
|
|
"""Test invalid arguments."""
|
|
with pytest.raises(ValueError):
|
|
NLTKTextSplitter(chunk_size=2, chunk_overlap=4)
|
|
|
|
|
|
def test_spacy_text_splitting_args() -> None:
|
|
"""Test invalid arguments."""
|
|
with pytest.raises(ValueError):
|
|
SpacyTextSplitter(chunk_size=2, chunk_overlap=4)
|
|
|
|
|
|
def test_nltk_text_splitter() -> None:
|
|
"""Test splitting by sentence using NLTK."""
|
|
text = "This is sentence one. And this is sentence two."
|
|
separator = "|||"
|
|
splitter = NLTKTextSplitter(separator=separator)
|
|
output = splitter.split_text(text)
|
|
expected_output = [f"This is sentence one.{separator}And this is sentence two."]
|
|
assert output == expected_output
|
|
|
|
|
|
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
|
|
def test_spacy_text_splitter(pipeline: str) -> None:
|
|
"""Test splitting by sentence using Spacy."""
|
|
text = "This is sentence one. And this is sentence two."
|
|
separator = "|||"
|
|
splitter = SpacyTextSplitter(separator=separator, pipeline=pipeline)
|
|
output = splitter.split_text(text)
|
|
expected_output = [f"This is sentence one.{separator}And this is sentence two."]
|
|
assert output == expected_output
|
|
|
|
|
|
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
|
|
def test_spacy_text_splitter_strip_whitespace(pipeline: str) -> None:
|
|
"""Test splitting by sentence using Spacy."""
|
|
text = "This is sentence one. And this is sentence two."
|
|
separator = "|||"
|
|
splitter = SpacyTextSplitter(
|
|
separator=separator, pipeline=pipeline, strip_whitespace=False
|
|
)
|
|
output = splitter.split_text(text)
|
|
expected_output = [f"This is sentence one. {separator}And this is sentence two."]
|
|
assert output == expected_output
|