Mirror of https://github.com/hwchase17/langchain.git, synced 2025-10-23 02:15:42 +00:00.

Fixes #32747: the SpaCy integration-test fixture was trying to use pip to download the SpaCy language model (`en_core_web_sm`), but uv-managed environments don't include pip by default. The fixture now fails fast (skipping the test) when the model is not installed, as opposed to downloading it.

114 lines | 3.6 KiB | Python
"""Test text splitting functionality using NLTK and Spacy based sentence splitters."""
|
|
|
|
from typing import Any
|
|
|
|
import nltk
|
|
import pytest
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_text_splitters.nltk import NLTKTextSplitter
|
|
from langchain_text_splitters.spacy import SpacyTextSplitter
|
|
|
|
|
|
def setup_module() -> None:
    """Module-level pytest hook: fetch the NLTK ``punkt_tab`` tokenizer data.

    ``NLTKTextSplitter`` relies on NLTK's sentence tokenizer, which needs
    this data package downloaded before any test in this module runs.
    """
    nltk.download("punkt_tab")
|
|
|
|
|
|
@pytest.fixture
def spacy() -> Any:
    """Provide the ``spacy`` module to tests that need it.

    Skips the requesting test when spacy itself is not importable, or when
    the ``en_core_web_sm`` pipeline model has not been installed. The model
    is deliberately not downloaded here: uv-managed environments have no pip,
    so the skip message tells the developer how to install it instead.
    """
    try:
        import spacy as spacy_module
    except ImportError:
        pytest.skip("Spacy not installed.")

    # Loading the pipeline raises OSError when the model package is absent.
    try:
        spacy_module.load("en_core_web_sm")
    except OSError:
        install_hint = (
            "en_core_web_sm model not installed. Install with: "
            "uv add --group test_integration "
            "https://github.com/explosion/spacy-models/releases/download/"
            "en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
        )
        pytest.skip(install_hint)

    return spacy_module
|
|
|
|
|
|
def test_nltk_text_splitting_args() -> None:
    """Test invalid arguments."""
    # An overlap larger than the chunk size is contradictory and must be
    # rejected at construction time.
    bad_kwargs = {"chunk_size": 2, "chunk_overlap": 4}
    with pytest.raises(ValueError):
        NLTKTextSplitter(**bad_kwargs)
|
|
|
|
|
|
def test_spacy_text_splitting_args(spacy: Any) -> None:
    """Test invalid arguments."""
    # An overlap larger than the chunk size is contradictory and must be
    # rejected at construction time.
    bad_kwargs = {"chunk_size": 2, "chunk_overlap": 4}
    with pytest.raises(ValueError):
        SpacyTextSplitter(**bad_kwargs)
|
|
|
|
|
|
def test_nltk_text_splitter() -> None:
    """Test splitting by sentence using NLTK."""
    sep = "|||"
    splitter = NLTKTextSplitter(separator=sep)
    # Two sentences should be rejoined with the custom separator between them.
    chunks = splitter.split_text("This is sentence one. And this is sentence two.")
    assert chunks == [f"This is sentence one.{sep}And this is sentence two."]
|
|
|
|
|
|
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter(pipeline: str, spacy: Any) -> None:
    """Test splitting by sentence using Spacy."""
    sep = "|||"
    splitter = SpacyTextSplitter(separator=sep, pipeline=pipeline)
    # Both the lightweight sentencizer and the full model should yield the
    # same sentence boundaries, rejoined with the custom separator.
    chunks = splitter.split_text("This is sentence one. And this is sentence two.")
    assert chunks == [f"This is sentence one.{sep}And this is sentence two."]
|
|
|
|
|
|
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter_strip_whitespace(pipeline: str, spacy: Any) -> None:
    """Test splitting by sentence using Spacy."""
    sep = "|||"
    splitter = SpacyTextSplitter(
        separator=sep, pipeline=pipeline, strip_whitespace=False
    )
    chunks = splitter.split_text("This is sentence one. And this is sentence two.")
    # With strip_whitespace disabled, the space after the first sentence is
    # preserved ahead of the separator.
    assert chunks == [f"This is sentence one. {sep}And this is sentence two."]
|
|
|
|
|
|
def test_nltk_text_splitter_args() -> None:
    """Test invalid arguments for NLTKTextSplitter."""
    # use_span_tokenize is incompatible with a non-empty separator, so this
    # combination must raise at construction time.
    bad_kwargs = {
        "chunk_size": 80,
        "chunk_overlap": 0,
        "separator": "\n\n",
        "use_span_tokenize": True,
    }
    with pytest.raises(ValueError):
        NLTKTextSplitter(**bad_kwargs)
|
|
|
|
|
|
def test_nltk_text_splitter_with_add_start_index() -> None:
    """Each chunk's start_index metadata must locate it in the source text."""
    text = (
        "Innovation drives our success. "
        "Collaboration fosters creative solutions. "
        "Efficiency enhances data management."
    )
    splitter = NLTKTextSplitter(
        chunk_size=80,
        chunk_overlap=0,
        separator="",
        use_span_tokenize=True,
        add_start_index=True,
    )
    chunks = splitter.split_documents([Document(text)])
    assert len(chunks) == 2
    # Slicing the source at start_index must reproduce the chunk verbatim.
    for chunk in chunks:
        start = chunk.metadata["start_index"]
        assert text[start : start + len(chunk.page_content)] == chunk.page_content
|