langchain/libs/text-splitters/tests/integration_tests/test_nlp_text_splitters.py

"""Test text splitting functionality using NLTK and Spacy based sentence splitters."""

import re

import nltk
import pytest
from langchain_core.documents import Document

from langchain_text_splitters.nltk import NLTKTextSplitter
from langchain_text_splitters.spacy import SpacyTextSplitter


def setup_module() -> None:
    nltk.download("punkt_tab")


@pytest.fixture
def spacy() -> None:
    spacy = pytest.importorskip("spacy")

    # Check if en_core_web_sm model is available
    try:
        spacy.load("en_core_web_sm")
    except OSError:
        pytest.skip(
            "en_core_web_sm model not installed. Install with: "
            "uv add --group test_integration "
            "https://github.com/explosion/spacy-models/releases/download/"
            "en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
        )


def test_nltk_text_splitting_args() -> None:
    """Test invalid arguments."""
    with pytest.raises(
        ValueError,
        match=re.escape(
            "Got a larger chunk overlap (4) than chunk size (2), should be smaller."
        ),
    ):
        NLTKTextSplitter(chunk_size=2, chunk_overlap=4)


@pytest.mark.usefixtures("spacy")
def test_spacy_text_splitting_args() -> None:
    """Test invalid arguments."""
    with pytest.raises(
        ValueError,
        match=re.escape(
            "Got a larger chunk overlap (4) than chunk size (2), should be smaller."
        ),
    ):
        SpacyTextSplitter(chunk_size=2, chunk_overlap=4)


def test_nltk_text_splitter() -> None:
    """Test splitting by sentence using NLTK."""
    text = "This is sentence one. And this is sentence two."
    separator = "|||"
    splitter = NLTKTextSplitter(separator=separator)
    output = splitter.split_text(text)
    expected_output = [f"This is sentence one.{separator}And this is sentence two."]
    assert output == expected_output


@pytest.mark.usefixtures("spacy")
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter(pipeline: str) -> None:
    """Test splitting by sentence using Spacy."""
    text = "This is sentence one. And this is sentence two."
    separator = "|||"
    splitter = SpacyTextSplitter(separator=separator, pipeline=pipeline)
    output = splitter.split_text(text)
    expected_output = [f"This is sentence one.{separator}And this is sentence two."]
    assert output == expected_output


@pytest.mark.usefixtures("spacy")
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter_strip_whitespace(pipeline: str) -> None:
    """Test splitting by sentence using Spacy."""
    text = "This is sentence one. And this is sentence two."
    separator = "|||"
    splitter = SpacyTextSplitter(
        separator=separator, pipeline=pipeline, strip_whitespace=False
    )
    output = splitter.split_text(text)
    expected_output = [f"This is sentence one. {separator}And this is sentence two."]
    assert output == expected_output


def test_nltk_text_splitter_args() -> None:
    """Test invalid arguments for NLTKTextSplitter."""
    with pytest.raises(
        ValueError, match="When use_span_tokenize is True, separator should be ''"
    ):
        NLTKTextSplitter(
            chunk_size=80,
            chunk_overlap=0,
            separator="\n\n",
            use_span_tokenize=True,
        )


def test_nltk_text_splitter_with_add_start_index() -> None:
    splitter = NLTKTextSplitter(
        chunk_size=80,
        chunk_overlap=0,
        separator="",
        use_span_tokenize=True,
        add_start_index=True,
    )
    txt = (
        "Innovation drives our success.        "
        "Collaboration fosters creative solutions. "
        "Efficiency enhances data management."
    )
    docs = [Document(txt)]
    chunks = splitter.split_documents(docs)
    assert len(chunks) == 2
    for chunk in chunks:
        s_i = chunk.metadata["start_index"]
        assert chunk.page_content == txt[s_i : s_i + len(chunk.page_content)]