text-splitters[minor], langchain[minor], community[patch], templates, docs: langchain-text-splitters 0.0.1 (#18346)

Bagatur
2024-02-29 18:33:21 -08:00
committed by GitHub
parent 7891934173
commit 5efb5c099f
226 changed files with 6626 additions and 1980 deletions

View File

@@ -0,0 +1,7 @@
import pytest


@pytest.mark.compile
def test_placeholder() -> None:
    """Used for compiling integration tests without running any real tests."""
    pass
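
The placeholder gives the `compile` marker something to collect, so CI can verify that the package and its integration-test imports resolve without running any real tests. A minimal sketch of how such a marker might be registered, assuming a standard pytest setup (this `conftest.py` hook is illustrative, not the repo's actual configuration):

# conftest.py (illustrative): register the "compile" marker so pytest
# does not emit an unknown-marker warning when collecting these tests.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "compile: compile-only tests that just verify imports"
    )

Running only these tests is then a matter of `pytest -m compile`.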

View File

@@ -0,0 +1,38 @@
"""Test text splitting functionality using NLTK and Spacy based sentence splitters."""
import pytest
from langchain_text_splitters.nltk import NLTKTextSplitter
from langchain_text_splitters.spacy import SpacyTextSplitter
def test_nltk_text_splitting_args() -> None:
"""Test invalid arguments."""
with pytest.raises(ValueError):
NLTKTextSplitter(chunk_size=2, chunk_overlap=4)
def test_spacy_text_splitting_args() -> None:
"""Test invalid arguments."""
with pytest.raises(ValueError):
SpacyTextSplitter(chunk_size=2, chunk_overlap=4)
def test_nltk_text_splitter() -> None:
"""Test splitting by sentence using NLTK."""
text = "This is sentence one. And this is sentence two."
separator = "|||"
splitter = NLTKTextSplitter(separator=separator)
output = splitter.split_text(text)
expected_output = [f"This is sentence one.{separator}And this is sentence two."]
assert output == expected_output
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter(pipeline: str) -> None:
"""Test splitting by sentence using Spacy."""
text = "This is sentence one. And this is sentence two."
separator = "|||"
splitter = SpacyTextSplitter(separator=separator, pipeline=pipeline)
output = splitter.split_text(text)
expected_output = [f"This is sentence one.{separator}And this is sentence two."]
assert output == expected_output
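
Both splitters reject a chunk_overlap larger than chunk_size at construction time, and both join whole sentences with the configured separator. A minimal usage sketch, assuming nltk is installed and its "punkt" sentence tokenizer data has been downloaded (the parameter values are illustrative):

from langchain_text_splitters.nltk import NLTKTextSplitter

# One-time setup for NLTK's sentence tokenizer:
#   import nltk; nltk.download("punkt")
splitter = NLTKTextSplitter(chunk_size=100, chunk_overlap=0)
chunks = splitter.split_text("This is sentence one. And this is sentence two.")
# Sentences that fit within chunk_size are merged into a single chunk,
# joined by the separator (default "\n\n").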

View File

@@ -0,0 +1,109 @@
"""Test text splitters that require an integration."""
import pytest
from langchain_text_splitters import (
TokenTextSplitter,
)
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_text_splitters.sentence_transformers import (
SentenceTransformersTokenTextSplitter,
)
def test_huggingface_type_check() -> None:
"""Test that type checks are done properly on input."""
with pytest.raises(ValueError):
CharacterTextSplitter.from_huggingface_tokenizer("foo")
def test_huggingface_tokenizer() -> None:
"""Test text splitter that uses a HuggingFace tokenizer."""
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
tokenizer, separator=" ", chunk_size=1, chunk_overlap=0
)
output = text_splitter.split_text("foo bar")
assert output == ["foo", "bar"]
def test_token_text_splitter() -> None:
"""Test no overlap."""
splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=0)
output = splitter.split_text("abcdef" * 5) # 10 token string
expected_output = ["abcdefabcdefabc", "defabcdefabcdef"]
assert output == expected_output
def test_token_text_splitter_overlap() -> None:
"""Test with overlap."""
splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=1)
output = splitter.split_text("abcdef" * 5) # 10 token string
expected_output = ["abcdefabcdefabc", "abcdefabcdefabc", "abcdef"]
assert output == expected_output
def test_token_text_splitter_from_tiktoken() -> None:
splitter = TokenTextSplitter.from_tiktoken_encoder(model_name="gpt-3.5-turbo")
expected_tokenizer = "cl100k_base"
actual_tokenizer = splitter._tokenizer.name
assert expected_tokenizer == actual_tokenizer
def test_sentence_transformers_count_tokens() -> None:
splitter = SentenceTransformersTokenTextSplitter(
model_name="sentence-transformers/paraphrase-albert-small-v2"
)
text = "Lorem ipsum"
token_count = splitter.count_tokens(text=text)
expected_start_stop_token_count = 2
expected_text_token_count = 5
expected_token_count = expected_start_stop_token_count + expected_text_token_count
assert expected_token_count == token_count
def test_sentence_transformers_split_text() -> None:
splitter = SentenceTransformersTokenTextSplitter(
model_name="sentence-transformers/paraphrase-albert-small-v2"
)
text = "lorem ipsum"
text_chunks = splitter.split_text(text=text)
expected_text_chunks = [text]
assert expected_text_chunks == text_chunks
def test_sentence_transformers_multiple_tokens() -> None:
splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)
text = "Lorem "
text_token_count_including_start_and_stop_tokens = splitter.count_tokens(text=text)
count_start_and_end_tokens = 2
token_multiplier = (
count_start_and_end_tokens
+ (splitter.maximum_tokens_per_chunk - count_start_and_end_tokens)
// (
text_token_count_including_start_and_stop_tokens
- count_start_and_end_tokens
)
+ 1
)
# `text_to_split` does not fit in a single chunk
text_to_embed = text * token_multiplier
text_chunks = splitter.split_text(text=text_to_embed)
expected_number_of_chunks = 2
assert expected_number_of_chunks == len(text_chunks)
actual = splitter.count_tokens(text=text_chunks[1]) - count_start_and_end_tokens
expected = (
token_multiplier * (text_token_count_including_start_and_stop_tokens - 2)
- splitter.maximum_tokens_per_chunk
)
assert expected == actual
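
The token_multiplier arithmetic in the last test picks a repetition count of `text` that is guaranteed to overflow a single chunk, forcing the split into exactly two chunks with the second chunk holding the overflow. A worked sketch of the same arithmetic with assumed values (the concrete numbers are illustrative; the real ones depend on the model's tokenizer and maximum sequence length):

maximum_tokens_per_chunk = 512  # model-dependent cap, assumed here
tokens_with_specials = 4        # assumed count_tokens("Lorem "), incl. start/stop
specials = 2                    # one start token and one stop token

per_repetition = tokens_with_specials - specials  # text tokens added per "Lorem "
capacity = maximum_tokens_per_chunk - specials    # room left for text in one chunk

# Smallest repetition count that overflows a single chunk:
token_multiplier = specials + capacity // per_repetition + 1

# Overflow carried into the second chunk, mirroring the test's `expected`:
overflow = token_multiplier * per_repetition - maximum_tokens_per_chunk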

View File

@@ -0,0 +1,87 @@
"""Configuration for unit tests."""
from importlib import util
from typing import Dict, Sequence
import pytest
from pytest import Config, Function, Parser
def pytest_addoption(parser: Parser) -> None:
"""Add custom command line options to pytest."""
parser.addoption(
"--only-extended",
action="store_true",
help="Only run extended tests. Does not allow skipping any extended tests.",
)
parser.addoption(
"--only-core",
action="store_true",
help="Only run core tests. Never runs any extended tests.",
)
def pytest_collection_modifyitems(config: Config, items: Sequence[Function]) -> None:
"""Add implementations for handling custom markers.
At the moment, this adds support for a custom `requires` marker.
The `requires` marker is used to denote tests that require one or more packages
to be installed to run. If the package is not installed, the test is skipped.
The `requires` marker syntax is:
.. code-block:: python
@pytest.mark.requires("package1", "package2")
def test_something():
...
"""
# Mapping from the name of a package to whether it is installed or not.
# Used to avoid repeated calls to `util.find_spec`
required_pkgs_info: Dict[str, bool] = {}
only_extended = config.getoption("--only-extended") or False
only_core = config.getoption("--only-core") or False
if only_extended and only_core:
raise ValueError("Cannot specify both `--only-extended` and `--only-core`.")
for item in items:
requires_marker = item.get_closest_marker("requires")
if requires_marker is not None:
if only_core:
item.add_marker(pytest.mark.skip(reason="Skipping not a core test."))
continue
# Iterate through the list of required packages
required_pkgs = requires_marker.args
for pkg in required_pkgs:
# If we haven't yet checked whether the pkg is installed
# let's check it and store the result.
if pkg not in required_pkgs_info:
try:
installed = util.find_spec(pkg) is not None
except Exception:
installed = False
required_pkgs_info[pkg] = installed
if not required_pkgs_info[pkg]:
if only_extended:
pytest.fail(
f"Package `{pkg}` is not installed but is required for "
f"extended tests. Please install the given package and "
f"try again.",
)
else:
# If the package is not installed, we immediately break
# and mark the test as skipped.
item.add_marker(
pytest.mark.skip(reason=f"Requires pkg: `{pkg}`")
)
break
else:
if only_extended:
item.add_marker(
pytest.mark.skip(reason="Skipping not an extended test.")
)
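
With this hook in place, a test declares its optional dependencies through the marker, and the --only-core / --only-extended flags partition the suite between lightweight and dependency-heavy runs. A hypothetical example test (the package name is illustrative):

import pytest


@pytest.mark.requires("transformers")
def test_needs_transformers() -> None:
    # Skipped automatically when `transformers` is not importable;
    # under `pytest --only-extended` the missing package fails the run instead.
    from transformers import GPT2TokenizerFast

    assert GPT2TokenizerFast is not None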

File diff suppressed because it is too large