Mirror of https://github.com/hwchase17/langchain.git (synced 2025-09-24 20:09:01 +00:00)
text-splitters[minor], langchain[minor], community[patch], templates, docs: langchain-text-splitters 0.0.1 (#18346)
libs/text-splitters/tests/__init__.py (new, empty file)
@@ -0,0 +1,7 @@
import pytest


@pytest.mark.compile
def test_placeholder() -> None:
    """Used for compiling integration tests without running any real tests."""
    pass
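The `compile` marker above is a custom marker, so it has to be registered with pytest before selections like `pytest -m compile` run cleanly; the package presumably does this in its pytest configuration. Purely as an illustrative sketch (not taken from this commit), an equivalent conftest-based registration would look like:

from pytest import Config


def pytest_configure(config: Config) -> None:
    # Hypothetical registration of the custom `compile` marker so that
    # `pytest -m compile` can select the placeholder test without
    # unknown-marker warnings.
    config.addinivalue_line(
        "markers", "compile: placeholder tests used only to check that imports compile"
    )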
@@ -0,0 +1,38 @@
"""Test text splitting functionality using NLTK and Spacy based sentence splitters."""
import pytest

from langchain_text_splitters.nltk import NLTKTextSplitter
from langchain_text_splitters.spacy import SpacyTextSplitter


def test_nltk_text_splitting_args() -> None:
    """Test invalid arguments."""
    with pytest.raises(ValueError):
        NLTKTextSplitter(chunk_size=2, chunk_overlap=4)


def test_spacy_text_splitting_args() -> None:
    """Test invalid arguments."""
    with pytest.raises(ValueError):
        SpacyTextSplitter(chunk_size=2, chunk_overlap=4)


def test_nltk_text_splitter() -> None:
    """Test splitting by sentence using NLTK."""
    text = "This is sentence one. And this is sentence two."
    separator = "|||"
    splitter = NLTKTextSplitter(separator=separator)
    output = splitter.split_text(text)
    expected_output = [f"This is sentence one.{separator}And this is sentence two."]
    assert output == expected_output


@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter(pipeline: str) -> None:
    """Test splitting by sentence using Spacy."""
    text = "This is sentence one. And this is sentence two."
    separator = "|||"
    splitter = SpacyTextSplitter(separator=separator, pipeline=pipeline)
    output = splitter.split_text(text)
    expected_output = [f"This is sentence one.{separator}And this is sentence two."]
    assert output == expected_output
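Outside the test suite, the two sentence splitters exercised above are constructed the same way; a minimal usage sketch, assuming the NLTK `punkt` data is available and (for the model-backed spaCy pipeline) `en_core_web_sm` is installed, with illustrative chunk sizes:

from langchain_text_splitters.nltk import NLTKTextSplitter
from langchain_text_splitters.spacy import SpacyTextSplitter

text = "This is sentence one. And this is sentence two."

# NLTK-based sentence splitting; chunk_size/chunk_overlap are validated as in
# the other splitters, so the overlap must not exceed the chunk size.
nltk_chunks = NLTKTextSplitter(chunk_size=100, chunk_overlap=0).split_text(text)

# spaCy-based splitting; the lightweight "sentencizer" pipeline avoids
# downloading a full statistical model such as en_core_web_sm.
spacy_chunks = SpacyTextSplitter(pipeline="sentencizer").split_text(text)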
@@ -0,0 +1,109 @@
"""Test text splitters that require an integration."""

import pytest

from langchain_text_splitters import (
    TokenTextSplitter,
)
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_text_splitters.sentence_transformers import (
    SentenceTransformersTokenTextSplitter,
)


def test_huggingface_type_check() -> None:
    """Test that type checks are done properly on input."""
    with pytest.raises(ValueError):
        CharacterTextSplitter.from_huggingface_tokenizer("foo")


def test_huggingface_tokenizer() -> None:
    """Test text splitter that uses a HuggingFace tokenizer."""
    from transformers import GPT2TokenizerFast

    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer, separator=" ", chunk_size=1, chunk_overlap=0
    )
    output = text_splitter.split_text("foo bar")
    assert output == ["foo", "bar"]


def test_token_text_splitter() -> None:
    """Test no overlap."""
    splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=0)
    output = splitter.split_text("abcdef" * 5)  # 10 token string
    expected_output = ["abcdefabcdefabc", "defabcdefabcdef"]
    assert output == expected_output


def test_token_text_splitter_overlap() -> None:
    """Test with overlap."""
    splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=1)
    output = splitter.split_text("abcdef" * 5)  # 10 token string
    expected_output = ["abcdefabcdefabc", "abcdefabcdefabc", "abcdef"]
    assert output == expected_output


def test_token_text_splitter_from_tiktoken() -> None:
    splitter = TokenTextSplitter.from_tiktoken_encoder(model_name="gpt-3.5-turbo")
    expected_tokenizer = "cl100k_base"
    actual_tokenizer = splitter._tokenizer.name
    assert expected_tokenizer == actual_tokenizer


def test_sentence_transformers_count_tokens() -> None:
    splitter = SentenceTransformersTokenTextSplitter(
        model_name="sentence-transformers/paraphrase-albert-small-v2"
    )
    text = "Lorem ipsum"

    token_count = splitter.count_tokens(text=text)

    expected_start_stop_token_count = 2
    expected_text_token_count = 5
    expected_token_count = expected_start_stop_token_count + expected_text_token_count

    assert expected_token_count == token_count


def test_sentence_transformers_split_text() -> None:
    splitter = SentenceTransformersTokenTextSplitter(
        model_name="sentence-transformers/paraphrase-albert-small-v2"
    )
    text = "lorem ipsum"
    text_chunks = splitter.split_text(text=text)
    expected_text_chunks = [text]
    assert expected_text_chunks == text_chunks


def test_sentence_transformers_multiple_tokens() -> None:
    splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)
    text = "Lorem "

    text_token_count_including_start_and_stop_tokens = splitter.count_tokens(text=text)
    count_start_and_end_tokens = 2
    token_multiplier = (
        count_start_and_end_tokens
        + (splitter.maximum_tokens_per_chunk - count_start_and_end_tokens)
        // (
            text_token_count_including_start_and_stop_tokens
            - count_start_and_end_tokens
        )
        + 1
    )

    # `text_to_split` does not fit in a single chunk
    text_to_embed = text * token_multiplier

    text_chunks = splitter.split_text(text=text_to_embed)

    expected_number_of_chunks = 2

    assert expected_number_of_chunks == len(text_chunks)
    actual = splitter.count_tokens(text=text_chunks[1]) - count_start_and_end_tokens
    expected = (
        token_multiplier * (text_token_count_including_start_and_stop_tokens - 2)
        - splitter.maximum_tokens_per_chunk
    )
    assert expected == actual
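For reference, the token-based splitters covered by these integration tests follow the same construction pattern in application code; a minimal sketch, assuming `tiktoken` and `sentence-transformers` are installed (the parameter values are illustrative, taken from the tests above):

from langchain_text_splitters import TokenTextSplitter
from langchain_text_splitters.sentence_transformers import (
    SentenceTransformersTokenTextSplitter,
)

# tiktoken-backed splitter: chunk_size and chunk_overlap are measured in tokens
# of the encoding inferred from the model name (cl100k_base for gpt-3.5-turbo).
token_splitter = TokenTextSplitter.from_tiktoken_encoder(
    model_name="gpt-3.5-turbo", chunk_size=5, chunk_overlap=0
)
chunks = token_splitter.split_text("abcdef" * 5)

# sentence-transformers splitter: sizes chunks against the named model's own
# tokenizer and maximum sequence length.
st_splitter = SentenceTransformersTokenTextSplitter(
    model_name="sentence-transformers/paraphrase-albert-small-v2", chunk_overlap=0
)
token_count = st_splitter.count_tokens(text="Lorem ipsum")
st_chunks = st_splitter.split_text(text="Lorem ipsum")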
libs/text-splitters/tests/unit_tests/__init__.py (new, empty file)
libs/text-splitters/tests/unit_tests/conftest.py (new file, 87 lines)
@@ -0,0 +1,87 @@
"""Configuration for unit tests."""
from importlib import util
from typing import Dict, Sequence

import pytest
from pytest import Config, Function, Parser


def pytest_addoption(parser: Parser) -> None:
    """Add custom command line options to pytest."""
    parser.addoption(
        "--only-extended",
        action="store_true",
        help="Only run extended tests. Does not allow skipping any extended tests.",
    )
    parser.addoption(
        "--only-core",
        action="store_true",
        help="Only run core tests. Never runs any extended tests.",
    )


def pytest_collection_modifyitems(config: Config, items: Sequence[Function]) -> None:
    """Add implementations for handling custom markers.

    At the moment, this adds support for a custom `requires` marker.

    The `requires` marker is used to denote tests that require one or more packages
    to be installed to run. If the package is not installed, the test is skipped.

    The `requires` marker syntax is:

    .. code-block:: python

        @pytest.mark.requires("package1", "package2")
        def test_something():
            ...
    """
    # Mapping from the name of a package to whether it is installed or not.
    # Used to avoid repeated calls to `util.find_spec`
    required_pkgs_info: Dict[str, bool] = {}

    only_extended = config.getoption("--only-extended") or False
    only_core = config.getoption("--only-core") or False

    if only_extended and only_core:
        raise ValueError("Cannot specify both `--only-extended` and `--only-core`.")

    for item in items:
        requires_marker = item.get_closest_marker("requires")
        if requires_marker is not None:
            if only_core:
                item.add_marker(pytest.mark.skip(reason="Skipping not a core test."))
                continue

            # Iterate through the list of required packages
            required_pkgs = requires_marker.args
            for pkg in required_pkgs:
                # If we haven't yet checked whether the pkg is installed
                # let's check it and store the result.
                if pkg not in required_pkgs_info:
                    try:
                        installed = util.find_spec(pkg) is not None
                    except Exception:
                        installed = False
                    required_pkgs_info[pkg] = installed

                if not required_pkgs_info[pkg]:
                    if only_extended:
                        pytest.fail(
                            f"Package `{pkg}` is not installed but is required for "
                            f"extended tests. Please install the given package and "
                            f"try again.",
                        )

                    else:
                        # If the package is not installed, we immediately break
                        # and mark the test as skipped.
                        item.add_marker(
                            pytest.mark.skip(reason=f"Requires pkg: `{pkg}`")
                        )
                        break
        else:
            if only_extended:
                item.add_marker(
                    pytest.mark.skip(reason="Skipping not an extended test.")
                )
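Taken together, the hook above defines two mutually exclusive test selections driven by the new command-line flags; a minimal sketch of invoking them programmatically (the test path is taken from this commit's layout):

import pytest

# Run only "core" tests: anything decorated with @pytest.mark.requires(...) is
# skipped, regardless of whether the required packages are installed.
pytest.main(["libs/text-splitters/tests/unit_tests", "--only-core"])

# Run only extended tests: tests without a `requires` marker are skipped, and a
# missing required package is reported as a failure instead of a skip.
pytest.main(["libs/text-splitters/tests/unit_tests", "--only-extended"])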
libs/text-splitters/tests/unit_tests/test_text_splitters.py (new file, 1352 lines)
File diff suppressed because it is too large.