This PR closes #27781

# Problem

The current implementation of `NLTKTextSplitter` uses `sent_tokenize`. However, `sent_tokenize` drops the characters that sit between two tokenized sentences, and this behavior throws errors when `add_start_index=True` is used, as described in issue #27781. In particular:

```python
from nltk.tokenize import sent_tokenize

output = sent_tokenize(
    "Innovation drives our success. Collaboration fosters creative solutions. Efficiency enhances data management.",
    language="english",
)
print(output)
>>> ['Innovation drives our success.', 'Collaboration fosters creative solutions.', 'Efficiency enhances data management.']
```

The separator characters between the sentences (here, the space after each period) never appear in the output, so the tokenized sentences cannot be reliably mapped back to their start indices in the original text.

# Solution

With the new `use_span_tokenize` parameter, NLTK's `span_tokenize` is used to create the sentences, and the extra characters between spans are added back so that the chunks can still be mapped to the original text. A minimal sketch of the span-based idea follows below.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Erick Friis <erickfriis@gmail.com>
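For illustration only (this sketch is not the PR's implementation and not part of the test file below; the tokenizer setup and variable names are assumptions): NLTK's `PunktSentenceTokenizer.span_tokenize` yields `(start, end)` character offsets into the original string, which is what makes start indices recoverable even when extra characters sit between sentences.

```python
# Minimal sketch of the span-based idea, not the PR's code.
from nltk.tokenize.punkt import PunktSentenceTokenizer

text = (
    "Innovation drives our success.  "  # double space between sentences
    "Collaboration fosters creative solutions."
)

tokenizer = PunktSentenceTokenizer()
for start, end in tokenizer.span_tokenize(text):
    # Each span indexes directly into the original string, so the start
    # offset of every sentence (and any characters between sentences)
    # can be recovered exactly.
    print(start, end, repr(text[start:end]))
```

Running this prints each span's offsets alongside the exact slice of the original text, with nothing lost in between, whereas `sent_tokenize` alone returns only the sentence strings.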
121 lines · 3.9 KiB · Python
"""Test text splitters that require an integration."""
|
|
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
from langchain_text_splitters import (
|
|
TokenTextSplitter,
|
|
)
|
|
from langchain_text_splitters.character import CharacterTextSplitter
|
|
from langchain_text_splitters.sentence_transformers import (
|
|
SentenceTransformersTokenTextSplitter,
|
|
)
|
|
|
|
|
|
@pytest.fixture()
|
|
def sentence_transformers() -> Any:
|
|
try:
|
|
import sentence_transformers
|
|
except ImportError:
|
|
pytest.skip("SentenceTransformers not installed.")
|
|
return sentence_transformers
|
|
|
|
|
|
def test_huggingface_type_check() -> None:
|
|
"""Test that type checks are done properly on input."""
|
|
with pytest.raises(ValueError):
|
|
CharacterTextSplitter.from_huggingface_tokenizer("foo")
|
|
|
|
|
|
def test_huggingface_tokenizer() -> None:
|
|
"""Test text splitter that uses a HuggingFace tokenizer."""
|
|
from transformers import GPT2TokenizerFast
|
|
|
|
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
|
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
|
|
tokenizer, separator=" ", chunk_size=1, chunk_overlap=0
|
|
)
|
|
output = text_splitter.split_text("foo bar")
|
|
assert output == ["foo", "bar"]
|
|
|
|
|
|
def test_token_text_splitter() -> None:
|
|
"""Test no overlap."""
|
|
splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=0)
|
|
output = splitter.split_text("abcdef" * 5) # 10 token string
|
|
expected_output = ["abcdefabcdefabc", "defabcdefabcdef"]
|
|
assert output == expected_output
|
|
|
|
|
|
def test_token_text_splitter_overlap() -> None:
|
|
"""Test with overlap."""
|
|
splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=1)
|
|
output = splitter.split_text("abcdef" * 5) # 10 token string
|
|
expected_output = ["abcdefabcdefabc", "abcdefabcdefabc", "abcdef"]
|
|
assert output == expected_output
|
|
|
|
|
|
def test_token_text_splitter_from_tiktoken() -> None:
|
|
splitter = TokenTextSplitter.from_tiktoken_encoder(model_name="gpt-3.5-turbo")
|
|
expected_tokenizer = "cl100k_base"
|
|
actual_tokenizer = splitter._tokenizer.name
|
|
assert expected_tokenizer == actual_tokenizer
|
|
|
|
|
|
def test_sentence_transformers_count_tokens(sentence_transformers: Any) -> None:
|
|
splitter = SentenceTransformersTokenTextSplitter(
|
|
model_name="sentence-transformers/paraphrase-albert-small-v2"
|
|
)
|
|
text = "Lorem ipsum"
|
|
|
|
token_count = splitter.count_tokens(text=text)
|
|
|
|
expected_start_stop_token_count = 2
|
|
expected_text_token_count = 5
|
|
expected_token_count = expected_start_stop_token_count + expected_text_token_count
|
|
|
|
assert expected_token_count == token_count
|
|
|
|
|
|
def test_sentence_transformers_split_text(sentence_transformers: Any) -> None:
|
|
splitter = SentenceTransformersTokenTextSplitter(
|
|
model_name="sentence-transformers/paraphrase-albert-small-v2"
|
|
)
|
|
text = "lorem ipsum"
|
|
text_chunks = splitter.split_text(text=text)
|
|
expected_text_chunks = [text]
|
|
assert expected_text_chunks == text_chunks
|
|
|
|
|
|
def test_sentence_transformers_multiple_tokens(sentence_transformers: Any) -> None:
|
|
splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)
|
|
text = "Lorem "
|
|
|
|
text_token_count_including_start_and_stop_tokens = splitter.count_tokens(text=text)
|
|
count_start_and_end_tokens = 2
|
|
token_multiplier = (
|
|
count_start_and_end_tokens
|
|
+ (splitter.maximum_tokens_per_chunk - count_start_and_end_tokens)
|
|
// (
|
|
text_token_count_including_start_and_stop_tokens
|
|
- count_start_and_end_tokens
|
|
)
|
|
+ 1
|
|
)
|
|
|
|
# `text_to_split` does not fit in a single chunk
|
|
text_to_embed = text * token_multiplier
|
|
|
|
text_chunks = splitter.split_text(text=text_to_embed)
|
|
|
|
expected_number_of_chunks = 2
|
|
|
|
assert expected_number_of_chunks == len(text_chunks)
|
|
actual = splitter.count_tokens(text=text_chunks[1]) - count_start_and_end_tokens
|
|
expected = (
|
|
token_multiplier * (text_token_count_including_start_and_stop_tokens - 2)
|
|
- splitter.maximum_tokens_per_chunk
|
|
)
|
|
assert expected == actual
|