mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-26 05:48:40 +00:00
Fix TextSplitter.from_tiktoken (#4361)
Thanks to @danb27 for the fix! Minor update Fixes https://github.com/hwchase17/langchain/issues/4357 --------- Co-authored-by: Dan Bianchini <42096328+danb27@users.noreply.github.com>
This commit is contained in:
@@ -23,19 +23,24 @@ def test_huggingface_tokenizer() -> None:
|
||||
assert output == ["foo", "bar"]
|
||||
|
||||
|
||||
class TestTokenTextSplitter:
    """Test token text splitter."""

    def test_basic(self) -> None:
        """Test no overlap."""
        splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=0)
        # "abcdef" * 5 tokenizes to 10 tokens, so chunk_size=5 gives two
        # disjoint chunks of 15 characters each.
        chunks = splitter.split_text("abcdef" * 5)  # 10 token string
        assert chunks == ["abcdefabcdefabc", "defabcdefabcdef"]

    def test_overlap(self) -> None:
        """Test with overlap."""
        splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=1)
        # With one token of overlap the same 10-token input yields three
        # chunks, each window sliding back by one token.
        chunks = splitter.split_text("abcdef" * 5)  # 10 token string
        assert chunks == ["abcdefabcdefabc", "abcdefabcdefabc", "abcdef"]
def test_token_text_splitter_from_tiktoken() -> None:
    """from_tiktoken_encoder should select the encoding that matches the model."""
    splitter = TokenTextSplitter.from_tiktoken_encoder(model_name="gpt-3.5-turbo")
    # gpt-3.5-turbo maps to the cl100k_base tiktoken encoding.
    assert splitter._tokenizer.name == "cl100k_base"
|
Reference in New Issue
Block a user