Fix TextSplitter.from_tiktoken (#4361)

Thanks to @danb27 for the fix! Minor update

Fixes https://github.com/hwchase17/langchain/issues/4357

---------

Co-authored-by: Dan Bianchini <42096328+danb27@users.noreply.github.com>
This commit is contained in:
Davis Chase
2023-05-08 16:36:38 -07:00
committed by GitHub
parent 782df1db10
commit 02ebb15c4a
2 changed files with 35 additions and 18 deletions

View File

@@ -23,19 +23,24 @@ def test_huggingface_tokenizer() -> None:
assert output == ["foo", "bar"]
class TestTokenTextSplitter:
"""Test token text splitter."""
def test_token_text_splitter() -> None:
    """Check that splitting with zero overlap yields the expected chunks."""
    text = "abcdef" * 5  # 10 token string
    chunks = TokenTextSplitter(chunk_size=5, chunk_overlap=0).split_text(text)
    assert chunks == ["abcdefabcdefabc", "defabcdefabcdef"]
def test_basic(self) -> None:
    """Check the chunks produced when chunk_overlap is 0."""
    text = "abcdef" * 5  # 10 token string
    result = TokenTextSplitter(chunk_size=5, chunk_overlap=0).split_text(text)
    assert result == ["abcdefabcdefabc", "defabcdefabcdef"]
def test_overlap(self) -> None:
    """Check the chunks produced when chunk_overlap is 1."""
    text = "abcdef" * 5  # 10 token string
    result = TokenTextSplitter(chunk_size=5, chunk_overlap=1).split_text(text)
    assert result == ["abcdefabcdefabc", "abcdefabcdefabc", "abcdef"]
def test_token_text_splitter_overlap() -> None:
    """Check that splitting with an overlap of one yields the expected chunks."""
    text = "abcdef" * 5  # 10 token string
    chunks = TokenTextSplitter(chunk_size=5, chunk_overlap=1).split_text(text)
    assert chunks == ["abcdefabcdefabc", "abcdefabcdefabc", "abcdef"]
def test_token_text_splitter_from_tiktoken() -> None:
    """Check the tokenizer name chosen by from_tiktoken_encoder for a model name."""
    splitter = TokenTextSplitter.from_tiktoken_encoder(model_name="gpt-3.5-turbo")
    # The splitter built for "gpt-3.5-turbo" is expected to carry this tokenizer name.
    assert "cl100k_base" == splitter._tokenizer.name