mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-16 15:04:13 +00:00
huggingface tokenizer (#75)
This commit is contained in:
23
tests/integration_tests/test_text_splitter.py
Normal file
23
tests/integration_tests/test_text_splitter.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""Test text splitters that require an integration."""
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
|
||||
|
||||
def test_huggingface_type_check() -> None:
    """Check that a non-tokenizer argument is rejected with ``ValueError``."""
    # A plain string stands in for something that is not a tokenizer object.
    not_a_tokenizer = "foo"
    with pytest.raises(ValueError):
        CharacterTextSplitter.from_huggingface_tokenizer(not_a_tokenizer)
|
||||
|
||||
|
||||
def test_huggingface_tokenizer() -> None:
    """Check splitting with a splitter built from the GPT-2 fast tokenizer."""
    # Function-local import: importing this module does not need transformers.
    from transformers import GPT2TokenizerFast

    splitter_options = {"separator": " ", "chunk_size": 1, "chunk_overlap": 0}
    gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        gpt2_tokenizer, **splitter_options
    )

    # With these options the two space-separated words must come back as
    # two separate chunks.
    chunks = splitter.split_text("foo bar")
    expected_chunks = ["foo", "bar"]
    assert chunks == expected_chunks
|
Reference in New Issue
Block a user