mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-04 06:03:31 +00:00
- Added unit tests for schema.py covering utility functions and token counting. - Fixed a nit: according to the Hugging Face docs, the tokenizer model is GPT-2. [link](https://huggingface.co/transformers/v4.8.2/_modules/transformers/models/gpt2/tokenization_gpt2_fast.html) - Ran `make lint && make format`; both passed locally. - Screenshot of the new tests' results: <img width="1283" alt="Screenshot 2023-04-27 at 9 51 55 PM" src="https://user-images.githubusercontent.com/62768671/235057441-c0ac3406-9541-453f-ba14-3ebb08656114.png">
16 lines
517 B
Python
16 lines
517 B
Python
"""Test token counting functionality."""
|
|
|
|
from langchain.schema import _get_num_tokens_default_method
|
|
|
|
|
|
class TestTokenCountingWithGPT2Tokenizer:
    """Pin the token counts returned by ``_get_num_tokens_default_method``.

    The expected values are hard-coded so that a change of the default
    tokenizer shows up as a test failure.
    """

    def test_empty_token(self) -> None:
        """An empty string is counted as zero tokens."""
        token_count = _get_num_tokens_default_method("")
        assert token_count == 0

    def test_multiple_tokens(self) -> None:
        """Three space-separated letters are counted as three tokens."""
        token_count = _get_num_tokens_default_method("a b c")
        assert token_count == 3

    def test_special_tokens(self) -> None:
        """Text containing ``:`` and ``_`` is counted as six tokens."""
        # test for consistency when the default tokenizer is changed
        token_count = _get_num_tokens_default_method("a:b_c d")
        assert token_count == 6
|