Related to https://github.com/langchain-ai/langchain/issues/30344. https://github.com/langchain-ai/langchain/pull/30542 introduced an erroneous test of token counts for o-series models: tiktoken==0.8, the version pinned in the lock file, does not support o-series models in `tiktoken.encoding_for_model(model_name)`, so we fell back to `cl100k_base`, which is the wrong encoding for those models. The test asserted the counts produced by that wrong encoding, so it passed with tiktoken 0.8. Here we update tiktoken to 0.9 in the lock file and fix the expected counts in the test. Verified that we now pull [o200k_base](https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L8), as expected.
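As a quick sanity check (not part of this change), one can confirm which encoding tiktoken resolves for an o-series model; this minimal sketch assumes tiktoken >= 0.9 is installed:

```python
import tiktoken

# With tiktoken >= 0.9 this resolves to o200k_base; under tiktoken 0.8 the
# lookup failed for o-series models and callers fell back to cl100k_base.
enc = tiktoken.encoding_for_model("o1")
print(enc.name)  # expected: "o200k_base"
```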
35 lines
987 B
Python
import pytest

from langchain_openai import ChatOpenAI, OpenAI

_EXPECTED_NUM_TOKENS = {
    "ada": 17,
    "babbage": 17,
    "curie": 17,
    "davinci": 17,
    "gpt-4": 12,
    "gpt-4-32k": 12,
    "gpt-3.5-turbo": 12,
    # o-series and gpt-4o counts assume tiktoken >= 0.9 (o200k_base);
    # tiktoken 0.8 fell back to cl100k_base and produced different counts.
    "o1": 11,
    "o3": 11,
    "gpt-4o": 11,
}

_MODELS = ["ada", "babbage", "curie", "davinci"]
_CHAT_MODELS = ["gpt-4", "gpt-4-32k", "gpt-3.5-turbo", "o1", "o3", "gpt-4o"]


@pytest.mark.xfail(reason="Old models require different tiktoken cached file")
@pytest.mark.parametrize("model", _MODELS)
def test_openai_get_num_tokens(model: str) -> None:
    """Test get_num_tokens for completion models."""
    llm = OpenAI(model=model)
    assert llm.get_num_tokens("表情符号是\n🦜🔗") == _EXPECTED_NUM_TOKENS[model]


@pytest.mark.parametrize("model", _CHAT_MODELS)
def test_chat_openai_get_num_tokens(model: str) -> None:
    """Test get_num_tokens for chat models."""
    llm = ChatOpenAI(model=model)
    assert llm.get_num_tokens("表情符号是\n🦜🔗") == _EXPECTED_NUM_TOKENS[model]
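For reference, the expected counts can be reproduced directly with tiktoken, without instantiating the LangChain models. This is a sketch, not part of the test file, and assumes tiktoken >= 0.9:

```python
import tiktoken

text = "表情符号是\n🦜🔗"
# gpt-4-family models resolve to cl100k_base; per this change, o-series and
# gpt-4o resolve to o200k_base. The printed counts should match
# _EXPECTED_NUM_TOKENS above.
for model in ["gpt-4", "o1", "gpt-4o"]:
    enc = tiktoken.encoding_for_model(model)
    print(model, enc.name, len(enc.encode(text)))
```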