Fix wrong token counts from get_num_tokens from openai llms (#2952)
The encoding fetch was out of date. Luckily OpenAI now has a nice [`encoding_for_model`](46287bfa49/tiktoken/model.py) function in `tiktoken` we can use.
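For reference, a minimal standalone sketch (not part of this patch; the model name and the `cl100k_base` fallback are illustrative choices) of how `tiktoken.encoding_for_model` resolves the right tokenizer and counts tokens:

```python
import tiktoken


def count_tokens(text: str, model_name: str) -> int:
    """Count tokens the way the OpenAI API tokenizes them for `model_name`."""
    try:
        # encoding_for_model maps a model name to its tokenizer
        # (e.g. gpt2/r50k_base, p50k_base, cl100k_base).
        enc = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Unknown model name: fall back to a generic encoding (illustrative choice).
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))


print(count_tokens("tiktoken keeps the counts in sync with the API.", "text-davinci-003"))
```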
parent fea5619ce9
commit b9db20481f
```diff
@@ -446,15 +446,9 @@ class BaseOpenAI(BaseLLM):
                 "This is needed in order to calculate get_num_tokens. "
                 "Please install it with `pip install tiktoken`."
             )
-        encoder = "gpt2"
-        if self.model_name in ("text-davinci-003", "text-davinci-002"):
-            encoder = "p50k_base"
-        if self.model_name.startswith("code"):
-            encoder = "p50k_base"
-        # create a GPT-3 encoder instance
-        enc = tiktoken.get_encoding(encoder)
-
-        # encode the text using the GPT-3 encoder
+        enc = tiktoken.encoding_for_model(self.model_name)
+
         tokenized_text = enc.encode(text)
 
         # calculate the number of tokens in the encoded text
```
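To see why the old lookup could drift out of date, here is a quick, hypothetical comparison of the previous hard-coded default encoding against a model-specific one (the model name is illustrative, and the actual counts depend on the text and the installed `tiktoken` version):

```python
import tiktoken

text = "LangChain counts tokens before calling the OpenAI API."

old_enc = tiktoken.get_encoding("gpt2")                 # old hard-coded default
new_enc = tiktoken.encoding_for_model("gpt-3.5-turbo")  # resolved per model (cl100k_base here)

print("gpt2:", len(old_enc.encode(text)))
print(new_enc.name + ":", len(new_enc.encode(text)))
```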