mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-13 22:59:05 +00:00
Tiktoken override (#6697)
This commit is contained in:
parent
f9771700e4
commit
e9877ea8b1
@ -184,6 +184,16 @@ class ChatOpenAI(BaseChatModel):
|
|||||||
"""Number of chat completions to generate for each prompt."""
|
"""Number of chat completions to generate for each prompt."""
|
||||||
max_tokens: Optional[int] = None
|
max_tokens: Optional[int] = None
|
||||||
"""Maximum number of tokens to generate."""
|
"""Maximum number of tokens to generate."""
|
||||||
|
tiktoken_model_name: Optional[str] = None
|
||||||
|
"""The model name to pass to tiktoken when using this class.
|
||||||
|
Tiktoken is used to count the number of tokens in documents to constrain
|
||||||
|
them to be under a certain limit. By default, when set to None, this will
|
||||||
|
be the same as the chat model name. However, there are some cases
|
||||||
|
where you may want to use this chat model class with a model name not
|
||||||
|
supported by tiktoken. This can include when using Azure OpenAI or
|
||||||
|
when using one of the many model providers that expose an OpenAI-like
|
||||||
|
API but with different models. In those cases, in order to avoid erroring
|
||||||
|
when tiktoken is called, you can specify a model name to use here."""
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
"""Configuration for this pydantic object."""
|
"""Configuration for this pydantic object."""
|
||||||
@ -448,6 +458,9 @@ class ChatOpenAI(BaseChatModel):
|
|||||||
|
|
||||||
def _get_encoding_model(self) -> Tuple[str, tiktoken.Encoding]:
|
def _get_encoding_model(self) -> Tuple[str, tiktoken.Encoding]:
|
||||||
tiktoken_ = _import_tiktoken()
|
tiktoken_ = _import_tiktoken()
|
||||||
|
if self.tiktoken_model_name is not None:
|
||||||
|
model = self.tiktoken_model_name
|
||||||
|
else:
|
||||||
model = self.model_name
|
model = self.model_name
|
||||||
if model == "gpt-3.5-turbo":
|
if model == "gpt-3.5-turbo":
|
||||||
# gpt-3.5-turbo may change over time.
|
# gpt-3.5-turbo may change over time.
|
||||||
|
@ -170,6 +170,16 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
|
|||||||
request_timeout: Optional[Union[float, Tuple[float, float]]] = None
|
request_timeout: Optional[Union[float, Tuple[float, float]]] = None
|
||||||
"""Timeout in seconds for the OpenAI request."""
|
"""Timeout in seconds for the OpenAI request."""
|
||||||
headers: Any = None
|
headers: Any = None
|
||||||
|
tiktoken_model_name: Optional[str] = None
|
||||||
|
"""The model name to pass to tiktoken when using this class.
|
||||||
|
Tiktoken is used to count the number of tokens in documents to constrain
|
||||||
|
them to be under a certain limit. By default, when set to None, this will
|
||||||
|
be the same as the embedding model name. However, there are some cases
|
||||||
|
where you may want to use this Embedding class with a model name not
|
||||||
|
supported by tiktoken. This can include when using Azure embeddings or
|
||||||
|
when using one of the many model providers that expose an OpenAI-like
|
||||||
|
API but with different models. In those cases, in order to avoid erroring
|
||||||
|
when tiktoken is called, you can specify a model name to use here."""
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
"""Configuration for this pydantic object."""
|
"""Configuration for this pydantic object."""
|
||||||
@ -265,7 +275,13 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
|
|||||||
|
|
||||||
tokens = []
|
tokens = []
|
||||||
indices = []
|
indices = []
|
||||||
encoding = tiktoken.model.encoding_for_model(self.model)
|
model_name = self.tiktoken_model_name or self.model
|
||||||
|
try:
|
||||||
|
encoding = tiktoken.encoding_for_model(model_name)
|
||||||
|
except KeyError:
|
||||||
|
logger.warning("Warning: model not found. Using cl100k_base encoding.")
|
||||||
|
model = "cl100k_base"
|
||||||
|
encoding = tiktoken.get_encoding(model)
|
||||||
for i, text in enumerate(texts):
|
for i, text in enumerate(texts):
|
||||||
if self.model.endswith("001"):
|
if self.model.endswith("001"):
|
||||||
# See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
|
# See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
|
||||||
@ -329,7 +345,13 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
|
|||||||
|
|
||||||
tokens = []
|
tokens = []
|
||||||
indices = []
|
indices = []
|
||||||
encoding = tiktoken.model.encoding_for_model(self.model)
|
model_name = self.tiktoken_model_name or self.model
|
||||||
|
try:
|
||||||
|
encoding = tiktoken.encoding_for_model(model_name)
|
||||||
|
except KeyError:
|
||||||
|
logger.warning("Warning: model not found. Using cl100k_base encoding.")
|
||||||
|
model = "cl100k_base"
|
||||||
|
encoding = tiktoken.get_encoding(model)
|
||||||
for i, text in enumerate(texts):
|
for i, text in enumerate(texts):
|
||||||
if self.model.endswith("001"):
|
if self.model.endswith("001"):
|
||||||
# See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
|
# See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
|
||||||
|
@ -171,6 +171,16 @@ class BaseOpenAI(BaseLLM):
|
|||||||
"""Set of special tokens that are allowed."""
|
"""Set of special tokens that are allowed."""
|
||||||
disallowed_special: Union[Literal["all"], Collection[str]] = "all"
|
disallowed_special: Union[Literal["all"], Collection[str]] = "all"
|
||||||
"""Set of special tokens that are not allowed."""
|
"""Set of special tokens that are not allowed."""
|
||||||
|
tiktoken_model_name: Optional[str] = None
|
||||||
|
"""The model name to pass to tiktoken when using this class.
|
||||||
|
Tiktoken is used to count the number of tokens in documents to constrain
|
||||||
|
them to be under a certain limit. By default, when set to None, this will
|
||||||
|
be the same as the model name. However, there are some cases
|
||||||
|
where you may want to use this class with a model name not
|
||||||
|
supported by tiktoken. This can include when using Azure OpenAI or
|
||||||
|
when using one of the many model providers that expose an OpenAI-like
|
||||||
|
API but with different models. In those cases, in order to avoid erroring
|
||||||
|
when tiktoken is called, you can specify a model name to use here."""
|
||||||
|
|
||||||
def __new__(cls, **data: Any) -> Union[OpenAIChat, BaseOpenAI]: # type: ignore
|
def __new__(cls, **data: Any) -> Union[OpenAIChat, BaseOpenAI]: # type: ignore
|
||||||
"""Initialize the OpenAI object."""
|
"""Initialize the OpenAI object."""
|
||||||
@ -491,7 +501,13 @@ class BaseOpenAI(BaseLLM):
|
|||||||
"Please install it with `pip install tiktoken`."
|
"Please install it with `pip install tiktoken`."
|
||||||
)
|
)
|
||||||
|
|
||||||
enc = tiktoken.encoding_for_model(self.model_name)
|
model_name = self.tiktoken_model_name or self.model_name
|
||||||
|
try:
|
||||||
|
enc = tiktoken.encoding_for_model(model_name)
|
||||||
|
except KeyError:
|
||||||
|
logger.warning("Warning: model not found. Using cl100k_base encoding.")
|
||||||
|
model = "cl100k_base"
|
||||||
|
enc = tiktoken.get_encoding(model)
|
||||||
|
|
||||||
return enc.encode(
|
return enc.encode(
|
||||||
text,
|
text,
|
||||||
|
Loading…
Reference in New Issue
Block a user