Extend OpenAIEmbeddings class to support non-tiktoken based embeddings (#13884)
- **Description:** Extends `OpenAIEmbeddings` to support non-`tiktoken`-based embeddings, specifically for use with the new `text-generation-webui` API (`--extensions openai`), which does not accept `tiktoken` token encodings, only strings.
- **Issue:** Not found.
- **Dependencies:** HuggingFace `transformers.AutoTokenizer` is a new dependency for running a model without `tiktoken`.
- **Tag maintainer:** @baskaryan, based on the last commit for the `langchain-core` refactor.
- **Twitter handle:** @xychelsea

Modifies the tokenization process to be model-agnostic, allowing both OpenAI and non-OpenAI tokenizations, controlled by a new `bool` flag, `tiktoken_enabled`. Setting it to `False` requires HuggingFace's `AutoTokenizer` and handles tokenization for models with different preprocessing steps, producing a request of chunked strings rather than a list of integers. The embeddings-generation process is updated to accommodate non-OpenAI models, converting tokenized text into embeddings using either OpenAI's or Hugging Face's model architectures.
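A minimal usage sketch of the new flag, assuming a locally running `text-generation-webui` instance started with `--extensions openai`; the endpoint URL, port, and model name below are illustrative placeholders, not values from this PR:

```python
from langchain.embeddings import OpenAIEmbeddings

# Hypothetical local backend: text-generation-webui's OpenAI-compatible API
# serving a HuggingFace model (URL and model name are placeholders).
embeddings = OpenAIEmbeddings(
    model="sentence-transformers/all-mpnet-base-v2",
    openai_api_base="http://localhost:5001/v1",
    openai_api_key="dummy",  # the extension does not validate the key
    tiktoken_enabled=False,  # use transformers.AutoTokenizer instead of tiktoken
)

vectors = embeddings.embed_documents(["first document", "second document"])
print(len(vectors), len(vectors[0]))  # document count, embedding dimension
```

Note that with `tiktoken_enabled=False` the `model` value must be resolvable by `AutoTokenizer.from_pretrained`, since it doubles as the tokenizer name.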
parent 9b59bde93d
commit 2780d2d4dd
@@ -210,6 +210,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or
         None."""
     headers: Any = None
+    tiktoken_enabled: bool = True
+    """Set this to False for non-OpenAI implementations of the embeddings API, e.g.
+    the `--extensions openai` extension for `text-generation-webui`"""
     tiktoken_model_name: Optional[str] = None
     """The model name to pass to tiktoken when using this class.
     Tiktoken is used to count the number of tokens in documents to constrain
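For reference, the `tiktoken_model_name` fallback used further down behaves roughly like this standalone sketch (the `get_encoding_for` helper is mine, not from the diff):

```python
import tiktoken

def get_encoding_for(model_name: str) -> tiktoken.Encoding:
    # Unknown model names fall back to the general-purpose cl100k_base
    # encoding, mirroring the except-KeyError branch in the diff below.
    try:
        return tiktoken.encoding_for_model(model_name)
    except KeyError:
        return tiktoken.get_encoding("cl100k_base")

enc = get_encoding_for("text-embedding-ada-002")
print(len(enc.encode("How many tokens is this?")))  # token count for the string
```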
@@ -382,42 +385,87 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to use OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both tiktoken
+        and HuggingFace tokenizers, selected by the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
 
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
-
-        batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
+
+        # If tiktoken flag set to False
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order to use OpenAIEmbeddings without "
+                    "`tiktoken`. Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
+            for i, text in enumerate(texts):
+                # Tokenize the text using HuggingFace transformers
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order to use OpenAIEmbeddings. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
 
         if self.show_progress_bar:
             try:
                 from tqdm.auto import tqdm
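The heart of the new non-tiktoken branch above is an encode, slice, decode round trip that turns one long text into several context-sized string chunks. A standalone sketch of the same idea, with an illustrative model name and chunk length (the `chunk_text` helper is mine):

```python
from typing import List

from transformers import AutoTokenizer

def chunk_text(text: str, model_name: str, ctx_length: int = 512) -> List[str]:
    # Tokenize without special tokens, slice the token IDs into windows of
    # at most ctx_length, then decode each window back into a plain string.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(token_ids[j : j + ctx_length])
        for j in range(0, len(token_ids), ctx_length)
    ]

chunks = chunk_text("a very long document " * 1000, "bert-base-uncased")
print(len(chunks))  # number of string chunks sent to the embeddings endpoint
```

Decode-then-re-encode is not guaranteed to round-trip exactly for every tokenizer (byte-level BPE boundaries can shift slightly at chunk edges), which is an inherent trade-off of sending strings instead of token IDs.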
@@ -428,6 +476,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
         else:
             _iter = range(0, len(tokens), _chunk_size)
 
+        batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = embed_with_retry(
                 self,
@@ -446,6 +495,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             results[indices[i]].append(batched_embeddings[i])
             num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
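The hunks above collect per-chunk embeddings and per-chunk token counts for each original text, but elide the recombination step. Assuming it matches the surrounding code in this file, each text's chunk embeddings are combined by a token-count-weighted average and re-normalized; a minimal sketch of that assumption:

```python
from typing import List

import numpy as np

def combine_chunks(
    chunk_embeddings: List[List[float]], chunk_token_counts: List[int]
) -> List[float]:
    # Average the chunk vectors, weighting longer chunks more heavily, then
    # rescale to unit length so cosine similarities remain comparable.
    avg = np.average(chunk_embeddings, axis=0, weights=chunk_token_counts)
    return (avg / np.linalg.norm(avg)).tolist()

# e.g. one document that was split into chunks of 512 and 128 tokens:
doc_vector = combine_chunks([[0.1, 0.3], [0.5, 0.7]], [512, 128])
```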
@@ -468,38 +518,86 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     async def _aget_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to use OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Asynchronously generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and asynchronous embedding generation,
+        respecting the set embedding context length and chunk size. It supports both
+        `tiktoken` and HuggingFace tokenizers, selected by the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
 
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
+        _chunk_size = chunk_size or self.chunk_size
+
+        # If tiktoken flag set to False
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order to use OpenAIEmbeddings without "
+                    "`tiktoken`. Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
             )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
+            for i, text in enumerate(texts):
+                # Tokenize the text using HuggingFace transformers
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order to use OpenAIEmbeddings. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
 
         batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size

@@ -520,6 +618,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             results[indices[i]].append(batched_embeddings[i])
             num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
            _result = results[i]
            if len(_result) == 0:
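For completeness, a hedged sketch of exercising the async path, with the same illustrative (not authoritative) endpoint and model name as the sync example near the top:

```python
import asyncio

from langchain.embeddings import OpenAIEmbeddings

async def main() -> None:
    # Placeholder configuration; model must be loadable by AutoTokenizer
    # when tiktoken_enabled is False.
    embeddings = OpenAIEmbeddings(
        model="sentence-transformers/all-mpnet-base-v2",
        openai_api_base="http://localhost:5001/v1",
        openai_api_key="dummy",
        tiktoken_enabled=False,
    )
    vectors = await embeddings.aembed_documents(["an async-embedded document"])
    print(len(vectors[0]))  # embedding dimension

asyncio.run(main())
```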