Extend OpenAIEmbeddings class to support non-tiktoken based embeddings (#13884)
- **Description:** Extends `OpenAIEmbeddings` to support non-`tiktoken`-based embeddings, specifically for use with the new `text-generation-webui` API (`--extensions openai`), which accepts plain strings rather than `tiktoken` encodings.
- **Issue:** None found.
- **Dependencies:** HuggingFace `transformers.AutoTokenizer` is a new dependency for running the model without `tiktoken`.
- **Tag maintainer:** @baskaryan (based on the last commit for the `langchain-core` refactor)
- **Twitter handle:** @xychelsea

Tokenization is now model-agnostic, supporting both OpenAI and non-OpenAI models: a new `bool` flag, `tiktoken_enabled` (default `True`), selects the tokenizer, and setting it to `False` switches to the non-OpenAI path. That path requires HuggingFace's `AutoTokenizer` and handles models with different preprocessing steps by sending a chunked string request rather than a list of integers. The embeddings generation process was also updated to accommodate non-OpenAI models, converting tokenized text into embeddings using either OpenAI's or Hugging Face's model architectures.
This commit is contained in:
parent 9b59bde93d · commit 2780d2d4dd
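For orientation before the diff: a minimal usage sketch of the feature this commit adds. The endpoint URL, port, and model id below are placeholders for a local `text-generation-webui` server started with `--extensions openai`, not values taken from the commit itself.

```python
from langchain.embeddings import OpenAIEmbeddings

# Hypothetical local text-generation-webui endpoint exposing the
# OpenAI-compatible API (`--extensions openai`); adjust host/port to your setup.
embeddings = OpenAIEmbeddings(
    openai_api_base="http://localhost:5001/v1",  # placeholder URL
    openai_api_key="sk-dummy",  # local servers typically ignore the key
    model="sentence-transformers/all-mpnet-base-v2",  # placeholder HF model id
    tiktoken_enabled=False,  # use HuggingFace AutoTokenizer instead of tiktoken
)

vector = embeddings.embed_query("Hello, world!")
print(len(vector))
```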
@@ -210,6 +210,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or
         None."""
     headers: Any = None
+    tiktoken_enabled: bool = True
+    """Set this to False for non-OpenAI implementations of the embeddings API, e.g.
+    the `--extensions openai` extension for `text-generation-webui`"""
     tiktoken_model_name: Optional[str] = None
     """The model name to pass to tiktoken when using this class.
     Tiktoken is used to count the number of tokens in documents to constrain
@@ -382,42 +385,87 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both tiktoken
+        and HuggingFace tokenizer based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
+
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
-
-        batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
+
+        # If tiktoken flag set to False
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order to for OpenAIEmbeddings without "
+                    "`tiktoken`. Please install it with `pip install transformers`. "
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
+            for i, text in enumerate(texts):
+                # Tokenize the text using HuggingFace transformers
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order to for OpenAIEmbeddings. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
 
         if self.show_progress_bar:
             try:
                 from tqdm.auto import tqdm
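The non-`tiktoken` branch above amounts to a tokenize → chunk → decode round trip that turns long inputs into context-length-sized strings. A standalone sketch of that logic outside the class, assuming the `gpt2` tokenizer and a context length of 8191 purely for illustration:

```python
from typing import List, Tuple

from transformers import AutoTokenizer


def chunk_texts(texts: List[str], ctx_length: int = 8191) -> Tuple[List[str], List[int]]:
    """Split each text into strings of at most ctx_length tokens, recording
    which original text each chunk came from."""
    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
    chunks: List[str] = []
    indices: List[int] = []
    for i, text in enumerate(texts):
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        for j in range(0, len(token_ids), ctx_length):
            # Decode each slice back to a string: the non-tiktoken API takes
            # text, not token IDs.
            chunks.append(tokenizer.decode(token_ids[j : j + ctx_length]))
            indices.append(i)
    return chunks, indices


chunks, indices = chunk_texts(["short text", "a longer document " * 3000])
print(len(chunks), indices)
```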
@@ -428,6 +476,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
         else:
             _iter = range(0, len(tokens), _chunk_size)
 
+        batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = embed_with_retry(
                 self,
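The `batched_embeddings` list introduced here collects one embedding per chunk, requested `_chunk_size` chunks at a time. A minimal sketch of that batching loop, with a hypothetical `embed_batch` callable standing in for `embed_with_retry` and the embeddings endpoint:

```python
from typing import Callable, List


def embed_in_batches(
    chunks: List[str],
    chunk_size: int,
    embed_batch: Callable[[List[str]], List[List[float]]],
) -> List[List[float]]:
    """Call the embeddings backend once per chunk_size-sized slice of chunks."""
    batched_embeddings: List[List[float]] = []
    for i in range(0, len(chunks), chunk_size):
        # Each request embeds up to chunk_size chunks in one call.
        batched_embeddings.extend(embed_batch(chunks[i : i + chunk_size]))
    return batched_embeddings


# Usage with a fake backend that returns 3-dimensional zero vectors:
fake_backend = lambda batch: [[0.0, 0.0, 0.0] for _ in batch]
print(len(embed_in_batches(["a", "b", "c"], 2, fake_backend)))  # -> 3
```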
@@ -446,6 +495,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             results[indices[i]].append(batched_embeddings[i])
             num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
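The bookkeeping in this hunk regroups chunk embeddings by `indices` so each input text can be rebuilt from its chunks; in the surrounding code (not shown in the diff), the upstream implementation combines them with a weighted average followed by L2 normalization. A sketch of that recombination under that assumption — note that in the non-tiktoken branch `tokens[i]` is a decoded string, so `len(tokens[i])` weights by character count rather than token count:

```python
from typing import List

import numpy as np


def combine_chunks(
    batched_embeddings: List[List[float]],
    indices: List[int],
    weights: List[int],
    num_texts: int,
) -> List[List[float]]:
    """Weighted-average the chunk embeddings belonging to each input text."""
    results: List[List[List[float]]] = [[] for _ in range(num_texts)]
    num_tokens: List[List[int]] = [[] for _ in range(num_texts)]
    for emb, idx, w in zip(batched_embeddings, indices, weights):
        results[idx].append(emb)
        num_tokens[idx].append(w)

    embeddings: List[List[float]] = []
    for chunk_embs, chunk_weights in zip(results, num_tokens):
        # Weight each chunk by its (token or character) count, then normalize.
        average = np.average(chunk_embs, axis=0, weights=chunk_weights)
        embeddings.append((average / np.linalg.norm(average)).tolist())
    return embeddings
```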
@@ -468,38 +518,86 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     async def _aget_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Asynchronously generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and asynchronous embedding generation,
+        respecting the set embedding context length and chunk size. It supports both
+        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
+
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
+        _chunk_size = chunk_size or self.chunk_size
+
+        # If tiktoken flag set to False
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order to for OpenAIEmbeddings without "
+                    " `tiktoken`. Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
+            for i, text in enumerate(texts):
+                # Tokenize the text using HuggingFace transformers
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order to for OpenAIEmbeddings. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
 
         batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
@@ -520,6 +618,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             results[indices[i]].append(batched_embeddings[i])
             num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
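Callers reach this async path through `aembed_documents`, which delegates to `_aget_len_safe_embeddings`. A minimal sketch, under the same placeholder-endpoint assumptions as the earlier example:

```python
import asyncio

from langchain.embeddings import OpenAIEmbeddings


async def main() -> None:
    embeddings = OpenAIEmbeddings(
        openai_api_base="http://localhost:5001/v1",  # placeholder URL
        openai_api_key="sk-dummy",
        model="sentence-transformers/all-mpnet-base-v2",  # placeholder model id
        tiktoken_enabled=False,
    )
    vectors = await embeddings.aembed_documents(["first document", "second document"])
    print(len(vectors), len(vectors[0]))


asyncio.run(main())
```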