Extend OpenAIEmbeddings class to support non-tiktoken based embeddings (#13884)

- **Description:** This extends `OpenAIEmbeddings` to add support for
non-`tiktoken`-based embeddings, specifically for use with the new
`text-generation-webui` API (`--extensions openai`), which does not
accept `tiktoken` token encodings but instead expects plain strings
- **Issue:** Not found
- **Dependencies:** HuggingFace `transformers.AutoTokenizer` is a new
dependency for running the model without `tiktoken`
- **Tag maintainer:** @baskaryan, based on the last commit for the
`langchain-core` refactor
- **Twitter handle:** @xychelsea

Modified the tokenization process to be model-agnostic, allowing for
both OpenAI and non-OpenAI model tokenizations via a new `bool` flag,
`tiktoken_enabled` (default `True`). Setting it to `False` requires
HuggingFace's `AutoTokenizer` and handles tokenization for models with
different preprocessing steps, generating a chunked string request
rather than a list of integers; a standalone sketch of that chunking
step follows below.
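
A standalone sketch of the chunking step, assuming `transformers` is
installed (the model name and chunk length here are only examples, not
part of this change):

```python
from transformers import AutoTokenizer

# Example model; any HuggingFace tokenizer repo id works here.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
ctx_length = 8  # stands in for embedding_ctx_length

text = "The quick brown fox jumps over the lazy dog. " * 4
token_ids = tokenizer.encode(text, add_special_tokens=False)

# Each chunk is decoded back to a string, so the request body carries
# plain text rather than token IDs.
chunks = [
    tokenizer.decode(token_ids[j : j + ctx_length])
    for j in range(0, len(token_ids), ctx_length)
]
print(chunks[:2])
```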

Updated the embeddings-generation process to accommodate non-OpenAI
models. This includes converting the tokenized text into embeddings
using either OpenAI's or HuggingFace's model architectures.
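
For illustration, a minimal usage sketch assuming a local
`text-generation-webui` instance serving the OpenAI-compatible API; the
URL, API key, and model name are placeholders, and only
`tiktoken_enabled` is introduced by this change:

```python
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    # Placeholder endpoint for text-generation-webui's `--extensions openai`.
    openai_api_base="http://localhost:5001/v1",
    openai_api_key="dummy",  # local servers typically ignore the key
    # With tiktoken_enabled=False, this name is also passed to
    # AutoTokenizer.from_pretrained, so it should be a HuggingFace repo id.
    model="sentence-transformers/all-MiniLM-L6-v2",
    tiktoken_enabled=False,
)

vector = embeddings.embed_query("Hello, world!")
```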
Chelsea E. Manning 2023-12-03 15:04:17 -05:00 committed by GitHub
parent 9b59bde93d
commit 2780d2d4dd
@@ -210,6 +210,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or
         None."""
     headers: Any = None
+    tiktoken_enabled: bool = True
+    """Set this to False for non-OpenAI implementations of the embeddings API, e.g.
+    the `--extensions openai` extension for `text-generation-webui`."""
     tiktoken_model_name: Optional[str] = None
     """The model name to pass to tiktoken when using this class.
     Tiktoken is used to count the number of tokens in documents to constrain
@@ -382,7 +385,54 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting
+        the set embedding context length and chunk size. It supports both
+        tiktoken and HuggingFace tokenizers, based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
+        tokens = []
+        indices = []
+        model_name = self.tiktoken_model_name or self.model
+        _chunk_size = chunk_size or self.chunk_size
+
+        # If the tiktoken flag is set to False, tokenize with HuggingFace instead
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order to use OpenAIEmbeddings without "
+                    "`tiktoken`. Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
+            for i, text in enumerate(texts):
+                # Tokenize the text using HuggingFace transformers
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
-        try:
-            import tiktoken
-        except ImportError:
+        else:
+            try:
+                import tiktoken
+            except ImportError:
@@ -392,9 +442,6 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                     "Please install it with `pip install tiktoken`."
                 )
-        tokens = []
-        indices = []
-        model_name = self.tiktoken_model_name or self.model
             try:
                 encoding = tiktoken.encoding_for_model(model_name)
             except KeyError:
@@ -403,21 +450,22 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                 encoding = tiktoken.get_encoding(model)
             for i, text in enumerate(texts):
                 if self.model.endswith("001"):
-                    # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
                     # replace newlines, which can negatively affect performance.
                     text = text.replace("\n", " ")
                 token = encoding.encode(
-                    text,
+                    text=text,
                     allowed_special=self.allowed_special,
                     disallowed_special=self.disallowed_special,
                 )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
                 for j in range(0, len(token), self.embedding_ctx_length):
                     tokens.append(token[j : j + self.embedding_ctx_length])
                     indices.append(i)
-        batched_embeddings: List[List[float]] = []
-        _chunk_size = chunk_size or self.chunk_size
+
         if self.show_progress_bar:
             try:
                 from tqdm.auto import tqdm
@@ -428,6 +476,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
         else:
             _iter = range(0, len(tokens), _chunk_size)
+        batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = embed_with_retry(
                 self,
@@ -446,6 +495,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             results[indices[i]].append(batched_embeddings[i])
             num_tokens_in_batch[indices[i]].append(len(tokens[i]))
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
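
The averaging step that consumes `results` and `num_tokens_in_batch`
lies outside the hunks shown here. As a rough sketch of how the
per-chunk vectors are recombined, a token-count-weighted mean
re-normalized to unit length (the helper name is ours, not LangChain's):

```python
import numpy as np

def combine_chunk_embeddings(
    chunk_vectors: list[list[float]], chunk_token_counts: list[int]
) -> list[float]:
    # Weight each chunk's vector by how many tokens it covered, then
    # re-normalize so the result matches single-chunk output in scale.
    average = np.average(chunk_vectors, axis=0, weights=chunk_token_counts)
    return (average / np.linalg.norm(average)).tolist()

# e.g. two chunks of 512 and 128 tokens
print(combine_chunk_embeddings([[1.0, 0.0], [0.0, 1.0]], [512, 128]))
```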
@@ -468,7 +518,54 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     async def _aget_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
+        """
+        Asynchronously generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and asynchronous embedding generation,
+        respecting the set embedding context length and chunk size. It supports
+        both tiktoken and HuggingFace tokenizers, based on the tiktoken_enabled
+        flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
+        tokens = []
+        indices = []
+        model_name = self.tiktoken_model_name or self.model
+        _chunk_size = chunk_size or self.chunk_size
+
+        # If the tiktoken flag is set to False, tokenize with HuggingFace instead
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order to use OpenAIEmbeddings without "
+                    "`tiktoken`. Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
+            for i, text in enumerate(texts):
+                # Tokenize the text using HuggingFace transformers
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
-        try:
-            import tiktoken
-        except ImportError:
+        else:
+            try:
+                import tiktoken
+            except ImportError:
@@ -478,9 +575,6 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                     "Please install it with `pip install tiktoken`."
                 )
-        tokens = []
-        indices = []
-        model_name = self.tiktoken_model_name or self.model
             try:
                 encoding = tiktoken.encoding_for_model(model_name)
             except KeyError:
@@ -489,14 +583,18 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                 encoding = tiktoken.get_encoding(model)
             for i, text in enumerate(texts):
                 if self.model.endswith("001"):
-                    # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
                     # replace newlines, which can negatively affect performance.
                     text = text.replace("\n", " ")
                 token = encoding.encode(
-                    text,
+                    text=text,
                     allowed_special=self.allowed_special,
                     disallowed_special=self.disallowed_special,
                 )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
                 for j in range(0, len(token), self.embedding_ctx_length):
                     tokens.append(token[j : j + self.embedding_ctx_length])
                     indices.append(i)
@@ -520,6 +618,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             results[indices[i]].append(batched_embeddings[i])
             num_tokens_in_batch[indices[i]].append(len(tokens[i]))
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
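
A matching sketch of the async path (same placeholder endpoint and model
as above), which reaches `_aget_len_safe_embeddings` via
`aembed_documents`:

```python
import asyncio

from langchain.embeddings import OpenAIEmbeddings

async def main() -> None:
    embeddings = OpenAIEmbeddings(
        openai_api_base="http://localhost:5001/v1",  # placeholder endpoint
        openai_api_key="dummy",
        model="sentence-transformers/all-MiniLM-L6-v2",
        tiktoken_enabled=False,
    )
    vectors = await embeddings.aembed_documents(["first text", "second text"])
    print(len(vectors), len(vectors[0]))

asyncio.run(main())
```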