Extend OpenAIEmbeddings class to support non-tiktoken based embeddings (#13884)

- **Description:** This extends `OpenAIEmbeddings` to add support for
non-`tiktoken`-based embeddings, specifically for use with the new
`text-generation-webui` API (`--extensions openai`), which does not
accept `tiktoken` token encodings but expects plain strings instead.
- **Issue:** None found.
- **Dependencies:** HuggingFace’s `transformers.AutoTokenizer` is a new
dependency for running the model without `tiktoken`.
- **Tag maintainer:** @baskaryan, based on the last commit for the
`langchain-core` refactor.
- **Twitter handle:** @xychelsea

Modified the tokenization process to be model-agnostic, allowing for
both OpenAI and non-OpenAI tokenizations. The new `bool` flag
`tiktoken_enabled` defaults to `True`; setting it to `False` switches to
HuggingFace’s `AutoTokenizer`, which handles models whose preprocessing
differs from OpenAI’s and produces a chunked string request rather than
a list of integers.

Updated the embedding-generation process to accommodate non-OpenAI
models. This includes converting tokenized text into embeddings via
either OpenAI’s `tiktoken` path or Hugging Face’s tokenizer path.
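
For example, a minimal usage sketch of the new flag (the base URL, port,
key, and model name below are hypothetical placeholders for a local
`text-generation-webui` server, not values taken from this PR):

```python
from langchain.embeddings import OpenAIEmbeddings

# Hypothetical local text-generation-webui endpoint started with
# `--extensions openai`; adjust the URL, key, and model for your setup.
embeddings = OpenAIEmbeddings(
    model="sentence-transformers/all-mpnet-base-v2",  # any HF-tokenizable model
    openai_api_base="http://localhost:5001/v1",
    openai_api_key="dummy-key",  # the local server does not check the key
    tiktoken_enabled=False,  # use transformers.AutoTokenizer, not tiktoken
)

vectors = embeddings.embed_documents(["first document", "second document"])
print(len(vectors), len(vectors[0]))
```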
Commit 2780d2d4dd (parent 9b59bde93d) by Chelsea E. Manning,
2023-12-03 15:04:17 -05:00, committed via GitHub.

@@ -210,6 +210,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or
         None."""
     headers: Any = None
+    tiktoken_enabled: bool = True
+    """Set this to False for non-OpenAI implementations of the embeddings API, e.g.
+    the `--extensions openai` extension for `text-generation-webui`"""
     tiktoken_model_name: Optional[str] = None
     """The model name to pass to tiktoken when using this class.
     Tiktoken is used to count the number of tokens in documents to constrain
@@ -382,42 +385,87 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both `tiktoken`
+        and HuggingFace tokenizers, based on the `tiktoken_enabled` flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
-        batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
+
+        # If the tiktoken flag is set to False, use a HuggingFace tokenizer instead
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order for OpenAIEmbeddings to work without "
+                    "`tiktoken`. Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
+            for i, text in enumerate(texts):
+                # Tokenize the text using the HuggingFace transformers tokenizer
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order for OpenAIEmbeddings to work. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
+
         if self.show_progress_bar:
             try:
                 from tqdm.auto import tqdm
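
The non-`tiktoken` branch above reduces to: encode with a HuggingFace
tokenizer, slice the token IDs into windows of at most
`embedding_ctx_length`, and decode each window back into a string the
server can re-tokenize itself. A self-contained sketch of that chunking
step (the `chunk_text` helper, the `gpt2` model, and the tiny context
length are illustrative assumptions, not part of this diff):

```python
from transformers import AutoTokenizer

def chunk_text(text: str, model_name: str = "gpt2", ctx_length: int = 8) -> list[str]:
    """Split text into decodable string chunks of at most ctx_length tokens."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(token_ids[j : j + ctx_length])
        for j in range(0, len(token_ids), ctx_length)
    ]

chunks = chunk_text("The quick brown fox jumps over the lazy dog near the river.")
print(chunks)  # two short string chunks, each covering at most 8 GPT-2 tokens
```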
@@ -428,6 +476,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
         else:
             _iter = range(0, len(tokens), _chunk_size)
 
+        batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = embed_with_retry(
                 self,
@@ -446,6 +495,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             results[indices[i]].append(batched_embeddings[i])
             num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
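
Once the batched responses come back, chunk embeddings are regrouped by
`indices` and collapsed into one vector per input text; `OpenAIEmbeddings`
does this with a token-count-weighted average followed by L2
normalization. A simplified sketch of that recombination step (the
helper name and toy vectors are illustrative, not code from this diff):

```python
import numpy as np

def combine_chunk_embeddings(
    chunk_embeddings: list[list[float]],  # one embedding vector per chunk
    chunk_token_counts: list[int],        # tokens contributed by each chunk
) -> list[float]:
    # Weight each chunk by the number of tokens it covered, then L2-normalize.
    average = np.average(chunk_embeddings, axis=0, weights=chunk_token_counts)
    return (average / np.linalg.norm(average)).tolist()

# Toy example: an 8-token chunk and a 2-token chunk of the same document.
print(combine_chunk_embeddings([[1.0, 0.0], [0.0, 1.0]], [8, 2]))
# -> [0.970..., 0.242...]
```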
@@ -468,38 +518,86 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     async def _aget_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Asynchronously generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and asynchronous embedding generation,
+        respecting the set embedding context length and chunk size. It supports
+        both `tiktoken` and HuggingFace tokenizers, based on the
+        `tiktoken_enabled` flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
+        _chunk_size = chunk_size or self.chunk_size
+
+        # If the tiktoken flag is set to False, use a HuggingFace tokenizer instead
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order for OpenAIEmbeddings to work without "
+                    "`tiktoken`. Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
+            for i, text in enumerate(texts):
+                # Tokenize the text using the HuggingFace transformers tokenizer
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order for OpenAIEmbeddings to work. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
+
         batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
@@ -520,6 +618,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             results[indices[i]].append(batched_embeddings[i])
             num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
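
The async path is exercised the same way as the sync one; a minimal
sketch of calling it end to end (the local endpoint and dummy key are
hypothetical, as in the earlier example):

```python
import asyncio

from langchain.embeddings import OpenAIEmbeddings

async def main() -> None:
    # Hypothetical local OpenAI-compatible server, as in the earlier sketch.
    embeddings = OpenAIEmbeddings(
        openai_api_base="http://localhost:5001/v1",
        openai_api_key="dummy-key",
        tiktoken_enabled=False,
    )
    vectors = await embeddings.aembed_documents(["an async embedding example"])
    print(len(vectors), len(vectors[0]))

asyncio.run(main())
```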