openai[patch]: fix embedding float precision issue (#21736)
also clean up + comment some of the embedding batching code
This commit is contained in:
parent 38c297a025
commit e41d801369
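Reading the diff, the precision fix seems to come from the new single-chunk branch in `_process_batched_chunked_embeddings`: when a text produced exactly one chunk, its embedding is now returned untouched instead of being re-normalized. A minimal sketch of that difference, with made-up numbers rather than real API output:

    # Toy numbers only; real embeddings come from the OpenAI API.
    emb = [0.6, 0.8]  # embedding of a text that fits in a single chunk

    # Old behaviour: even a lone chunk went through the weighted-average and
    # re-normalization path, which can perturb the floats slightly.
    magnitude = sum(v**2 for v in emb) ** 0.5
    old_result = [v / magnitude for v in emb]

    # New behaviour: a single chunk is passed through exactly as returned.
    new_result = emb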
@@ -37,6 +37,67 @@ from langchain_core.utils import (
 
 logger = logging.getLogger(__name__)
 
 
+def _process_batched_chunked_embeddings(
+    num_texts: int,
+    tokens: List[Union[List[int], str]],
+    batched_embeddings: List[List[float]],
+    indices: List[int],
+    skip_empty: bool,
+) -> List[Optional[List[float]]]:
+    # for each text, this is the list of embeddings (list of list of floats)
+    # corresponding to the chunks of the text
+    results: List[List[List[float]]] = [[] for _ in range(num_texts)]
+
+    # for each text, this is the token length of each chunk
+    # for transformers tokenization, this is the string length
+    # for tiktoken, this is the number of tokens
+    num_tokens_in_batch: List[List[int]] = [[] for _ in range(num_texts)]
+
+    for i in range(len(indices)):
+        if skip_empty and len(batched_embeddings[i]) == 1:
+            continue
+        results[indices[i]].append(batched_embeddings[i])
+        num_tokens_in_batch[indices[i]].append(len(tokens[i]))
+
+    # for each text, this is the final embedding
+    embeddings: List[Optional[List[float]]] = []
+    for i in range(num_texts):
+        # an embedding for each chunk
+        _result: List[List[float]] = results[i]
+
+        if len(_result) == 0:
+            # this will be populated with the embedding of an empty string
+            # in the sync or async code calling this
+            embeddings.append(None)
+            continue
+
+        elif len(_result) == 1:
+            # if only one embedding was produced, use it
+            embeddings.append(_result[0])
+            continue
+
+        else:
+            # else we need to weighted average
+            # should be same as
+            # average = np.average(_result, axis=0, weights=num_tokens_in_batch[i])
+            total_weight = sum(num_tokens_in_batch[i])
+            average = [
+                sum(
+                    val * weight
+                    for val, weight in zip(embedding, num_tokens_in_batch[i])
+                )
+                / total_weight
+                for embedding in zip(*_result)
+            ]
+
+            # should be same as
+            # embeddings.append((average / np.linalg.norm(average)).tolist())
+            magnitude = sum(val**2 for val in average) ** 0.5
+            embeddings.append([val / magnitude for val in average])
+
+    return embeddings
+
+
 class OpenAIEmbeddings(BaseModel, Embeddings):
     """OpenAI embedding models.
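The comments inside the helper claim the pure-Python math matches the numpy version it replaces. A small self-contained check with hypothetical chunk embeddings and token counts (numpy is used only to verify; the helper itself does not need it):

    import numpy as np

    _result = [[1.0, 0.0], [0.0, 1.0]]  # embeddings of two chunks of one text
    weights = [3, 1]                    # token counts of those chunks

    # Pure-Python weighted average over each dimension, then L2 normalization,
    # mirroring the loop in _process_batched_chunked_embeddings.
    total_weight = sum(weights)
    average = [
        sum(val * w for val, w in zip(dim, weights)) / total_weight
        for dim in zip(*_result)
    ]
    magnitude = sum(v**2 for v in average) ** 0.5
    pure_python = [v / magnitude for v in average]

    # The numpy formulation referenced in the comments.
    np_average = np.average(_result, axis=0, weights=weights)
    np_result = (np_average / np.linalg.norm(np_average)).tolist()

    assert all(abs(a - b) < 1e-12 for a, b in zip(pure_python, np_result))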
@@ -248,9 +309,29 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
 
     def _tokenize(
         self, texts: List[str], chunk_size: int
-    ) -> Tuple[Iterable[int], List[List[float]], List[int]]:
-        tokens = []
-        indices = []
+    ) -> Tuple[Iterable[int], List[Union[List[int], str]], List[int]]:
+        """
+        Take the input `texts` and `chunk_size` and return 3 iterables as a tuple:
+
+        We have `batches`, where batches are sets of individual texts
+        we want responses from the openai api. The length of a single batch is
+        `chunk_size` texts.
+
+        Each individual text is also split into multiple texts based on the
+        `embedding_ctx_length` parameter (based on number of tokens).
+
+        This function returns a 3-tuple of the following:
+
+        _iter: An iterable of the starting index in `tokens` for each *batch*
+        tokens: A list of tokenized texts, where each text has already been split
+            into sub-texts based on the `embedding_ctx_length` parameter. In the
+            case of tiktoken, this is a list of token arrays. In the case of
+            HuggingFace transformers, this is a list of strings.
+        indices: An iterable of the same length as `tokens` that maps each token-array
+            to the index of the original text in `texts`.
+        """
+        tokens: List[Union[List[int], str]] = []
+        indices: List[int] = []
         model_name = self.tiktoken_model_name or self.model
 
         # If tiktoken flag set to False
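A made-up illustration of how the three documented return values relate on the tiktoken path; the token IDs, chunk boundaries, and chunk_size of 2 are invented for the example:

    # Two input texts; the second is long enough to be split into two chunks.
    tokens = [[101, 102], [103, 104, 105], [106, 107]]  # token arrays (tiktoken)
    indices = [0, 1, 1]  # the last two chunks both came from texts[1]
    # With chunk_size=2 texts per API request, the batch start offsets would be:
    _iter = range(0, len(tokens), 2)  # -> 0, 2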
@@ -269,14 +350,16 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             )
             for i, text in enumerate(texts):
                 # Tokenize the text using HuggingFace transformers
-                tokenized = tokenizer.encode(text, add_special_tokens=False)
+                tokenized: List[int] = tokenizer.encode(text, add_special_tokens=False)
                 # Split tokens into chunks respecting the embedding_ctx_length
                 for j in range(0, len(tokenized), self.embedding_ctx_length):
-                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+                    token_chunk: List[int] = tokenized[
+                        j : j + self.embedding_ctx_length
+                    ]
                     # Convert token IDs back to a string
-                    chunk_text = tokenizer.decode(token_chunk)
+                    chunk_text: str = tokenizer.decode(token_chunk)
                     tokens.append(chunk_text)
                     indices.append(i)
         else:
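The loop above slices the token-ID list into windows of `embedding_ctx_length`; a standalone sketch of that slicing with toy values:

    tokenized = list(range(10))  # pretend token IDs for one text
    embedding_ctx_length = 4
    chunks = [
        tokenized[j : j + embedding_ctx_length]
        for j in range(0, len(tokenized), embedding_ctx_length)
    ]
    # -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]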
@@ -351,43 +434,23 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                 response = response.model_dump()
             batched_embeddings.extend(r["embedding"] for r in response["data"])
 
-        results: List[List[List[float]]] = [[] for _ in range(len(texts))]
-        num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))]
-        for i in range(len(indices)):
-            if self.skip_empty and len(batched_embeddings[i]) == 1:
-                continue
-            results[indices[i]].append(batched_embeddings[i])
-            num_tokens_in_batch[indices[i]].append(len(tokens[i]))
-
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        for i in range(len(texts)):
-            _result = results[i]
-            if len(_result) == 0:
+        embeddings = _process_batched_chunked_embeddings(
+            len(texts), tokens, batched_embeddings, indices, self.skip_empty
+        )
+        _cached_empty_embedding: Optional[List[float]] = None
+
+        def empty_embedding() -> List[float]:
+            nonlocal _cached_empty_embedding
+            if _cached_empty_embedding is None:
                 average_embedded = self.client.create(
                     input="", **self._invocation_params
                 )
                 if not isinstance(average_embedded, dict):
                     average_embedded = average_embedded.model_dump()
-                average = average_embedded["data"][0]["embedding"]
-            else:
-                # should be same as
-                # average = np.average(_result, axis=0, weights=num_tokens_in_batch[i])
-                total_weight = sum(num_tokens_in_batch[i])
-                average = [
-                    sum(
-                        val * weight
-                        for val, weight in zip(embedding, num_tokens_in_batch[i])
-                    )
-                    / total_weight
-                    for embedding in zip(*_result)
-                ]
-
-            # should be same as
-            # embeddings[i] = (average / np.linalg.norm(average)).tolist()
-            magnitude = sum(val**2 for val in average) ** 0.5
-            embeddings[i] = [val / magnitude for val in average]
-
-        return embeddings
+                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
+            return _cached_empty_embedding
+
+        return [e if e is not None else empty_embedding() for e in embeddings]
 
     # please refer to
     # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
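The empty-input fallback introduced here is a small memoizing closure: the embedding of the empty string is fetched from the API at most once and reused for every text that produced no chunks. A standalone toy version of the pattern, where `call_api` stands in for `self.client.create(input="", **self._invocation_params)`:

    from typing import Callable, List, Optional

    def fill_missing(
        embeddings: List[Optional[List[float]]],
        call_api: Callable[[], List[float]],
    ) -> List[List[float]]:
        cached: Optional[List[float]] = None

        def empty_embedding() -> List[float]:
            nonlocal cached
            if cached is None:
                cached = call_api()  # hit the API at most once
            return cached

        # None entries (texts with no chunks) are replaced by the cached fallback.
        return [e if e is not None else empty_embedding() for e in embeddings]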
@@ -423,40 +486,23 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                 response = response.model_dump()
             batched_embeddings.extend(r["embedding"] for r in response["data"])
 
-        results: List[List[List[float]]] = [[] for _ in range(len(texts))]
-        num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))]
-        for i in range(len(indices)):
-            results[indices[i]].append(batched_embeddings[i])
-            num_tokens_in_batch[indices[i]].append(len(tokens[i]))
-
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        for i in range(len(texts)):
-            _result = results[i]
-            if len(_result) == 0:
+        embeddings = _process_batched_chunked_embeddings(
+            len(texts), tokens, batched_embeddings, indices, self.skip_empty
+        )
+        _cached_empty_embedding: Optional[List[float]] = None
+
+        async def empty_embedding() -> List[float]:
+            nonlocal _cached_empty_embedding
+            if _cached_empty_embedding is None:
                 average_embedded = await self.async_client.create(
                     input="", **self._invocation_params
                 )
                 if not isinstance(average_embedded, dict):
                     average_embedded = average_embedded.model_dump()
-                average = average_embedded["data"][0]["embedding"]
-            else:
-                # should be same as
-                # average = np.average(_result, axis=0, weights=num_tokens_in_batch[i])
-                total_weight = sum(num_tokens_in_batch[i])
-                average = [
-                    sum(
-                        val * weight
-                        for val, weight in zip(embedding, num_tokens_in_batch[i])
-                    )
-                    / total_weight
-                    for embedding in zip(*_result)
-                ]
-            # should be same as
-            # embeddings[i] = (average / np.linalg.norm(average)).tolist()
-            magnitude = sum(val**2 for val in average) ** 0.5
-            embeddings[i] = [val / magnitude for val in average]
-
-        return embeddings
+                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
+            return _cached_empty_embedding
+
+        return [e if e is not None else await empty_embedding() for e in embeddings]
 
     def embed_documents(
         self, texts: List[str], chunk_size: Optional[int] = 0
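For context, a hedged usage sketch of the public entry point these changes sit behind; the model name is illustrative and an `OPENAI_API_KEY` must be set in the environment:

    from langchain_openai import OpenAIEmbeddings

    embedder = OpenAIEmbeddings(model="text-embedding-3-small")
    # Long documents are tokenized, split into chunks, embedded per chunk, and
    # recombined by the batching code changed in this PR.
    vectors = embedder.embed_documents(["short text", "a much longer document ..."])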