@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
import logging
import os
import warnings
@@ -15,13 +16,13 @@ from typing import (
    Set,
    Tuple,
    Union,
    cast,
)

import openai
import tiktoken
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, Field, SecretStr, root_validator
from langchain_core.runnables.utils import gather_with_concurrency
from langchain_core.utils import (
    convert_to_secret_str,
    get_from_dict_or_env,
@@ -31,65 +32,43 @@ from langchain_core.utils import (

logger = logging.getLogger(__name__)

def _process_batched_chunked_embeddings(
def _process_split_embeddings(
    num_texts: int,
    tokens: List[Union[List[int], str]],
    batched_embeddings: List[List[float]],
    indices: List[int],
    split_tokens: List[Union[List[int], str]],
    split_embeddings: List[List[float]],
    split_embeddings_text_indices: List[int],
    skip_empty: bool,
) -> List[Optional[List[float]]]:
    # for each text, this is the list of embeddings (list of list of floats)
    # corresponding to the chunks of the text
    results: List[List[List[float]]] = [[] for _ in range(num_texts)]
    split_embeddings_by_text: List[List[List[float]]] = [[] for _ in range(num_texts)]
    split_num_tokens_by_text: List[List[int]] = [[] for _ in range(num_texts)]

    # for each text, this is the token length of each chunk
    # for transformers tokenization, this is the string length
    # for tiktoken, this is the number of tokens
    num_tokens_in_batch: List[List[int]] = [[] for _ in range(num_texts)]

    for i in range(len(indices)):
        if skip_empty and len(batched_embeddings[i]) == 1:
    for text_idx, embeddings, tokens in zip(
        split_embeddings_text_indices, split_embeddings, split_tokens
    ):
        if skip_empty and len(embeddings) == 1:
            continue
        results[indices[i]].append(batched_embeddings[i])
        num_tokens_in_batch[indices[i]].append(len(tokens[i]))
        split_embeddings_by_text[text_idx].append(embeddings)
        split_num_tokens_by_text[text_idx].append(len(tokens))

    # for each text, this is the final embedding
    embeddings: List[Optional[List[float]]] = []
    for i in range(num_texts):
        # an embedding for each chunk
        _result: List[List[float]] = results[i]

        if len(_result) == 0:
    averaged_embeddings: List[Optional[List[float]]] = []
    for text_idx, (curr_split_embeddings, curr_split_num_tokens) in enumerate(
        zip(split_embeddings_by_text, split_num_tokens_by_text)
    ):
        if len(curr_split_embeddings) == 0:
            # this will be populated with the embedding of an empty string
            # in the sync or async code calling this
            embeddings.append(None)
            continue
        elif len(_result) == 1:
            averaged_embeddings.append(None)
        elif len(curr_split_embeddings) == 1:
            # if only one embedding was produced, use it
            embeddings.append(_result[0])
            continue
            averaged_embeddings.append(curr_split_embeddings[0])
        else:
            # else we need to weighted average
            # should be same as
            # average = np.average(_result, axis=0, weights=num_tokens_in_batch[i])
            total_weight = sum(num_tokens_in_batch[i])
            average = [
                sum(
                    val * weight
                    for val, weight in zip(embedding, num_tokens_in_batch[i])
                )
                / total_weight
                for embedding in zip(*_result)
            ]
            # else we need to take a weighted average
            averaged_embeddings.append(
                _normed_vector_avg(curr_split_embeddings, curr_split_num_tokens)
            )
            # should be same as
            # embeddings.append((average / np.linalg.norm(average)).tolist())
            magnitude = sum(val**2 for val in average) ** 0.5
            embeddings.append([val / magnitude for val in average])

    return embeddings
    return averaged_embeddings
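
# Illustrative sketch of the new helper above; token IDs and vector values are
# made up, and the call is shown only to make the per-text regrouping concrete:
#
#     split_tokens  = [[1, 2], [3], [7, 8, 9]]
#     text_indices  = [0, 0, 1]            # first two chunks came from text 0
#     chunk_vectors = [[1.0, 0.0], [0.0, 1.0], [0.6, 0.8]]
#
#     _process_split_embeddings(2, split_tokens, chunk_vectors, text_indices, False)
#     # text 0 -> token-weighted (2:1) average of its two chunk vectors,
#     #          re-normalized: roughly [0.894, 0.447]
#     # text 1 -> its single chunk vector [0.6, 0.8] is used as-is
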
class OpenAIEmbeddings(BaseModel, Embeddings):
@@ -119,7 +98,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
        max_retries: int = 2
            Maximum number of retries to make when generating.
        request_timeout: Optional[Union[float, Tuple[float, float], Any]] = None
            Timeout for requests to OpenAI completion API
            Timeout for requests to OpenAI completion API.
        max_concurrency: Optional[int] = None
            Maximum number of coroutines to run concurrently. Only used for
            ``aembed_documents()``.

    See full list of supported init args and their descriptions in the params section.
@@ -251,6 +233,11 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
    check_embedding_ctx_length: bool = True
    """Whether to check the token length of inputs and automatically split inputs
        longer than embedding_ctx_length."""
    max_concurrency: Optional[int] = None
    """Maximum number of coroutines to run concurrently.

    Only used for ``aembed_documents()``.
    """

    class Config:
        """Configuration for this pydantic object."""
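
    # A rough usage sketch for the new ``max_concurrency`` option; the model
    # name and the value 5 are placeholders, and the call assumes an async
    # context:
    #
    #     embedder = OpenAIEmbeddings(model="text-embedding-3-small", max_concurrency=5)
    #     vectors = await embedder.aembed_documents(["first doc", "second doc"])
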
@@ -380,106 +367,106 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
            params["dimensions"] = self.dimensions
        return params

    def _tokenize(
        self, texts: List[str], chunk_size: int
    ) -> Tuple[Iterable[int], List[Union[List[int], str]], List[int]]:
    def _tokenize_and_split(
        self, texts: List[str]
    ) -> Tuple[List[Union[List[int], str]], List[int]]:
        """
        Take the input `texts` and `chunk_size` and return 3 iterables as a tuple:

        We have `batches`, where batches are sets of individual texts
        we want responses from the openai api. The length of a single batch is
        `chunk_size` texts.
        Tokenize and split the input texts to be shorter than max ctx length.

        Each individual text is also split into multiple texts based on the
        `embedding_ctx_length` parameter (based on number of tokens).

        This function returns a 3-tuple of the following:

        _iter: An iterable of the starting index in `tokens` for each *batch*
        tokens: A list of tokenized texts, where each text has already been split
            into sub-texts based on the `embedding_ctx_length` parameter. In the
            case of tiktoken, this is a list of token arrays. In the case of
            HuggingFace transformers, this is a list of strings.
        indices: An iterable of the same length as `tokens` that maps each token-array
            to the index of the original text in `texts`.
        Returns:
            This function returns a 2-tuple of the following:

            split_tokens: A list of tokenized texts, where each text has already
                been split into sub-texts based on the `embedding_ctx_length`
                parameter. In the case of tiktoken, this is a list of token arrays.
                In the case of HuggingFace transformers, this is a list of strings.
            indices: An iterable of the same length as `split_tokens` that maps
                each token array to the index of the original text in `texts`.
        """
        # If tiktoken flag set to False
        if not self.tiktoken_enabled:
            return self._transformers_tokenize_and_split(texts)
        else:
            return self._tiktoken_tokenize_and_split(texts)
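
    # Sketch of what ``_tokenize_and_split`` returns, with hypothetical token
    # IDs and an ``embedding_ctx_length`` of 3 chosen only for brevity: a text
    # longer than the context length contributes several splits, all mapping
    # back to its position in ``texts``.
    #
    #     texts        = ["a fairly long text", "short"]
    #     split_tokens = [[11, 12, 13], [14, 15], [21, 22]]
    #     indices      = [0, 0, 1]
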
    def _transformers_tokenize_and_split(
        self, texts: List[str]
    ) -> Tuple[List[Union[List[int], str]], List[int]]:
        tokens: List[Union[List[int], str]] = []
        indices: List[int] = []
        model_name = self.tiktoken_model_name or self.model

        # If tiktoken flag set to False
        if not self.tiktoken_enabled:
            try:
                from transformers import AutoTokenizer
            except ImportError:
                raise ValueError(
                    "Could not import transformers python package. "
                    "This is needed for OpenAIEmbeddings to work without "
                    "`tiktoken`. Please install it with `pip install transformers`. "
                )
            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path=model_name
        try:
            from transformers import AutoTokenizer
        except ImportError:
            raise ValueError(
                "Could not import transformers python package. "
                "This is needed for OpenAIEmbeddings to work without "
                "`tiktoken`. Please install it with `pip install transformers`. "
            )
            for i, text in enumerate(texts):
                # Tokenize the text using HuggingFace transformers
                tokenized: List[int] = tokenizer.encode(text, add_special_tokens=False)

                # Split tokens into chunks respecting the embedding_ctx_length
                for j in range(0, len(tokenized), self.embedding_ctx_length):
                    token_chunk: List[int] = tokenized[
                        j : j + self.embedding_ctx_length
                    ]
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_name
        )
        for i, text in enumerate(texts):
            # Tokenize the text using HuggingFace transformers
            tokenized: List[int] = tokenizer.encode(text, add_special_tokens=False)

                    # Convert token IDs back to a string
                    chunk_text: str = tokenizer.decode(token_chunk)
                    tokens.append(chunk_text)
                    indices.append(i)
        else:
            try:
                encoding = tiktoken.encoding_for_model(model_name)
            except KeyError:
                encoding = tiktoken.get_encoding("cl100k_base")
            encoder_kwargs: Dict[str, Any] = {
                k: v
                for k, v in {
                    "allowed_special": self.allowed_special,
                    "disallowed_special": self.disallowed_special,
                }.items()
                if v is not None
            }
            for i, text in enumerate(texts):
                if self.model.endswith("001"):
                    # See: https://github.com/openai/openai-python/
                    #      issues/418#issuecomment-1525939500
                    # replace newlines, which can negatively affect performance.
                    text = text.replace("\n", " ")
            # Split tokens into chunks respecting the embedding_ctx_length
            for j in range(0, len(tokenized), self.embedding_ctx_length):
                token_chunk: List[int] = tokenized[j : j + self.embedding_ctx_length]

                if encoder_kwargs:
                    token = encoding.encode(text, **encoder_kwargs)
                else:
                    token = encoding.encode_ordinary(text)
                # Convert token IDs back to a string
                chunk_text: str = tokenizer.decode(token_chunk)
                tokens.append(chunk_text)
                indices.append(i)
        return tokens, indices

                # Split tokens into chunks respecting the embedding_ctx_length
                for j in range(0, len(token), self.embedding_ctx_length):
                    tokens.append(token[j : j + self.embedding_ctx_length])
                    indices.append(i)

    def _tiktoken_tokenize_and_split(
        self, texts: List[str]
    ) -> Tuple[List[Union[List[int], str]], List[int]]:
        tokens: List[Union[List[int], str]] = []
        indices: List[int] = []
        model_name = self.tiktoken_model_name or self.model
        try:
            encoding = tiktoken.encoding_for_model(model_name)
        except KeyError:
            encoding = tiktoken.get_encoding("cl100k_base")
        encoder_kwargs: Dict = {
            "allowed_special": self.allowed_special,
            "disallowed_special": self.disallowed_special,
        }
        encoder_kwargs = {k: v for k, v in encoder_kwargs.items() if v is not None}
        for i, text in enumerate(texts):
            if self.model.endswith("001"):
                # See: https://github.com/openai/openai-python/
                #      issues/418#issuecomment-1525939500
                # replace newlines, which can negatively affect performance.
                text = text.replace("\n", " ")

        if self.show_progress_bar:
            try:
                from tqdm.auto import tqdm
            if encoder_kwargs:
                token = encoding.encode(text, **encoder_kwargs)
            else:
                token = encoding.encode_ordinary(text)

                _iter: Iterable = tqdm(range(0, len(tokens), chunk_size))
            except ImportError:
                _iter = range(0, len(tokens), chunk_size)
        else:
            _iter = range(0, len(tokens), chunk_size)
        return _iter, tokens, indices
            # Split tokens into chunks respecting the embedding_ctx_length
            for j in range(0, len(token), self.embedding_ctx_length):
                tokens.append(token[j : j + self.embedding_ctx_length])
                indices.append(i)

    # please refer to
        return tokens, indices
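
    # Standalone sketch of the chunking pattern used above. "cl100k_base" is
    # the same fallback encoding the code itself uses; 8191 matches the class
    # default for ``embedding_ctx_length``.
    #
    #     import tiktoken
    #     enc = tiktoken.get_encoding("cl100k_base")
    #     ids = enc.encode_ordinary("some very long document ...")
    #     chunks = [ids[j : j + 8191] for j in range(0, len(ids), 8191)]
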
    # Inspired by
    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
    def _get_len_safe_embeddings(
        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
        self,
        num_texts: int,
        split_tokens: List[Union[List[int], str]],
        split_to_text_indices: List[int],
        *,
        chunk_size: Optional[int] = None,
    ) -> List[List[float]]:
        """
        Generate length-safe embeddings for a list of texts.
@@ -489,96 +476,86 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
        and HuggingFace tokenizer based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.
            num_texts: Number of original texts.
            split_tokens: Tokenized splits of the texts.
            split_to_text_indices: Index of the original text that each token split
                corresponds to.
            chunk_size: Maximum number of texts to embed in each batch.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        """
        _chunk_size = chunk_size or self.chunk_size
        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
        batched_embeddings: List[List[float]] = []
        for i in _iter:
            response = self.client.create(
                input=tokens[i : i + _chunk_size], **self._invocation_params
            )
            if not isinstance(response, dict):
                response = response.model_dump()
            batched_embeddings.extend(r["embedding"] for r in response["data"])

        embeddings = _process_batched_chunked_embeddings(
            len(texts), tokens, batched_embeddings, indices, self.skip_empty
        split_embeddings = self._get_embeddings(split_tokens, chunk_size=chunk_size)
        averaged_embeddings = _process_split_embeddings(
            num_texts,
            split_tokens,
            split_embeddings,
            split_to_text_indices,
            self.skip_empty,
        )
        _cached_empty_embedding: Optional[List[float]] = None

        def empty_embedding() -> List[float]:
            nonlocal _cached_empty_embedding
            if _cached_empty_embedding is None:
                average_embedded = self.client.create(
                    input="", **self._invocation_params
                )
                if not isinstance(average_embedded, dict):
                    average_embedded = average_embedded.model_dump()
                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
                _cached_empty_embedding = self._get_embeddings([""])[0]
            return _cached_empty_embedding

        return [e if e is not None else empty_embedding() for e in embeddings]
        return [e if e is not None else empty_embedding() for e in averaged_embeddings]
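
    # For reference (illustrative, not from the diff), the call sequence that
    # replaces the old ``engine``/``_tokenize`` plumbing; it mirrors what the
    # updated ``embed_documents`` below does:
    #
    #     split_tokens, idx = self._tokenize_and_split(texts)
    #     vectors = self._get_len_safe_embeddings(len(texts), split_tokens, idx)
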
    # please refer to
    # Inspired by
    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
    async def _aget_len_safe_embeddings(
        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
        self,
        num_texts: int,
        split_tokens: List[Union[List[int], str]],
        split_to_text_indices: List[int],
        *,
        chunk_size: Optional[int] = None,
        max_concurrency: Optional[int] = None,
    ) -> List[List[float]]:
        """
        Asynchronously generate length-safe embeddings for a list of texts.
        Generate length-safe embeddings for a list of texts.

        This method handles tokenization and asynchronous embedding generation,
        respecting the set embedding context length and chunk size. It supports both
        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.
        This method handles tokenization and embedding generation, respecting the
        set embedding context length and chunk size. It supports both tiktoken
        and HuggingFace tokenizer based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.
            num_texts: Number of original texts.
            split_tokens: Tokenized splits of the texts.
            split_to_text_indices: Index of the original text that each token split
                corresponds to.
            chunk_size: Maximum number of texts to embed in each batch.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        """
        _chunk_size = chunk_size or self.chunk_size
        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
        batched_embeddings: List[List[float]] = []
        _chunk_size = chunk_size or self.chunk_size
        for i in range(0, len(tokens), _chunk_size):
            response = await self.async_client.create(
                input=tokens[i : i + _chunk_size], **self._invocation_params
            )

            if not isinstance(response, dict):
                response = response.model_dump()
            batched_embeddings.extend(r["embedding"] for r in response["data"])

        embeddings = _process_batched_chunked_embeddings(
            len(texts), tokens, batched_embeddings, indices, self.skip_empty
        split_embeddings = await self._aget_embeddings(
            split_tokens, chunk_size=chunk_size, max_concurrency=max_concurrency
        )
        averaged_embeddings = _process_split_embeddings(
            num_texts,
            split_tokens,
            split_embeddings,
            split_to_text_indices,
            self.skip_empty,
        )
        _cached_empty_embedding: Optional[List[float]] = None

        async def empty_embedding() -> List[float]:
            nonlocal _cached_empty_embedding
            if _cached_empty_embedding is None:
                average_embedded = await self.async_client.create(
                    input="", **self._invocation_params
                )
                if not isinstance(average_embedded, dict):
                    average_embedded = average_embedded.model_dump()
                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
                _cached_empty_embedding = (await self._aget_embeddings([""]))[0]
            return _cached_empty_embedding

        return [e if e is not None else await empty_embedding() for e in embeddings]
        return [
            e if e is not None else (await empty_embedding())
            for e in averaged_embeddings
        ]
    def embed_documents(
        self, texts: List[str], chunk_size: Optional[int] = 0
        self, texts: List[str], chunk_size: Optional[int] = None
    ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint for embedding search docs.
@@ -591,21 +568,23 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
            List of embeddings, one for each text.
        """
        if not self.check_embedding_ctx_length:
            embeddings: List[List[float]] = []
            for text in texts:
                response = self.client.create(input=text, **self._invocation_params)
                if not isinstance(response, dict):
                    response = response.dict()
                embeddings.extend(r["embedding"] for r in response["data"])
            return embeddings
            return self._get_embeddings(texts, chunk_size=chunk_size)

        # NOTE: to keep things simple, we assume the list may contain texts longer
        # than the maximum context and use length-safe embedding function.
        engine = cast(str, self.deployment)
        return self._get_len_safe_embeddings(texts, engine=engine)
        num_texts = len(texts)
        split_tokens, embedding_to_text_indices = self._tokenize_and_split(texts)
        if len(split_tokens) == num_texts:
            return self._get_embeddings(texts, chunk_size=chunk_size)

        return self._get_len_safe_embeddings(
            num_texts, split_tokens, embedding_to_text_indices, chunk_size=chunk_size
        )
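
    # Note on the dispatch above (not from the diff itself): when every input
    # is non-empty and already fits within ``embedding_ctx_length``,
    # ``_tokenize_and_split`` yields exactly one split per text, so
    # ``len(split_tokens) == num_texts`` holds and the cheaper, non-averaging
    # ``_get_embeddings`` path is taken.
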
    async def aembed_documents(
        self, texts: List[str], chunk_size: Optional[int] = 0
        self,
        texts: List[str],
        chunk_size: Optional[int] = None,
        *,
        max_concurrency: Optional[int] = None,
    ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint async for embedding search docs.
@@ -617,21 +596,28 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
        Returns:
            List of embeddings, one for each text.
        """
        max_concurrency = (
            max_concurrency if max_concurrency is not None else self.max_concurrency
        )
        if not self.check_embedding_ctx_length:
            embeddings: List[List[float]] = []
            for text in texts:
                response = await self.async_client.create(
                    input=text, **self._invocation_params
                )
                if not isinstance(response, dict):
                    response = response.dict()
                embeddings.extend(r["embedding"] for r in response["data"])
            return embeddings
            return await self._aget_embeddings(
                texts, chunk_size=chunk_size, max_concurrency=max_concurrency
            )

        # NOTE: to keep things simple, we assume the list may contain texts longer
        # than the maximum context and use length-safe embedding function.
        engine = cast(str, self.deployment)
        return await self._aget_len_safe_embeddings(texts, engine=engine)
        num_texts = len(texts)
        split_tokens, embedding_to_text_indices = self._tokenize_and_split(texts)
        if len(split_tokens) == num_texts:
            return await self._aget_embeddings(
                texts, chunk_size=chunk_size, max_concurrency=max_concurrency
            )

        return await self._aget_len_safe_embeddings(
            num_texts,
            split_tokens,
            embedding_to_text_indices,
            chunk_size=chunk_size,
            max_concurrency=max_concurrency,
        )
    def embed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint for embedding query text.
@@ -655,3 +641,75 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
        """
        embeddings = await self.aembed_documents([text])
        return embeddings[0]
    def _get_embeddings(
        self,
        input: Sequence[Union[List[int], str]],
        *,
        chunk_size: Optional[int] = None,
    ) -> List[List[float]]:
        embeddings: List[List[float]] = []
        chunk_size = chunk_size or self.chunk_size
        _iter: Iterable = range(0, len(input), chunk_size)
        if self.show_progress_bar:
            try:
                from tqdm.auto import tqdm
            except ImportError:
                pass
            else:
                _iter = tqdm(_iter)
        for chunk_start in _iter:
            chunk_end = chunk_start + chunk_size
            response = self.client.create(
                input=input[chunk_start:chunk_end], **self._invocation_params
            )
            if not isinstance(response, dict):
                if hasattr(response, "model_dump"):
                    response = response.model_dump()
                else:
                    response = response.dict()
            embeddings.extend(r["embedding"] for r in response["data"])
        return embeddings

    async def _aget_embeddings(
        self,
        input: Sequence[Union[List[int], str]],
        *,
        chunk_size: Optional[int] = None,
        max_concurrency: Optional[int] = None,
    ) -> List[List[float]]:
        chunk_size = chunk_size or self.chunk_size
        responses = await gather_with_concurrency(
            max_concurrency,
            *(
                self.async_client.create(
                    input=input[start : start + chunk_size], **self._invocation_params
                )
                for start in range(0, len(input), chunk_size)
            ),
        )
        embeddings: List = []
        for res in responses:
            if not isinstance(res, dict):
                res = res.model_dump() if hasattr(res, "model_dump") else res.dict()
            embeddings.extend(r["embedding"] for r in res["data"])
        return embeddings
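
    # Sketch of the concurrency model in ``_aget_embeddings`` above
    # (hypothetical names; ``batches`` stands for the chunked slices of
    # ``input``). It is roughly equivalent to a semaphore-bounded gather:
    #
    #     sem = asyncio.Semaphore(max_concurrency)
    #
    #     async def _one(batch):
    #         async with sem:
    #             return await self.async_client.create(
    #                 input=batch, **self._invocation_params
    #             )
    #
    #     responses = await asyncio.gather(*(_one(b) for b in batches))
    #
    # With ``max_concurrency=None`` the helper is expected to fall back to a
    # plain, unbounded ``asyncio.gather``.
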

def _normed_vector_avg(vectors: List[List[float]], weights: List[int]) -> List[float]:
    # should be same as
    # np.average(vectors, axis=0, weights=weights)
    total_weight = sum(weights)
    averaged = []
    for transposed_vec in zip(*vectors):
        avg_ = sum(v * w for v, w in zip(transposed_vec, weights)) / total_weight
        averaged.append(avg_)

    return _vector_norm(averaged)


def _vector_norm(vector: List[float]) -> List[float]:
    # should be same as
    # (np.array(vector) / np.linalg.norm(vector)).tolist()
    magnitude = sum(x**2 for x in vector) ** 0.5
    return [x / magnitude for x in vector]
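
# Quick numeric check for the two helpers above (values rounded):
#
#     _normed_vector_avg([[1.0, 0.0], [0.0, 1.0]], [2, 1])
#     # weighted average -> [0.667, 0.333]; after normalization -> [0.894, 0.447]
#
#     _vector_norm([3.0, 4.0])
#     # -> [0.6, 0.8]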