Extend OpenAIEmbeddings class to support non-tiktoken based embeddings (#13884)

- **Description:** This extends `OpenAIEmbeddings` to add support for
non-`tiktoken`-based embeddings, specifically for use with the new
`text-generation-webui` API (`--extensions openai`), which does not
accept `tiktoken` token encodings but expects plain strings instead.
- **Issue:** None found.
- **Dependencies:** HuggingFace’s `transformers.AutoTokenizer` is a new
dependency for running the model without `tiktoken`.
- **Tag maintainer:** @baskaryan, based on the last commit for the
`langchain-core` refactor.
- **Twitter handle:** @xychelsea

Modified the tokenization process to be model-agnostic, allowing for
both OpenAI and non-OpenAI tokenizations. The new `bool` flag
`tiktoken_enabled` defaults to `True`; setting it to `False` switches to
HuggingFace’s `AutoTokenizer`, which handles models whose preprocessing
differs from OpenAI’s and produces a chunked string request rather than
a list of integers.

Updated the embedding-generation process to accommodate non-OpenAI
models. This includes converting tokenized text into embeddings via
either OpenAI’s `tiktoken` path or Hugging Face’s tokenizer path.
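
For example, a minimal usage sketch of the new flag (the base URL, port,
key, and model name below are hypothetical placeholders for a local
`text-generation-webui` server, not values taken from this PR):

```python
from langchain.embeddings import OpenAIEmbeddings

# Hypothetical local text-generation-webui endpoint started with
# `--extensions openai`; adjust the URL, key, and model for your setup.
embeddings = OpenAIEmbeddings(
    model="sentence-transformers/all-mpnet-base-v2",  # any HF-tokenizable model
    openai_api_base="http://localhost:5001/v1",
    openai_api_key="dummy-key",  # the local server does not check the key
    tiktoken_enabled=False,  # use transformers.AutoTokenizer, not tiktoken
)

vectors = embeddings.embed_documents(["first document", "second document"])
print(len(vectors), len(vectors[0]))
```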
Commit 2780d2d4dd (parent 9b59bde93d) by Chelsea E. Manning,
2023-12-03 15:04:17 -05:00, committed via GitHub.

@@ -210,6 +210,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or
         None."""
     headers: Any = None
+    tiktoken_enabled: bool = True
+    """Set this to False for non-OpenAI implementations of the embeddings API, e.g.
+    the `--extensions openai` extension for `text-generation-webui`"""
     tiktoken_model_name: Optional[str] = None
     """The model name to pass to tiktoken when using this class.
     Tiktoken is used to count the number of tokens in documents to constrain
@@ -382,42 +385,87 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both `tiktoken`
+        and HuggingFace tokenizers, based on the `tiktoken_enabled` flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
-        batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
+
+        # If the tiktoken flag is set to False, use a HuggingFace tokenizer instead
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order for OpenAIEmbeddings to work without "
+                    "`tiktoken`. Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
+            for i, text in enumerate(texts):
+                # Tokenize the text using the HuggingFace transformers tokenizer
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order for OpenAIEmbeddings to work. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
+
         if self.show_progress_bar:
             try:
                 from tqdm.auto import tqdm
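
The non-`tiktoken` branch above reduces to: encode with a HuggingFace
tokenizer, slice the token IDs into windows of at most
`embedding_ctx_length`, and decode each window back into a string the
server can re-tokenize itself. A self-contained sketch of that chunking
step (the `chunk_text` helper, the `gpt2` model, and the tiny context
length are illustrative assumptions, not part of this diff):

```python
from transformers import AutoTokenizer

def chunk_text(text: str, model_name: str = "gpt2", ctx_length: int = 8) -> list[str]:
    """Split text into decodable string chunks of at most ctx_length tokens."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(token_ids[j : j + ctx_length])
        for j in range(0, len(token_ids), ctx_length)
    ]

chunks = chunk_text("The quick brown fox jumps over the lazy dog near the river.")
print(chunks)  # two short string chunks, each covering at most 8 GPT-2 tokens
```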
@@ -428,6 +476,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
         else:
             _iter = range(0, len(tokens), _chunk_size)
 
+        batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = embed_with_retry(
                 self,
@@ -446,6 +495,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             results[indices[i]].append(batched_embeddings[i])
             num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
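
Once the batched responses come back, chunk embeddings are regrouped by
`indices` and collapsed into one vector per input text; `OpenAIEmbeddings`
does this with a token-count-weighted average followed by L2
normalization. A simplified sketch of that recombination step (the
helper name and toy vectors are illustrative, not code from this diff):

```python
import numpy as np

def combine_chunk_embeddings(
    chunk_embeddings: list[list[float]],  # one embedding vector per chunk
    chunk_token_counts: list[int],        # tokens contributed by each chunk
) -> list[float]:
    # Weight each chunk by the number of tokens it covered, then L2-normalize.
    average = np.average(chunk_embeddings, axis=0, weights=chunk_token_counts)
    return (average / np.linalg.norm(average)).tolist()

# Toy example: an 8-token chunk and a 2-token chunk of the same document.
print(combine_chunk_embeddings([[1.0, 0.0], [0.0, 1.0]], [8, 2]))
# -> [0.970..., 0.242...]
```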
@@ -468,38 +518,86 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     async def _aget_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
-                "Please install it with `pip install tiktoken`."
-            )
+        """
+        Asynchronously generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and asynchronous embedding generation,
+        respecting the set embedding context length and chunk size. It supports
+        both `tiktoken` and HuggingFace tokenizers, based on the
+        `tiktoken_enabled` flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        try:
-            encoding = tiktoken.encoding_for_model(model_name)
-        except KeyError:
-            logger.warning("Warning: model not found. Using cl100k_base encoding.")
-            model = "cl100k_base"
-            encoding = tiktoken.get_encoding(model)
-        for i, text in enumerate(texts):
-            if self.model.endswith("001"):
-                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-            token = encoding.encode(
-                text,
-                allowed_special=self.allowed_special,
-                disallowed_special=self.disallowed_special,
-            )
-            for j in range(0, len(token), self.embedding_ctx_length):
-                tokens.append(token[j : j + self.embedding_ctx_length])
-                indices.append(i)
+        _chunk_size = chunk_size or self.chunk_size
+
+        # If the tiktoken flag is set to False, use a HuggingFace tokenizer instead
+        if not self.tiktoken_enabled:
+            try:
+                from transformers import AutoTokenizer
+            except ImportError:
+                raise ValueError(
+                    "Could not import transformers python package. "
+                    "This is needed in order for OpenAIEmbeddings to work without "
+                    "`tiktoken`. Please install it with `pip install transformers`."
+                )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name
+            )
+            for i, text in enumerate(texts):
+                # Tokenize the text using the HuggingFace transformers tokenizer
+                tokenized = tokenizer.encode(text, add_special_tokens=False)
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(tokenized), self.embedding_ctx_length):
+                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
+
+                    # Convert token IDs back to a string
+                    chunk_text = tokenizer.decode(token_chunk)
+                    tokens.append(chunk_text)
+                    indices.append(i)
+        else:
+            try:
+                import tiktoken
+            except ImportError:
+                raise ImportError(
+                    "Could not import tiktoken python package. "
+                    "This is needed in order for OpenAIEmbeddings to work. "
+                    "Please install it with `pip install tiktoken`."
+                )
+
+            try:
+                encoding = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                logger.warning("Warning: model not found. Using cl100k_base encoding.")
+                model = "cl100k_base"
+                encoding = tiktoken.get_encoding(model)
+            for i, text in enumerate(texts):
+                if self.model.endswith("001"):
+                    # See: https://github.com/openai/openai-python/
+                    # issues/418#issuecomment-1525939500
+                    # replace newlines, which can negatively affect performance.
+                    text = text.replace("\n", " ")
+
+                token = encoding.encode(
+                    text=text,
+                    allowed_special=self.allowed_special,
+                    disallowed_special=self.disallowed_special,
+                )
+
+                # Split tokens into chunks respecting the embedding_ctx_length
+                for j in range(0, len(token), self.embedding_ctx_length):
+                    tokens.append(token[j : j + self.embedding_ctx_length])
+                    indices.append(i)
+
         batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
@@ -520,6 +618,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             results[indices[i]].append(batched_embeddings[i])
             num_tokens_in_batch[indices[i]].append(len(tokens[i]))
 
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         for i in range(len(texts)):
             _result = results[i]
             if len(_result) == 0:
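
The async path is exercised the same way as the sync one; a minimal
sketch of calling it end to end (the local endpoint and dummy key are
hypothetical, as in the earlier example):

```python
import asyncio

from langchain.embeddings import OpenAIEmbeddings

async def main() -> None:
    # Hypothetical local OpenAI-compatible server, as in the earlier sketch.
    embeddings = OpenAIEmbeddings(
        openai_api_base="http://localhost:5001/v1",
        openai_api_key="dummy-key",
        tiktoken_enabled=False,
    )
    vectors = await embeddings.aembed_documents(["an async embedding example"])
    print(len(vectors), len(vectors[0]))

asyncio.run(main())
```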