openai: Add skip_tokenization option to OpenAIEmbeddings (Fixes #30574)

2025-08-20 01:49:51 +00:00 · 2025-04-04 06:43:42 +05:30 · 2025-04-04 06:43:42 +05:30 · 56c0460d6e
commit 56c0460d6e
parent af66ab098e
1 changed file with 10 additions and 2 deletions
--- a/libs/partners/openai/langchain_openai/embeddings/base.py
+++ b/libs/partners/openai/langchain_openai/embeddings/base.py
@@ -238,6 +238,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
    when using one of the many model providers that expose an OpenAI-like
    API but with different models. In those cases, in order to avoid erroring
    when tiktoken is called, you can specify a model name to use here."""
+    skip_tokenization: bool = False
+    """Set this to True to skip tokenization entirely and pass texts directly to the API.
+    Use this for OpenAI-compatible APIs that don't support tiktoken or huggingface tokenizers."""
    show_progress_bar: bool = False
    """Whether to show a progress bar when embedding."""
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
@@ -387,8 +390,12 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
        indices: List[int] = []
        model_name = self.tiktoken_model_name or self.model

-        # If tiktoken flag set to False
-        if not self.tiktoken_enabled:
+        # Skip tokenization entirely and use raw text
+        if self.skip_tokenization:
+            for i, text in enumerate(texts):
+                tokens.append(text)
+                indices.append(i)
+        elif not self.tiktoken_enabled:
            try:
                from transformers import AutoTokenizer
            except ImportError:
@@ -396,6 +403,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                    "Could not import transformers python package. "
                    "This is needed for OpenAIEmbeddings to work without "
                    "`tiktoken`. Please install it with `pip install transformers`. "
+                    "Alternatively, set `skip_tokenization=True` to bypass tokenization entirely."
                )

            tokenizer = AutoTokenizer.from_pretrained(