Mirror of https://github.com/hwchase17/langchain.git (synced 2025-06-26 00:23:25 +00:00)
openai[patch]: fix special token default behavior (#21131)
By default, handle special token sequences as regular text.
This commit is contained in:
parent 0f7f448603
commit bef50ded63
libs/partners/openai/langchain_openai/embeddings/base.py

@@ -82,8 +82,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Automatically inferred from env var `OPENAI_API_KEY` if not provided."""
     openai_organization: Optional[str] = Field(default=None, alias="organization")
     """Automatically inferred from env var `OPENAI_ORG_ID` if not provided."""
-    allowed_special: Union[Literal["all"], Set[str]] = set()
-    disallowed_special: Union[Literal["all"], Set[str], Sequence[str]] = "all"
+    allowed_special: Union[Literal["all"], Set[str], None] = None
+    disallowed_special: Union[Literal["all"], Set[str], Sequence[str], None] = None
     chunk_size: int = 1000
     """Maximum number of texts to embed in each batch"""
     max_retries: int = 2
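Note: with both fields now defaulting to None, callers who relied on the old strict behavior can restore it explicitly. A minimal usage sketch, assuming `OPENAI_API_KEY` is set (the model name is only an example):

from langchain_openai import OpenAIEmbeddings

# New defaults: special sequences such as "<|endoftext|>" are embedded
# as regular text, matching the behavior of the raw OpenAI API.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Opting back into the previous strict behavior:
strict_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    disallowed_special="all",  # raise if the input contains special tokens
)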
@@ -246,31 +246,12 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             params["dimensions"] = self.dimensions
         return params

-    # please refer to
-    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
-    def _get_len_safe_embeddings(
-        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
-    ) -> List[List[float]]:
-        """
-        Generate length-safe embeddings for a list of texts.
-
-        This method handles tokenization and embedding generation, respecting the
-        set embedding context length and chunk size. It supports both tiktoken
-        and HuggingFace tokenizer based on the tiktoken_enabled flag.
-
-        Args:
-            texts (List[str]): A list of texts to embed.
-            engine (str): The engine or model to use for embeddings.
-            chunk_size (Optional[int]): The size of chunks for processing embeddings.
-
-        Returns:
-            List[List[float]]: A list of embeddings for each input text.
-        """
-
+    def _tokenize(
+        self, texts: List[str], chunk_size: int
+    ) -> Tuple[Iterable[int], List[List[float]], List[int]]:
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        _chunk_size = chunk_size or self.chunk_size

         # If tiktoken flag set to False
         if not self.tiktoken_enabled:
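The refactor routes both the sync and async paths through this new `_tokenize` helper, which returns a progress iterator plus parallel lists of token chunks and the index of the source text each chunk came from. A self-contained sketch of that bookkeeping (the `chunk_tokens` helper is illustrative, not part of the library):

from typing import List, Tuple

def chunk_tokens(
    token_lists: List[List[int]], ctx_length: int
) -> Tuple[List[List[int]], List[int]]:
    """Split each token sequence into chunks of at most ctx_length tokens,
    recording which input text each chunk belongs to."""
    tokens: List[List[int]] = []
    indices: List[int] = []
    for i, token in enumerate(token_lists):
        for j in range(0, len(token), ctx_length):
            tokens.append(token[j : j + ctx_length])
            indices.append(i)
    return tokens, indices

# Two inputs of 5 and 3 tokens with a context length of 2:
print(chunk_tokens([[1, 2, 3, 4, 5], [6, 7, 8]], 2))
# -> ([[1, 2], [3, 4], [5], [6, 7], [8]], [0, 0, 0, 1, 1])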
@@ -303,6 +284,14 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                 encoding = tiktoken.encoding_for_model(model_name)
             except KeyError:
                 encoding = tiktoken.get_encoding("cl100k_base")
+            encoder_kwargs: Dict[str, Any] = {
+                k: v
+                for k, v in {
+                    "allowed_special": self.allowed_special,
+                    "disallowed_special": self.disallowed_special,
+                }.items()
+                if v is not None
+            }
             for i, text in enumerate(texts):
                 if self.model.endswith("001"):
                     # See: https://github.com/openai/openai-python/
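Note: the comprehension forwards only the options a user set explicitly, so untouched defaults never reach tiktoken. The same filtering pattern in isolation:

from typing import Any, Dict

# One option left at its default (None), one set explicitly by the user.
allowed_special = None
disallowed_special = {"<|endoftext|>"}

encoder_kwargs: Dict[str, Any] = {
    k: v
    for k, v in {
        "allowed_special": allowed_special,
        "disallowed_special": disallowed_special,
    }.items()
    if v is not None
}
print(encoder_kwargs)  # {'disallowed_special': {'<|endoftext|>'}}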
@@ -310,11 +299,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                     # replace newlines, which can negatively affect performance.
                     text = text.replace("\n", " ")

-                token = encoding.encode(
-                    text=text,
-                    allowed_special=self.allowed_special,
-                    disallowed_special=self.disallowed_special,
-                )
+                if encoder_kwargs:
+                    token = encoding.encode(text, **encoder_kwargs)
+                else:
+                    token = encoding.encode_ordinary(text)

                 # Split tokens into chunks respecting the embedding_ctx_length
                 for j in range(0, len(token), self.embedding_ctx_length):
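This branch is the heart of the fix: with no explicit options, `encode_ordinary` tokenizes special sequences as plain text instead of raising. A standalone tiktoken demonstration of the two paths:

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "disallowed special token '<|endoftext|>'"

# Old default path: disallowed_special="all" raises on special sequences.
try:
    enc.encode(text, disallowed_special="all")
except ValueError as err:
    print(f"strict encode raised: {err}")

# New default path: encode_ordinary() treats the sequence as plain text,
# which is also how the raw OpenAI embeddings endpoint handles it.
print(enc.encode_ordinary(text)[:5])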
@@ -325,12 +313,35 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             try:
                 from tqdm.auto import tqdm

-                _iter: Iterable = tqdm(range(0, len(tokens), _chunk_size))
+                _iter: Iterable = tqdm(range(0, len(tokens), chunk_size))
             except ImportError:
-                _iter = range(0, len(tokens), _chunk_size)
+                _iter = range(0, len(tokens), chunk_size)
         else:
-            _iter = range(0, len(tokens), _chunk_size)
+            _iter = range(0, len(tokens), chunk_size)
+        return _iter, tokens, indices
+
+    # please refer to
+    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
+    def _get_len_safe_embeddings(
+        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
+    ) -> List[List[float]]:
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both tiktoken
+        and HuggingFace tokenizer based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
+        _chunk_size = chunk_size or self.chunk_size
+        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
         batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = self.client.create(
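Within `_tokenize`, the `chunk_size` parameter is now used directly; the `_chunk_size = chunk_size or self.chunk_size` fallback moves to the callers, as the relocated `_get_len_safe_embeddings` above shows. The optional tqdm progress bar follows a standard fallback pattern; a minimal standalone sketch (`batch_iterator` is a hypothetical name):

from typing import Iterable

def batch_iterator(n_items: int, chunk_size: int, show_progress: bool) -> Iterable[int]:
    """Yield batch start offsets, wrapping them in tqdm when it is installed."""
    if show_progress:
        try:
            from tqdm.auto import tqdm  # optional dependency

            return tqdm(range(0, n_items, chunk_size))
        except ImportError:
            pass  # fall through to a plain range
    return range(0, n_items, chunk_size)

for start in batch_iterator(n_items=10, chunk_size=4, show_progress=False):
    print(start)  # 0, 4, 8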
@@ -399,62 +410,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             List[List[float]]: A list of embeddings for each input text.
         """
-
-        tokens = []
-        indices = []
-        model_name = self.tiktoken_model_name or self.model
-        _chunk_size = chunk_size or self.chunk_size
-
-        # If tiktoken flag set to False
-        if not self.tiktoken_enabled:
-            try:
-                from transformers import AutoTokenizer
-            except ImportError:
-                raise ValueError(
-                    "Could not import transformers python package. "
-                    "This is needed in order to for OpenAIEmbeddings without "
-                    " `tiktoken`. Please install it with `pip install transformers`."
-                )
-
-            tokenizer = AutoTokenizer.from_pretrained(
-                pretrained_model_name_or_path=model_name
-            )
-            for i, text in enumerate(texts):
-                # Tokenize the text using HuggingFace transformers
-                tokenized = tokenizer.encode(text, add_special_tokens=False)
-
-                # Split tokens into chunks respecting the embedding_ctx_length
-                for j in range(0, len(tokenized), self.embedding_ctx_length):
-                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
-
-                    # Convert token IDs back to a string
-                    chunk_text = tokenizer.decode(token_chunk)
-                    tokens.append(chunk_text)
-                    indices.append(i)
-        else:
-            try:
-                encoding = tiktoken.encoding_for_model(model_name)
-            except KeyError:
-                logger.warning("Warning: model not found. Using cl100k_base encoding.")
-                model = "cl100k_base"
-                encoding = tiktoken.get_encoding(model)
-            for i, text in enumerate(texts):
-                if self.model.endswith("001"):
-                    # See: https://github.com/openai/openai-python/
-                    #      issues/418#issuecomment-1525939500
-                    # replace newlines, which can negatively affect performance.
-                    text = text.replace("\n", " ")
-
-                token = encoding.encode(
-                    text=text,
-                    allowed_special=self.allowed_special,
-                    disallowed_special=self.disallowed_special,
-                )
-
-                # Split tokens into chunks respecting the embedding_ctx_length
-                for j in range(0, len(token), self.embedding_ctx_length):
-                    tokens.append(token[j : j + self.embedding_ctx_length])
-                    indices.append(i)
-
+        _chunk_size = chunk_size or self.chunk_size
+        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
         batched_embeddings: List[List[float]] = []
-        _chunk_size = chunk_size or self.chunk_size
         for i in range(0, len(tokens), _chunk_size):
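The async `_aget_len_safe_embeddings` now delegates to the same `_tokenize` helper instead of duplicating the HuggingFace and tiktoken branches. A minimal async usage sketch, assuming `OPENAI_API_KEY` is set and using an example model name:

import asyncio

from langchain_openai import OpenAIEmbeddings

async def main() -> None:
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vectors = await embeddings.aembed_documents(
        ["special sequences like '<|endoftext|>' now embed as plain text"]
    )
    print(len(vectors), len(vectors[0]))

asyncio.run(main())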
libs/partners/openai/poetry.lock (generated): 2 lines changed
@@ -1286,4 +1286,4 @@ watchmedo = ["PyYAML (>=3.10)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "f8a406a4ebd93e5c2ef3fcf4a3cebdd588ce09e288dc31b7b9b6b1560285575a"
+content-hash = "1d9cefc90178d94dee2a09afc14af160a7e35e4972ad4701d3bbbfdde14a81fa"
libs/partners/openai/pyproject.toml

@@ -29,6 +29,7 @@ pytest-asyncio = "^0.21.1"
 langchain-core = { path = "../../core", develop = true }
 pytest-cov = "^4.1.0"
 langchain-standard-tests = { path = "../../standard-tests", develop = true }
+numpy = "^1.24"

 [tool.poetry.group.codespell]
 optional = true
libs/partners/openai/tests/integration_tests/embeddings/test_base.py

@@ -1,4 +1,7 @@
 """Test OpenAI embeddings."""
+import numpy as np
+import openai

 from langchain_openai.embeddings.base import OpenAIEmbeddings
@@ -26,3 +29,31 @@ def test_langchain_openai_embeddings_dimensions() -> None:
     output = embedding.embed_documents(documents)
     assert len(output) == 1
     assert len(output[0]) == 128
+
+
+def test_langchain_openai_embeddings_equivalent_to_raw() -> None:
+    documents = ["disallowed special token '<|endoftext|>'"]
+    embedding = OpenAIEmbeddings()
+
+    lc_output = embedding.embed_documents(documents)[0]
+    direct_output = (
+        openai.OpenAI()
+        .embeddings.create(input=documents, model=embedding.model)
+        .data[0]
+        .embedding
+    )
+    assert np.isclose(lc_output, direct_output).all()
+
+
+async def test_langchain_openai_embeddings_equivalent_to_raw_async() -> None:
+    documents = ["disallowed special token '<|endoftext|>'"]
+    embedding = OpenAIEmbeddings()
+
+    lc_output = (await embedding.aembed_documents(documents))[0]
+    client = openai.AsyncOpenAI()
+    direct_output = (
+        (await client.embeddings.create(input=documents, model=embedding.model))
+        .data[0]
+        .embedding
+    )
+    assert np.isclose(lc_output, direct_output).all()
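The new integration tests assert that the wrapper now matches the raw `openai` client on input containing a previously disallowed special token. `np.isclose` is used rather than exact equality because floating-point vectors for the same embedding can differ by rounding; a tiny illustration:

import numpy as np

a = [0.123456789, -0.987654321]
b = [0.123456788, -0.987654322]  # same vector up to float rounding
assert np.isclose(a, b).all()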