Mirror of https://github.com/hwchase17/langchain.git

Commit bef50ded63 (parent 0f7f448603)

openai[patch]: fix special token default behavior (#21131)

By default, handle special sequences as regular text: `allowed_special` and `disallowed_special` now default to `None`, and when neither is set the embeddings code tokenizes with `encode_ordinary()` instead of `encode()`.
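To make that one-line summary concrete, here is a minimal sketch of the two tokenization paths involved (assuming `tiktoken` is installed; the sample text and encoding choice are illustrative): `encode()` with tiktoken's strict defaults raises on special sequences, while `encode_ordinary()` treats them as plain text, which is the new default path.

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "disallowed special token '<|endoftext|>'"

# Old default path: disallowed_special="all" makes encode() raise.
try:
    enc.encode(text, disallowed_special="all")
except ValueError as err:
    print(f"strict encode() raised: {err}")

# New default path: special sequences are tokenized as regular text.
print(enc.encode_ordinary(text)[:8])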
File: langchain_openai/embeddings/base.py (path inferred from the test import of OpenAIEmbeddings below)

@@ -82,8 +82,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Automatically inferred from env var `OPENAI_API_KEY` if not provided."""
     openai_organization: Optional[str] = Field(default=None, alias="organization")
     """Automatically inferred from env var `OPENAI_ORG_ID` if not provided."""
-    allowed_special: Union[Literal["all"], Set[str]] = set()
-    disallowed_special: Union[Literal["all"], Set[str], Sequence[str]] = "all"
+    allowed_special: Union[Literal["all"], Set[str], None] = None
+    disallowed_special: Union[Literal["all"], Set[str], Sequence[str], None] = None
     chunk_size: int = 1000
     """Maximum number of texts to embed in each batch"""
     max_retries: int = 2
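A small sketch of what the new defaults buy (hypothetical `EmbeddingsConfig` stands in for `OpenAIEmbeddings`; assumes pydantic is installed): with `None` as the default, the code can tell "the user configured nothing" apart from an explicit `set()` or `"all"`.

from typing import Literal, Sequence, Set, Union

from pydantic import BaseModel

class EmbeddingsConfig(BaseModel):  # hypothetical stand-in for OpenAIEmbeddings
    allowed_special: Union[Literal["all"], Set[str], None] = None
    disallowed_special: Union[Literal["all"], Set[str], Sequence[str], None] = None

cfg = EmbeddingsConfig()
# Neither option was set, so the tokenizer can take the encode_ordinary() path.
print(cfg.allowed_special is None and cfg.disallowed_special is None)  # True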
@@ -246,31 +246,12 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             params["dimensions"] = self.dimensions
         return params

-    # please refer to
-    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
-    def _get_len_safe_embeddings(
-        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
-    ) -> List[List[float]]:
-        """
-        Generate length-safe embeddings for a list of texts.
-
-        This method handles tokenization and embedding generation, respecting the
-        set embedding context length and chunk size. It supports both tiktoken
-        and HuggingFace tokenizer based on the tiktoken_enabled flag.
-
-        Args:
-            texts (List[str]): A list of texts to embed.
-            engine (str): The engine or model to use for embeddings.
-            chunk_size (Optional[int]): The size of chunks for processing embeddings.
-
-        Returns:
-            List[List[float]]: A list of embeddings for each input text.
-        """
-
+    def _tokenize(
+        self, texts: List[str], chunk_size: int
+    ) -> Tuple[Iterable[int], List[List[float]], List[int]]:
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        _chunk_size = chunk_size or self.chunk_size

         # If tiktoken flag set to False
         if not self.tiktoken_enabled:
@@ -303,6 +284,14 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                 encoding = tiktoken.encoding_for_model(model_name)
             except KeyError:
                 encoding = tiktoken.get_encoding("cl100k_base")
+            encoder_kwargs: Dict[str, Any] = {
+                k: v
+                for k, v in {
+                    "allowed_special": self.allowed_special,
+                    "disallowed_special": self.disallowed_special,
+                }.items()
+                if v is not None
+            }
             for i, text in enumerate(texts):
                 if self.model.endswith("001"):
                     # See: https://github.com/openai/openai-python/
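The `encoder_kwargs` comprehension above forwards only the options a caller explicitly set; `None` values fall out, so untouched defaults leave the dict empty. The same pattern in isolation (function and parameter names here are illustrative):

from typing import Any, Dict, Set, Union

def build_encoder_kwargs(
    allowed_special: Union[str, Set[str], None] = None,
    disallowed_special: Union[str, Set[str], None] = None,
) -> Dict[str, Any]:
    # Forward only explicitly-set options to the tokenizer.
    return {
        k: v
        for k, v in {
            "allowed_special": allowed_special,
            "disallowed_special": disallowed_special,
        }.items()
        if v is not None
    }

print(build_encoder_kwargs())                          # {} -> encode_ordinary()
print(build_encoder_kwargs(disallowed_special="all"))  # {'disallowed_special': 'all'} -> encode(**kwargs)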
@@ -310,11 +299,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                     # replace newlines, which can negatively affect performance.
                     text = text.replace("\n", " ")

-                token = encoding.encode(
-                    text=text,
-                    allowed_special=self.allowed_special,
-                    disallowed_special=self.disallowed_special,
-                )
+                if encoder_kwargs:
+                    token = encoding.encode(text, **encoder_kwargs)
+                else:
+                    token = encoding.encode_ordinary(text)

                 # Split tokens into chunks respecting the embedding_ctx_length
                 for j in range(0, len(token), self.embedding_ctx_length):
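The trailing context shows the chunking step this feeds into. A standalone sketch of that split (the 8191 context length matches OpenAIEmbeddings' default `embedding_ctx_length`, used here only as an illustrative value):

embedding_ctx_length = 8191  # default context window for OpenAI embedding models

tokens: list = []
indices: list = []

token = list(range(20000))  # stand-in for one encoded document
i = 0                       # index of the source text

# Split into windows no longer than the context length, remembering
# which input text each window came from.
for j in range(0, len(token), embedding_ctx_length):
    tokens.append(token[j : j + embedding_ctx_length])
    indices.append(i)

print([len(chunk) for chunk in tokens])  # [8191, 8191, 3618]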
@@ -325,12 +313,35 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             try:
                 from tqdm.auto import tqdm

-                _iter: Iterable = tqdm(range(0, len(tokens), _chunk_size))
+                _iter: Iterable = tqdm(range(0, len(tokens), chunk_size))
             except ImportError:
-                _iter = range(0, len(tokens), _chunk_size)
+                _iter = range(0, len(tokens), chunk_size)
         else:
-            _iter = range(0, len(tokens), _chunk_size)
-
+            _iter = range(0, len(tokens), chunk_size)
+        return _iter, tokens, indices
+
+    # please refer to
+    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
+    def _get_len_safe_embeddings(
+        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
+    ) -> List[List[float]]:
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both tiktoken
+        and HuggingFace tokenizer based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
+        _chunk_size = chunk_size or self.chunk_size
+        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
         batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = self.client.create(
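`_get_len_safe_embeddings` then embeds those chunks batch by batch; per the cookbook notebook referenced in the comment, each text's chunk embeddings are recombined downstream by a length-weighted average that is re-normalized to unit length. A sketch of that recombination step (a simplified reading of the cookbook approach, not the exact library code):

import numpy as np

def combine_chunk_embeddings(chunk_embeddings: list, chunk_lens: list) -> list:
    # Length-weighted mean of the chunk vectors...
    avg = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
    # ...re-normalized so the result is unit length again.
    return (avg / np.linalg.norm(avg)).tolist()

# Two chunks of lengths 3 and 1: the longer chunk dominates the average.
print(combine_chunk_embeddings([[1.0, 0.0], [0.0, 1.0]], [3, 1]))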
@@ -399,62 +410,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             List[List[float]]: A list of embeddings for each input text.
         """

-        tokens = []
-        indices = []
-        model_name = self.tiktoken_model_name or self.model
         _chunk_size = chunk_size or self.chunk_size
+        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
-
-        # If tiktoken flag set to False
-        if not self.tiktoken_enabled:
-            try:
-                from transformers import AutoTokenizer
-            except ImportError:
-                raise ValueError(
-                    "Could not import transformers python package. "
-                    "This is needed in order to for OpenAIEmbeddings without "
-                    " `tiktoken`. Please install it with `pip install transformers`."
-                )
-
-            tokenizer = AutoTokenizer.from_pretrained(
-                pretrained_model_name_or_path=model_name
-            )
-            for i, text in enumerate(texts):
-                # Tokenize the text using HuggingFace transformers
-                tokenized = tokenizer.encode(text, add_special_tokens=False)
-
-                # Split tokens into chunks respecting the embedding_ctx_length
-                for j in range(0, len(tokenized), self.embedding_ctx_length):
-                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
-
-                    # Convert token IDs back to a string
-                    chunk_text = tokenizer.decode(token_chunk)
-                    tokens.append(chunk_text)
-                    indices.append(i)
-        else:
-            try:
-                encoding = tiktoken.encoding_for_model(model_name)
-            except KeyError:
-                logger.warning("Warning: model not found. Using cl100k_base encoding.")
-                model = "cl100k_base"
-                encoding = tiktoken.get_encoding(model)
-            for i, text in enumerate(texts):
-                if self.model.endswith("001"):
-                    # See: https://github.com/openai/openai-python/
-                    # issues/418#issuecomment-1525939500
-                    # replace newlines, which can negatively affect performance.
-                    text = text.replace("\n", " ")
-
-                token = encoding.encode(
-                    text=text,
-                    allowed_special=self.allowed_special,
-                    disallowed_special=self.disallowed_special,
-                )
-
-                # Split tokens into chunks respecting the embedding_ctx_length
-                for j in range(0, len(token), self.embedding_ctx_length):
-                    tokens.append(token[j : j + self.embedding_ctx_length])
-                    indices.append(i)
-
         batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
         for i in range(0, len(tokens), _chunk_size):
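The async hunk above is the payoff of the refactor: the duplicated tokenization block is deleted and both the sync and async paths call the single `_tokenize` helper (tokenization is CPU-bound, so calling it synchronously from the async method is fine), which keeps special-token handling identical in both. A toy sketch of the pattern (all names hypothetical):

import asyncio

def _tokenize(texts: list) -> list:
    # Shared, CPU-bound tokenization logic lives in exactly one place.
    return [t.split() for t in texts]

def embed(texts: list) -> list:
    return _tokenize(texts)

async def aembed(texts: list) -> list:
    return _tokenize(texts)  # same helper; behavior cannot drift apart

print(embed(["hello world"]))
print(asyncio.run(aembed(["hello world"])))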
File: libs/partners/openai/poetry.lock (generated; 2 lines changed)

@@ -1286,4 +1286,4 @@ watchmedo = ["PyYAML (>=3.10)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "f8a406a4ebd93e5c2ef3fcf4a3cebdd588ce09e288dc31b7b9b6b1560285575a"
+content-hash = "1d9cefc90178d94dee2a09afc14af160a7e35e4972ad4701d3bbbfdde14a81fa"
File: libs/partners/openai/pyproject.toml (inferred; the hunk edits the package's test dependency group)

@@ -29,6 +29,7 @@ pytest-asyncio = "^0.21.1"
 langchain-core = { path = "../../core", develop = true }
 pytest-cov = "^4.1.0"
 langchain-standard-tests = { path = "../../standard-tests", develop = true }
+numpy = "^1.24"

 [tool.poetry.group.codespell]
 optional = true
File: OpenAI embeddings tests (path not shown in the capture)

@@ -1,4 +1,7 @@
 """Test OpenAI embeddings."""

+import numpy as np
+import openai
+
 from langchain_openai.embeddings.base import OpenAIEmbeddings
@@ -26,3 +29,31 @@ def test_langchain_openai_embeddings_dimensions() -> None:
     output = embedding.embed_documents(documents)
     assert len(output) == 1
     assert len(output[0]) == 128
+
+
+def test_langchain_openai_embeddings_equivalent_to_raw() -> None:
+    documents = ["disallowed special token '<|endoftext|>'"]
+    embedding = OpenAIEmbeddings()
+
+    lc_output = embedding.embed_documents(documents)[0]
+    direct_output = (
+        openai.OpenAI()
+        .embeddings.create(input=documents, model=embedding.model)
+        .data[0]
+        .embedding
+    )
+    assert np.isclose(lc_output, direct_output).all()
+
+
+async def test_langchain_openai_embeddings_equivalent_to_raw_async() -> None:
+    documents = ["disallowed special token '<|endoftext|>'"]
+    embedding = OpenAIEmbeddings()
+
+    lc_output = (await embedding.aembed_documents(documents))[0]
+    client = openai.AsyncOpenAI()
+    direct_output = (
+        (await client.embeddings.create(input=documents, model=embedding.model))
+        .data[0]
+        .embedding
+    )
+    assert np.isclose(lc_output, direct_output).all()
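The new tests compare LangChain's output to the raw client's with `np.isclose(...).all()` rather than exact equality, since embeddings are floats returned over the network. A quick illustration of why the tolerance matters:

import numpy as np

a = np.array([0.12345678, -0.5])
b = a + 1e-9  # tiny numeric jitter, e.g. from serialization

print(np.isclose(a, b).all())  # True: within default rtol/atol
print(bool((a == b).all()))    # False: exact comparison is too strict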