openai[patch]: fix special token default behavior (#21131)

By default (when neither `allowed_special` nor `disallowed_special` is set), handle special token sequences in the input as regular text.
This commit is contained in:
Bagatur
2024-04-30 20:08:24 -04:00
committed by GitHub
parent 0f7f448603
commit bef50ded63
4 changed files with 77 additions and 88 deletions

View File

@@ -82,8 +82,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
"""Automatically inferred from env var `OPENAI_API_KEY` if not provided."""
openai_organization: Optional[str] = Field(default=None, alias="organization")
"""Automatically inferred from env var `OPENAI_ORG_ID` if not provided."""
allowed_special: Union[Literal["all"], Set[str]] = set()
disallowed_special: Union[Literal["all"], Set[str], Sequence[str]] = "all"
allowed_special: Union[Literal["all"], Set[str], None] = None
disallowed_special: Union[Literal["all"], Set[str], Sequence[str], None] = None
chunk_size: int = 1000
"""Maximum number of texts to embed in each batch"""
max_retries: int = 2
@@ -246,31 +246,12 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
params["dimensions"] = self.dimensions
return params
# please refer to
# https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
def _get_len_safe_embeddings(
self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
) -> List[List[float]]:
"""
Generate length-safe embeddings for a list of texts.
This method handles tokenization and embedding generation, respecting the
set embedding context length and chunk size. It supports both tiktoken
and HuggingFace tokenizer based on the tiktoken_enabled flag.
Args:
texts (List[str]): A list of texts to embed.
engine (str): The engine or model to use for embeddings.
chunk_size (Optional[int]): The size of chunks for processing embeddings.
Returns:
List[List[float]]: A list of embeddings for each input text.
"""
def _tokenize(
self, texts: List[str], chunk_size: int
) -> Tuple[Iterable[int], List[List[float]], List[int]]:
tokens = []
indices = []
model_name = self.tiktoken_model_name or self.model
_chunk_size = chunk_size or self.chunk_size
# If tiktoken flag set to False
if not self.tiktoken_enabled:
@@ -303,6 +284,14 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
encoding = tiktoken.encoding_for_model(model_name)
except KeyError:
encoding = tiktoken.get_encoding("cl100k_base")
encoder_kwargs: Dict[str, Any] = {
k: v
for k, v in {
"allowed_special": self.allowed_special,
"disallowed_special": self.disallowed_special,
}.items()
if v is not None
}
for i, text in enumerate(texts):
if self.model.endswith("001"):
# See: https://github.com/openai/openai-python/
@@ -310,11 +299,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
# replace newlines, which can negatively affect performance.
text = text.replace("\n", " ")
token = encoding.encode(
text=text,
allowed_special=self.allowed_special,
disallowed_special=self.disallowed_special,
)
if encoder_kwargs:
token = encoding.encode(text, **encoder_kwargs)
else:
token = encoding.encode_ordinary(text)
# Split tokens into chunks respecting the embedding_ctx_length
for j in range(0, len(token), self.embedding_ctx_length):
@@ -325,12 +313,35 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
try:
from tqdm.auto import tqdm
_iter: Iterable = tqdm(range(0, len(tokens), _chunk_size))
_iter: Iterable = tqdm(range(0, len(tokens), chunk_size))
except ImportError:
_iter = range(0, len(tokens), _chunk_size)
_iter = range(0, len(tokens), chunk_size)
else:
_iter = range(0, len(tokens), _chunk_size)
_iter = range(0, len(tokens), chunk_size)
return _iter, tokens, indices
# please refer to
# https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
def _get_len_safe_embeddings(
self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
) -> List[List[float]]:
"""
Generate length-safe embeddings for a list of texts.
This method handles tokenization and embedding generation, respecting the
set embedding context length and chunk size. It supports both tiktoken
and HuggingFace tokenizer based on the tiktoken_enabled flag.
Args:
texts (List[str]): A list of texts to embed.
engine (str): The engine or model to use for embeddings.
chunk_size (Optional[int]): The size of chunks for processing embeddings.
Returns:
List[List[float]]: A list of embeddings for each input text.
"""
_chunk_size = chunk_size or self.chunk_size
_iter, tokens, indices = self._tokenize(texts, _chunk_size)
batched_embeddings: List[List[float]] = []
for i in _iter:
response = self.client.create(
@@ -399,62 +410,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
List[List[float]]: A list of embeddings for each input text.
"""
tokens = []
indices = []
model_name = self.tiktoken_model_name or self.model
_chunk_size = chunk_size or self.chunk_size
# If tiktoken flag set to False
if not self.tiktoken_enabled:
try:
from transformers import AutoTokenizer
except ImportError:
raise ValueError(
"Could not import transformers python package. "
"This is needed in order to for OpenAIEmbeddings without "
" `tiktoken`. Please install it with `pip install transformers`."
)
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_name
)
for i, text in enumerate(texts):
# Tokenize the text using HuggingFace transformers
tokenized = tokenizer.encode(text, add_special_tokens=False)
# Split tokens into chunks respecting the embedding_ctx_length
for j in range(0, len(tokenized), self.embedding_ctx_length):
token_chunk = tokenized[j : j + self.embedding_ctx_length]
# Convert token IDs back to a string
chunk_text = tokenizer.decode(token_chunk)
tokens.append(chunk_text)
indices.append(i)
else:
try:
encoding = tiktoken.encoding_for_model(model_name)
except KeyError:
logger.warning("Warning: model not found. Using cl100k_base encoding.")
model = "cl100k_base"
encoding = tiktoken.get_encoding(model)
for i, text in enumerate(texts):
if self.model.endswith("001"):
# See: https://github.com/openai/openai-python/
# issues/418#issuecomment-1525939500
# replace newlines, which can negatively affect performance.
text = text.replace("\n", " ")
token = encoding.encode(
text=text,
allowed_special=self.allowed_special,
disallowed_special=self.disallowed_special,
)
# Split tokens into chunks respecting the embedding_ctx_length
for j in range(0, len(token), self.embedding_ctx_length):
tokens.append(token[j : j + self.embedding_ctx_length])
indices.append(i)
_iter, tokens, indices = self._tokenize(texts, _chunk_size)
batched_embeddings: List[List[float]] = []
_chunk_size = chunk_size or self.chunk_size
for i in range(0, len(tokens), _chunk_size):