From bef50ded63100992ad04a354107e5fc39522eb6d Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Tue, 30 Apr 2024 20:08:24 -0400
Subject: [PATCH] openai[patch]: fix special token default behavior (#21131)

By default handle special sequences as regular text
---
 .../langchain_openai/embeddings/base.py       | 131 ++++++------
 libs/partners/openai/poetry.lock              |   2 +-
 libs/partners/openai/pyproject.toml           |   1 +
 .../integration_tests/embeddings/test_base.py |  31 +++++
 4 files changed, 77 insertions(+), 88 deletions(-)

diff --git a/libs/partners/openai/langchain_openai/embeddings/base.py b/libs/partners/openai/langchain_openai/embeddings/base.py
index cb3850e834d..6cc017ad480 100644
--- a/libs/partners/openai/langchain_openai/embeddings/base.py
+++ b/libs/partners/openai/langchain_openai/embeddings/base.py
@@ -82,8 +82,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Automatically inferred from env var `OPENAI_API_KEY` if not provided."""
     openai_organization: Optional[str] = Field(default=None, alias="organization")
     """Automatically inferred from env var `OPENAI_ORG_ID` if not provided."""
-    allowed_special: Union[Literal["all"], Set[str]] = set()
-    disallowed_special: Union[Literal["all"], Set[str], Sequence[str]] = "all"
+    allowed_special: Union[Literal["all"], Set[str], None] = None
+    disallowed_special: Union[Literal["all"], Set[str], Sequence[str], None] = None
     chunk_size: int = 1000
     """Maximum number of texts to embed in each batch"""
     max_retries: int = 2
@@ -246,31 +246,12 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             params["dimensions"] = self.dimensions
         return params

-    # please refer to
-    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
-    def _get_len_safe_embeddings(
-        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
-    ) -> List[List[float]]:
-        """
-        Generate length-safe embeddings for a list of texts.
-
-        This method handles tokenization and embedding generation, respecting the
-        set embedding context length and chunk size. It supports both tiktoken
-        and HuggingFace tokenizer based on the tiktoken_enabled flag.
-
-        Args:
-            texts (List[str]): A list of texts to embed.
-            engine (str): The engine or model to use for embeddings.
-            chunk_size (Optional[int]): The size of chunks for processing embeddings.
-
-        Returns:
-            List[List[float]]: A list of embeddings for each input text.
-        """
-
+    def _tokenize(
+        self, texts: List[str], chunk_size: int
+    ) -> Tuple[Iterable[int], List[Union[List[int], str]], List[int]]:
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        _chunk_size = chunk_size or self.chunk_size

         # If tiktoken flag set to False
         if not self.tiktoken_enabled:
@@ -303,6 +284,14 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                 encoding = tiktoken.encoding_for_model(model_name)
             except KeyError:
                 encoding = tiktoken.get_encoding("cl100k_base")
+            encoder_kwargs: Dict[str, Any] = {
+                k: v
+                for k, v in {
+                    "allowed_special": self.allowed_special,
+                    "disallowed_special": self.disallowed_special,
+                }.items()
+                if v is not None
+            }
             for i, text in enumerate(texts):
                 if self.model.endswith("001"):
                     # See: https://github.com/openai/openai-python/
                     #      issues/418#issuecomment-1525939500
                     # replace newlines, which can negatively affect performance.
                     text = text.replace("\n", " ")
-                token = encoding.encode(
-                    text=text,
-                    allowed_special=self.allowed_special,
-                    disallowed_special=self.disallowed_special,
-                )
+                if encoder_kwargs:
+                    token = encoding.encode(text, **encoder_kwargs)
+                else:
+                    token = encoding.encode_ordinary(text)

                 # Split tokens into chunks respecting the embedding_ctx_length
                 for j in range(0, len(token), self.embedding_ctx_length):
@@ -325,12 +313,35 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             try:
                 from tqdm.auto import tqdm

-                _iter: Iterable = tqdm(range(0, len(tokens), _chunk_size))
+                _iter: Iterable = tqdm(range(0, len(tokens), chunk_size))
             except ImportError:
-                _iter = range(0, len(tokens), _chunk_size)
+                _iter = range(0, len(tokens), chunk_size)
         else:
-            _iter = range(0, len(tokens), _chunk_size)
+            _iter = range(0, len(tokens), chunk_size)
+        return _iter, tokens, indices

+    # please refer to
+    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
+    def _get_len_safe_embeddings(
+        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
+    ) -> List[List[float]]:
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both tiktoken
+        and HuggingFace tokenizer based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
+        _chunk_size = chunk_size or self.chunk_size
+        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
         batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = self.client.create(
@@ -399,62 +410,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             List[List[float]]: A list of embeddings for each input text.
         """
-        tokens = []
-        indices = []
-        model_name = self.tiktoken_model_name or self.model
         _chunk_size = chunk_size or self.chunk_size
-
-        # If tiktoken flag set to False
-        if not self.tiktoken_enabled:
-            try:
-                from transformers import AutoTokenizer
-            except ImportError:
-                raise ValueError(
-                    "Could not import transformers python package. "
-                    "This is needed in order to for OpenAIEmbeddings without "
-                    " `tiktoken`. Please install it with `pip install transformers`."
-                )
-
-            tokenizer = AutoTokenizer.from_pretrained(
-                pretrained_model_name_or_path=model_name
-            )
-            for i, text in enumerate(texts):
-                # Tokenize the text using HuggingFace transformers
-                tokenized = tokenizer.encode(text, add_special_tokens=False)
-
-                # Split tokens into chunks respecting the embedding_ctx_length
-                for j in range(0, len(tokenized), self.embedding_ctx_length):
-                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
-
-                    # Convert token IDs back to a string
-                    chunk_text = tokenizer.decode(token_chunk)
-                    tokens.append(chunk_text)
-                    indices.append(i)
-        else:
-            try:
-                encoding = tiktoken.encoding_for_model(model_name)
-            except KeyError:
-                logger.warning("Warning: model not found. Using cl100k_base encoding.")
-                model = "cl100k_base"
-                encoding = tiktoken.get_encoding(model)
-            for i, text in enumerate(texts):
-                if self.model.endswith("001"):
-                    # See: https://github.com/openai/openai-python/
-                    #      issues/418#issuecomment-1525939500
-                    # replace newlines, which can negatively affect performance.
-                    text = text.replace("\n", " ")
-
-                token = encoding.encode(
-                    text=text,
-                    allowed_special=self.allowed_special,
-                    disallowed_special=self.disallowed_special,
-                )
-
-                # Split tokens into chunks respecting the embedding_ctx_length
-                for j in range(0, len(token), self.embedding_ctx_length):
-                    tokens.append(token[j : j + self.embedding_ctx_length])
-                    indices.append(i)
-
+        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
         batched_embeddings: List[List[float]] = []
         _chunk_size = chunk_size or self.chunk_size
         for i in range(0, len(tokens), _chunk_size):
diff --git a/libs/partners/openai/poetry.lock b/libs/partners/openai/poetry.lock
index 312eaafad49..19d79b0c7e2 100644
--- a/libs/partners/openai/poetry.lock
+++ b/libs/partners/openai/poetry.lock
@@ -1286,4 +1286,4 @@ watchmedo = ["PyYAML (>=3.10)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "f8a406a4ebd93e5c2ef3fcf4a3cebdd588ce09e288dc31b7b9b6b1560285575a"
+content-hash = "1d9cefc90178d94dee2a09afc14af160a7e35e4972ad4701d3bbbfdde14a81fa"
diff --git a/libs/partners/openai/pyproject.toml b/libs/partners/openai/pyproject.toml
index 24bf7cf513e..04d657857bd 100644
--- a/libs/partners/openai/pyproject.toml
+++ b/libs/partners/openai/pyproject.toml
@@ -29,6 +29,7 @@ pytest-asyncio = "^0.21.1"
 langchain-core = { path = "../../core", develop = true }
 pytest-cov = "^4.1.0"
 langchain-standard-tests = { path = "../../standard-tests", develop = true }
+numpy = "^1.24"

 [tool.poetry.group.codespell]
 optional = true
diff --git a/libs/partners/openai/tests/integration_tests/embeddings/test_base.py b/libs/partners/openai/tests/integration_tests/embeddings/test_base.py
index e63e77b5a9e..0964d9886f4 100644
--- a/libs/partners/openai/tests/integration_tests/embeddings/test_base.py
+++ b/libs/partners/openai/tests/integration_tests/embeddings/test_base.py
@@ -1,4 +1,7 @@
 """Test OpenAI embeddings."""
+import numpy as np
+import openai
+
 from langchain_openai.embeddings.base import OpenAIEmbeddings


@@ -26,3 +29,31 @@ def test_langchain_openai_embeddings_dimensions() -> None:
     output = embedding.embed_documents(documents)
     assert len(output) == 1
     assert len(output[0]) == 128
+
+
+def test_langchain_openai_embeddings_equivalent_to_raw() -> None:
+    documents = ["disallowed special token '<|endoftext|>'"]
+    embedding = OpenAIEmbeddings()
+
+    lc_output = embedding.embed_documents(documents)[0]
+    direct_output = (
+        openai.OpenAI()
+        .embeddings.create(input=documents, model=embedding.model)
+        .data[0]
+        .embedding
+    )
+    assert np.isclose(lc_output, direct_output).all()
+
+
+async def test_langchain_openai_embeddings_equivalent_to_raw_async() -> None:
+    documents = ["disallowed special token '<|endoftext|>'"]
+    embedding = OpenAIEmbeddings()
+
+    lc_output = (await embedding.aembed_documents(documents))[0]
+    client = openai.AsyncOpenAI()
+    direct_output = (
+        (await client.embeddings.create(input=documents, model=embedding.model))
+        .data[0]
+        .embedding
+    )
+    assert np.isclose(lc_output, direct_output).all()
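
Reviewer note, not part of the patch: the sketch below illustrates the tiktoken behavior this change routes around. It assumes only that tiktoken is installed; the text variable mirrors the integration tests above and the names are otherwise illustrative.

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    text = "disallowed special token '<|endoftext|>'"

    # Previous default: allowed_special=set() and disallowed_special="all" were
    # always forwarded to encode(), so a special sequence in user text raised
    # instead of being embedded.
    try:
        enc.encode(text)
    except ValueError as err:
        print(f"encode() raised: {err}")

    # New default: with both fields None, nothing is forwarded and the code
    # falls back to encode_ordinary(), which treats special sequences as
    # regular text -- matching what the raw OpenAI embeddings API does with
    # the same input.
    print(enc.encode_ordinary(text))

Callers who relied on the old strictness can opt back in explicitly, e.g. OpenAIEmbeddings(disallowed_special="all"), since both fields still accept their previous values.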