langchain/libs/partners/openai/tests/integration_tests/embeddings/test_base.py
ccurme 56499cf58b
openai[patch]: unskip test and relax tolerance in embeddings comparison (#28262)
From what I can tell, the response returned by the SDK is not deterministic:
```python
import numpy as np
import openai

documents = ["disallowed special token '<|endoftext|>'"]
model = "text-embedding-ada-002"

direct_output_1 = (
    openai.OpenAI()
    .embeddings.create(input=documents, model=model)
    .data[0]
    .embedding
)

for i in range(10):
    direct_output_2 = (
        openai.OpenAI()
        .embeddings.create(input=documents, model=model)
        .data[0]
        .embedding
    )
    print(f"{i}: {np.isclose(direct_output_1, direct_output_2).all()}")
```
```
0: True
1: True
2: True
3: True
4: False
5: True
6: True
7: True
8: True
9: True
```

See related discussion here:
https://community.openai.com/t/can-text-embedding-ada-002-be-made-deterministic/318054

Found the same result using `"text-embedding-3-small"`.
2024-11-21 10:23:10 -08:00
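
To account for this, the comparison is relaxed from element-wise equality to an absolute tolerance. Below is a minimal sketch of the relaxed check, reusing the document and model from the snippet above; the `atol=1e-3` value mirrors the assertion in the test file that follows.
```python
import numpy as np
import openai

documents = ["disallowed special token '<|endoftext|>'"]
model = "text-embedding-ada-002"

client = openai.OpenAI()
a = client.embeddings.create(input=documents, model=model).data[0].embedding
b = client.embeddings.create(input=documents, model=model).data[0].embedding

# Exact element-wise equality occasionally fails between two calls...
print(np.isclose(a, b).all())
# ...so the test asserts closeness within an absolute tolerance instead.
print(np.allclose(a, b, atol=1e-3))
```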


"""Test OpenAI embeddings."""
import numpy as np
import openai
from langchain_openai.embeddings.base import OpenAIEmbeddings
def test_langchain_openai_embedding_documents() -> None:
"""Test openai embeddings."""
documents = ["foo bar"]
embedding = OpenAIEmbeddings()
output = embedding.embed_documents(documents)
assert len(output) == 1
assert len(output[0]) > 0
def test_langchain_openai_embedding_query() -> None:
"""Test openai embeddings."""
document = "foo bar"
embedding = OpenAIEmbeddings()
output = embedding.embed_query(document)
assert len(output) > 0
def test_langchain_openai_embeddings_dimensions() -> None:
"""Test openai embeddings."""
documents = ["foo bar"]
embedding = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=128)
output = embedding.embed_documents(documents)
assert len(output) == 1
assert len(output[0]) == 128
def test_langchain_openai_embeddings_equivalent_to_raw() -> None:
documents = ["disallowed special token '<|endoftext|>'"]
embedding = OpenAIEmbeddings()
lc_output = embedding.embed_documents(documents)[0]
direct_output = (
openai.OpenAI()
.embeddings.create(input=documents, model=embedding.model)
.data[0]
.embedding
)
assert np.allclose(lc_output, direct_output, atol=0.001)
async def test_langchain_openai_embeddings_equivalent_to_raw_async() -> None:
documents = ["disallowed special token '<|endoftext|>'"]
embedding = OpenAIEmbeddings()
lc_output = (await embedding.aembed_documents(documents))[0]
client = openai.AsyncOpenAI()
direct_output = (
(await client.embeddings.create(input=documents, model=embedding.model))
.data[0]
.embedding
)
assert np.allclose(lc_output, direct_output, atol=0.001)
def test_langchain_openai_embeddings_dimensions_large_num() -> None:
"""Test openai embeddings."""
documents = [f"foo bar {i}" for i in range(2000)]
embedding = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=128)
output = embedding.embed_documents(documents)
assert len(output) == 2000
assert len(output[0]) == 128