Fix: Qdrant ids (#5515)

# Fix Qdrant ids creation

There has been a bug in how the ids were created in the Qdrant vector
store. They were previously calculated based on the texts. However,
there are some scenarios in which two documents may have the same piece
of text but different metadata, and that's a valid case. Deduplication
should be done outside of insertion.

It has been fixed and covered with the integration tests.
---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
This commit is contained in:
Kacper Łukawski
2023-06-02 17:57:34 +02:00
committed by GitHub
parent d1f65d8dc1
commit 71a7c16ee0
2 changed files with 161 additions and 56 deletions

View File

@@ -1,4 +1,5 @@
"""Test Qdrant functionality."""
import tempfile
from typing import Callable, Optional
import pytest
@@ -247,3 +248,91 @@ def test_qdrant_embedding_interface_raises(
embeddings=embeddings,
embedding_function=embedding_function,
)
def test_qdrant_stores_duplicated_texts() -> None:
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
client = QdrantClient(":memory:")
collection_name = "test"
client.recreate_collection(
collection_name,
vectors_config=rest.VectorParams(size=10, distance=rest.Distance.COSINE),
)
vec_store = Qdrant(
client,
collection_name,
embeddings=ConsistentFakeEmbeddings(),
)
ids = vec_store.add_texts(["abc", "abc"], [{"a": 1}, {"a": 2}])
assert 2 == len(set(ids))
assert 2 == client.count(collection_name).count
def test_qdrant_from_texts_stores_duplicated_texts() -> None:
from qdrant_client import QdrantClient
with tempfile.TemporaryDirectory() as tmpdir:
vec_store = Qdrant.from_texts(
["abc", "abc"],
ConsistentFakeEmbeddings(),
collection_name="test",
path=str(tmpdir),
)
del vec_store
client = QdrantClient(path=str(tmpdir))
assert 2 == client.count("test").count
@pytest.mark.parametrize("batch_size", [1, 64])
def test_qdrant_from_texts_stores_ids(batch_size: int) -> None:
from qdrant_client import QdrantClient
with tempfile.TemporaryDirectory() as tmpdir:
ids = [
"fa38d572-4c31-4579-aedc-1960d79df6df",
"cdc1aa36-d6ab-4fb2-8a94-56674fd27484",
]
vec_store = Qdrant.from_texts(
["abc", "def"],
ConsistentFakeEmbeddings(),
ids=ids,
collection_name="test",
path=str(tmpdir),
batch_size=batch_size,
)
del vec_store
client = QdrantClient(path=str(tmpdir))
assert 2 == client.count("test").count
stored_ids = [point.id for point in client.scroll("test")[0]]
assert set(ids) == set(stored_ids)
@pytest.mark.parametrize("batch_size", [1, 64])
def test_qdrant_add_texts_stores_ids(batch_size: int) -> None:
from qdrant_client import QdrantClient
ids = [
"fa38d572-4c31-4579-aedc-1960d79df6df",
"cdc1aa36-d6ab-4fb2-8a94-56674fd27484",
]
client = QdrantClient(":memory:")
collection_name = "test"
client.recreate_collection(
collection_name,
vectors_config=rest.VectorParams(size=10, distance=rest.Distance.COSINE),
)
vec_store = Qdrant(client, "test", ConsistentFakeEmbeddings())
returned_ids = vec_store.add_texts(["abc", "def"], ids=ids, batch_size=batch_size)
assert all(first == second for first, second in zip(ids, returned_ids))
assert 2 == client.count("test").count
stored_ids = [point.id for point in client.scroll("test")[0]]
assert set(ids) == set(stored_ids)