Reuse the existing collection if configured properly in Qdrant.from_texts (#7530)

This PR changes the behavior of `Qdrant.from_texts` so the collection is
reused if not requested to recreate it. Previously, calling
`Qdrant.from_texts` or `Qdrant.from_documents` resulted in removing the
old data which was confusing for many.
This commit is contained in:
Kacper Łukawski
2023-07-11 22:24:35 +02:00
committed by GitHub
parent 6674b33cf5
commit 1f83b5f47e
4 changed files with 989 additions and 804 deletions

View File

@@ -27,8 +27,9 @@ class ConsistentFakeEmbeddings(FakeEmbeddings):
"""Fake embeddings which remember all the texts seen so far to return consistent
vectors for the same texts."""
def __init__(self) -> None:
def __init__(self, dimensionality: int = 10) -> None:
self.known_texts: List[str] = []
self.dimensionality = dimensionality
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Return consistent embeddings for each text seen so far."""
@@ -36,7 +37,9 @@ class ConsistentFakeEmbeddings(FakeEmbeddings):
for text in texts:
if text not in self.known_texts:
self.known_texts.append(text)
vector = [float(1.0)] * 9 + [float(self.known_texts.index(text))]
vector = [float(1.0)] * (self.dimensionality - 1) + [
float(self.known_texts.index(text))
]
out_vectors.append(vector)
return out_vectors
@@ -44,8 +47,10 @@ class ConsistentFakeEmbeddings(FakeEmbeddings):
"""Return consistent embeddings for the text, if seen before, or a constant
one if the text is unknown."""
if text not in self.known_texts:
return [float(1.0)] * 9 + [float(0.0)]
return [float(1.0)] * 9 + [float(self.known_texts.index(text))]
return [float(1.0)] * (self.dimensionality - 1) + [float(0.0)]
return [float(1.0)] * (self.dimensionality - 1) + [
float(self.known_texts.index(text))
]
class AngularTwoDimensionalEmbeddings(Embeddings):

View File

@@ -9,6 +9,7 @@ from qdrant_client.http import models as rest
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Qdrant
from langchain.vectorstores.qdrant import QdrantException
from tests.integration_tests.vectorstores.fake_embeddings import (
ConsistentFakeEmbeddings,
)
@@ -537,3 +538,148 @@ def test_qdrant_similarity_search_with_relevance_scores(
assert all(
(1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output
)
@pytest.mark.parametrize("vector_name", [None, "custom-vector"])
def test_qdrant_from_texts_reuses_same_collection(vector_name: Optional[str]) -> None:
"""Test if Qdrant.from_texts reuses the same collection"""
from qdrant_client import QdrantClient
collection_name = "test"
embeddings = ConsistentFakeEmbeddings()
with tempfile.TemporaryDirectory() as tmpdir:
vec_store = Qdrant.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
embeddings,
collection_name=collection_name,
path=str(tmpdir),
vector_name=vector_name,
)
del vec_store
vec_store = Qdrant.from_texts(
["foo", "bar"],
embeddings,
collection_name=collection_name,
path=str(tmpdir),
vector_name=vector_name,
)
del vec_store
client = QdrantClient(path=str(tmpdir))
assert 7 == client.count(collection_name).count
@pytest.mark.parametrize("vector_name", [None, "custom-vector"])
def test_qdrant_from_texts_raises_error_on_different_dimensionality(
vector_name: Optional[str],
) -> None:
"""Test if Qdrant.from_texts raises an exception if dimensionality does not match"""
collection_name = "test"
with tempfile.TemporaryDirectory() as tmpdir:
vec_store = Qdrant.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
ConsistentFakeEmbeddings(dimensionality=10),
collection_name=collection_name,
path=str(tmpdir),
vector_name=vector_name,
)
del vec_store
with pytest.raises(QdrantException):
Qdrant.from_texts(
["foo", "bar"],
ConsistentFakeEmbeddings(dimensionality=5),
collection_name=collection_name,
path=str(tmpdir),
vector_name=vector_name,
)
@pytest.mark.parametrize(
["first_vector_name", "second_vector_name"],
[
(None, "custom-vector"),
("custom-vector", None),
("my-first-vector", "my-second_vector"),
],
)
def test_qdrant_from_texts_raises_error_on_different_vector_name(
first_vector_name: Optional[str],
second_vector_name: Optional[str],
) -> None:
"""Test if Qdrant.from_texts raises an exception if vector name does not match"""
collection_name = "test"
with tempfile.TemporaryDirectory() as tmpdir:
vec_store = Qdrant.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
ConsistentFakeEmbeddings(dimensionality=10),
collection_name=collection_name,
path=str(tmpdir),
vector_name=first_vector_name,
)
del vec_store
with pytest.raises(QdrantException):
Qdrant.from_texts(
["foo", "bar"],
ConsistentFakeEmbeddings(dimensionality=5),
collection_name=collection_name,
path=str(tmpdir),
vector_name=second_vector_name,
)
def test_qdrant_from_texts_raises_error_on_different_distance() -> None:
"""Test if Qdrant.from_texts raises an exception if distance does not match"""
collection_name = "test"
with tempfile.TemporaryDirectory() as tmpdir:
vec_store = Qdrant.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
ConsistentFakeEmbeddings(dimensionality=10),
collection_name=collection_name,
path=str(tmpdir),
distance_func="Cosine",
)
del vec_store
with pytest.raises(QdrantException):
Qdrant.from_texts(
["foo", "bar"],
ConsistentFakeEmbeddings(dimensionality=5),
collection_name=collection_name,
path=str(tmpdir),
distance_func="Euclid",
)
@pytest.mark.parametrize("vector_name", [None, "custom-vector"])
def test_qdrant_from_texts_recreates_collection_on_force_recreate(
vector_name: Optional[str],
) -> None:
"""Test if Qdrant.from_texts recreates the collection even if config mismatches"""
from qdrant_client import QdrantClient
collection_name = "test"
with tempfile.TemporaryDirectory() as tmpdir:
vec_store = Qdrant.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
ConsistentFakeEmbeddings(dimensionality=10),
collection_name=collection_name,
path=str(tmpdir),
vector_name=vector_name,
)
del vec_store
vec_store = Qdrant.from_texts(
["foo", "bar"],
ConsistentFakeEmbeddings(dimensionality=5),
collection_name=collection_name,
path=str(tmpdir),
vector_name=vector_name,
force_recreate=True,
)
del vec_store
client = QdrantClient(path=str(tmpdir))
assert 2 == client.count(collection_name).count