diff --git a/libs/partners/qdrant/tests/integration_tests/common.py b/libs/partners/qdrant/tests/integration_tests/common.py
index f1dcc53a264..3c06cc9019e 100644
--- a/libs/partners/qdrant/tests/integration_tests/common.py
+++ b/libs/partners/qdrant/tests/integration_tests/common.py
@@ -4,6 +4,8 @@ import requests  # type: ignore
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 
+from langchain_qdrant import SparseEmbeddings, SparseVector
+
 
 def qdrant_running_locally() -> bool:
     """Check if Qdrant is running at http://localhost:6333."""
@@ -55,3 +57,29 @@ class ConsistentFakeEmbeddings(Embeddings):
         """Return consistent embeddings for the text, if seen before, or a constant
         one if the text is unknown."""
         return self.embed_documents([text])[0]
+
+
+class ConsistentFakeSparseEmbeddings(SparseEmbeddings):
+    """Fake sparse embeddings which remember all the texts seen so far,
+    to return consistent vectors for the same texts."""
+
+    def __init__(self, dimensionality: int = 25) -> None:
+        self.known_texts: List[str] = []
+        self.dimensionality = dimensionality
+
+    def embed_documents(self, texts: List[str]) -> List[SparseVector]:
+        """Return consistent embeddings for each text seen so far."""
+        out_vectors = []
+        for text in texts:
+            if text not in self.known_texts:
+                self.known_texts.append(text)
+            index = self.known_texts.index(text)
+            indices = [i + index for i in range(self.dimensionality)]
+            values = [1.0] * (self.dimensionality - 1) + [float(index)]
+            out_vectors.append(SparseVector(indices=indices, values=values))
+        return out_vectors
+
+    def embed_query(self, text: str) -> SparseVector:
+        """Return consistent embeddings for the text if seen before,
+        or a constant one if the text is unknown."""
+        return self.embed_documents([text])[0]
diff --git a/libs/partners/qdrant/tests/integration_tests/conftest.py b/libs/partners/qdrant/tests/integration_tests/conftest.py
index b7830c1cd81..4788cf0843d 100644
--- a/libs/partners/qdrant/tests/integration_tests/conftest.py
+++ b/libs/partners/qdrant/tests/integration_tests/conftest.py
@@ -5,8 +5,8 @@ from qdrant_client import QdrantClient
 from tests.integration_tests.fixtures import qdrant_locations
 
 
-def pytest_sessionfinish() -> None:
-    """Clean up all collections after the test session."""
+def pytest_runtest_teardown() -> None:
+    """Clean up all collections after each test."""
     for location in qdrant_locations():
         client = QdrantClient(location=location, api_key=os.getenv("QDRANT_API_KEY"))
         collections = client.get_collections().collections
diff --git a/libs/partners/qdrant/tests/integration_tests/fixtures.py b/libs/partners/qdrant/tests/integration_tests/fixtures.py
index 50819050dee..7a92abe475d 100644
--- a/libs/partners/qdrant/tests/integration_tests/fixtures.py
+++ b/libs/partners/qdrant/tests/integration_tests/fixtures.py
@@ -2,6 +2,7 @@ import logging
 import os
 from typing import List
 
+from langchain_qdrant.qdrant import RetrievalMode
 from tests.integration_tests.common import qdrant_running_locally
 
 logger = logging.getLogger(__name__)
@@ -23,3 +24,20 @@ def qdrant_locations(use_in_memory: bool = True) -> List[str]:
         locations.append(qdrant_url)
 
     return locations
+
+
+def retrieval_modes(
+    *, dense: bool = True, sparse: bool = True, hybrid: bool = True
+) -> List[RetrievalMode]:
+    modes = []
+
+    if dense:
+        modes.append(RetrievalMode.DENSE)
+
+    if sparse:
+        modes.append(RetrievalMode.SPARSE)
+
+    if hybrid:
+        modes.append(RetrievalMode.HYBRID)
+
+    return modes
diff --git
a/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_add_texts.py b/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_add_texts.py new file mode 100644 index 00000000000..08c006a596f --- /dev/null +++ b/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_add_texts.py @@ -0,0 +1,143 @@ +import uuid +from typing import List, Union + +import pytest +from langchain_core.documents import Document +from qdrant_client import QdrantClient, models + +from langchain_qdrant import QdrantVectorStore, RetrievalMode +from tests.integration_tests.common import ( + ConsistentFakeEmbeddings, + ConsistentFakeSparseEmbeddings, + assert_documents_equals, +) +from tests.integration_tests.fixtures import qdrant_locations, retrieval_modes + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +@pytest.mark.parametrize( + "sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"] +) +def test_qdrant_add_documents_extends_existing_collection( + location: str, + vector_name: str, + retrieval_mode: RetrievalMode, + sparse_vector_name: str, +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = QdrantVectorStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + new_texts = ["foobar", "foobaz"] + docsearch.add_documents([Document(page_content=content) for content in new_texts]) + output = docsearch.similarity_search("foobar", k=1) + assert_documents_equals(output, [Document(page_content="foobar")]) + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +@pytest.mark.parametrize( + "sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"] +) +@pytest.mark.parametrize("batch_size", [1, 64]) +def test_qdrant_add_texts_returns_all_ids( + location: str, + vector_name: str, + retrieval_mode: RetrievalMode, + sparse_vector_name: str, + batch_size: int, +) -> None: + """Test end to end Qdrant.add_texts returns unique ids.""" + docsearch = QdrantVectorStore.from_texts( + ["foobar"], + ConsistentFakeEmbeddings(), + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + batch_size=batch_size, + ) + + ids = docsearch.add_texts(["foo", "bar", "baz"]) + assert 3 == len(ids) + assert 3 == len(set(ids)) + assert 3 == len(docsearch.get_by_ids(ids)) + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +def test_qdrant_add_texts_stores_duplicated_texts( + location: str, + vector_name: str, +) -> None: + """Test end to end Qdrant.add_texts stores duplicated texts separately.""" + + client = QdrantClient(location) + collection_name = uuid.uuid4().hex + vectors_config = { + vector_name: models.VectorParams(size=10, distance=models.Distance.COSINE) + } + client.recreate_collection(collection_name, vectors_config=vectors_config) + + vec_store = QdrantVectorStore( + client, + collection_name, + embedding=ConsistentFakeEmbeddings(), + vector_name=vector_name, + ) + ids = 
vec_store.add_texts(["abc", "abc"], [{"a": 1}, {"a": 2}]) + + assert 2 == len(set(ids)) + assert 2 == client.count(collection_name).count + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +@pytest.mark.parametrize( + "sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"] +) +@pytest.mark.parametrize("batch_size", [1, 64]) +def test_qdrant_add_texts_stores_ids( + location: str, + vector_name: str, + retrieval_mode: RetrievalMode, + sparse_vector_name: str, + batch_size: int, +) -> None: + """Test end to end Qdrant.add_texts stores provided ids.""" + ids: List[Union[str, int]] = [ + "fa38d572-4c31-4579-aedc-1960d79df6df", + 432, + 432145435, + ] + collection_name = uuid.uuid4().hex + vec_store = QdrantVectorStore.from_texts( + ["abc", "def", "ghi"], + ConsistentFakeEmbeddings(), + ids=ids, + collection_name=collection_name, + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + batch_size=batch_size, + ) + + assert 3 == vec_store.client.count(collection_name).count + stored_ids = [point.id for point in vec_store.client.scroll(collection_name)[0]] + assert set(ids) == set(stored_ids) + assert 3 == len(vec_store.get_by_ids(ids)) diff --git a/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_from_existing.py b/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_from_existing.py new file mode 100644 index 00000000000..4e4a78d517c --- /dev/null +++ b/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_from_existing.py @@ -0,0 +1,51 @@ +import uuid + +import pytest + +from langchain_qdrant.qdrant import QdrantVectorStore, RetrievalMode +from tests.integration_tests.common import ( + ConsistentFakeEmbeddings, + ConsistentFakeSparseEmbeddings, +) +from tests.integration_tests.fixtures import qdrant_locations, retrieval_modes + + +@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False)) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +@pytest.mark.parametrize( + "sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"] +) +def test_qdrant_from_existing_collection_uses_same_collection( + location: str, + vector_name: str, + retrieval_mode: RetrievalMode, + sparse_vector_name: str, +) -> None: + """Test if the QdrantVectorStore.from_existing_collection reuses the collection.""" + + collection_name = uuid.uuid4().hex + docs = ["foo"] + QdrantVectorStore.from_texts( + docs, + embedding=ConsistentFakeEmbeddings(), + collection_name=collection_name, + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + qdrant = QdrantVectorStore.from_existing_collection( + collection_name, + embedding=ConsistentFakeEmbeddings(), + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + qdrant.add_texts(["baz", "bar"]) + + assert 3 == qdrant.client.count(collection_name).count diff --git a/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_from_texts.py 
b/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_from_texts.py new file mode 100644 index 00000000000..65f76dc9ebe --- /dev/null +++ b/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_from_texts.py @@ -0,0 +1,385 @@ +import uuid +from typing import List, Union + +import pytest +from langchain_core.documents import Document +from qdrant_client import models + +from langchain_qdrant import QdrantVectorStore, RetrievalMode +from langchain_qdrant.qdrant import QdrantVectorStoreError +from tests.integration_tests.common import ( + ConsistentFakeEmbeddings, + ConsistentFakeSparseEmbeddings, + assert_documents_equals, +) +from tests.integration_tests.fixtures import qdrant_locations, retrieval_modes + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +def test_vectorstore_from_texts(location: str, retrieval_mode: RetrievalMode) -> None: + """Test end to end Qdrant.from_texts stores texts.""" + collection_name = uuid.uuid4().hex + + vec_store = QdrantVectorStore.from_texts( + ["Lorem ipsum dolor sit amet", "Ipsum dolor sit amet"], + ConsistentFakeEmbeddings(), + collection_name=collection_name, + location=location, + retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + assert 2 == vec_store.client.count(collection_name).count + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize( + "sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"] +) +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +def test_qdrant_from_texts_stores_ids( + batch_size: int, + vector_name: str, + sparse_vector_name: str, + location: str, + retrieval_mode: RetrievalMode, +) -> None: + """Test end to end Qdrant.from_texts stores provided ids.""" + collection_name = uuid.uuid4().hex + ids: List[Union[str, int]] = [ + "fa38d572-4c31-4579-aedc-1960d79df6df", + 786, + ] + vec_store = QdrantVectorStore.from_texts( + ["abc", "def"], + ConsistentFakeEmbeddings(), + ids=ids, + collection_name=collection_name, + location=location, + retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + batch_size=batch_size, + vector_name=vector_name, + sparse_vector_name=sparse_vector_name, + ) + + assert 2 == vec_store.client.count(collection_name).count + stored_ids = [point.id for point in vec_store.client.retrieve(collection_name, ids)] + assert set(ids) == set(stored_ids) + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize( + "sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"] +) +def test_qdrant_from_texts_stores_embeddings_as_named_vectors( + location: str, + retrieval_mode: RetrievalMode, + vector_name: str, + sparse_vector_name: str, +) -> None: + """Test end to end Qdrant.from_texts stores named vectors if name is provided.""" + + collection_name = uuid.uuid4().hex + vec_store = QdrantVectorStore.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(), + collection_name=collection_name, + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + assert 5 == 
vec_store.client.count(collection_name).count + if retrieval_mode in retrieval_modes(sparse=False): + assert all( + (vector_name in point.vector or isinstance(point.vector, list)) # type: ignore + for point in vec_store.client.scroll(collection_name, with_vectors=True)[0] + ) + if retrieval_mode in retrieval_modes(dense=False): + assert all( + sparse_vector_name in point.vector # type: ignore + for point in vec_store.client.scroll(collection_name, with_vectors=True)[0] + ) + + +@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False)) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize( + "sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"] +) +def test_qdrant_from_texts_reuses_same_collection( + location: str, + retrieval_mode: RetrievalMode, + vector_name: str, + sparse_vector_name: str, +) -> None: + """Test if Qdrant.from_texts reuses the same collection""" + collection_name = uuid.uuid4().hex + embeddings = ConsistentFakeEmbeddings() + sparse_embeddings = ConsistentFakeSparseEmbeddings() + vec_store = QdrantVectorStore.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + embeddings, + collection_name=collection_name, + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=sparse_embeddings, + ) + del vec_store + + vec_store = QdrantVectorStore.from_texts( + ["foo", "bar"], + embeddings, + collection_name=collection_name, + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=sparse_embeddings, + ) + + assert 7 == vec_store.client.count(collection_name).count + + +@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False)) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes(sparse=False)) +def test_qdrant_from_texts_raises_error_on_different_dimensionality( + location: str, + vector_name: str, + retrieval_mode: RetrievalMode, +) -> None: + """Test if Qdrant.from_texts raises an exception if dimensionality does not match""" + collection_name = uuid.uuid4().hex + QdrantVectorStore.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + with pytest.raises(QdrantVectorStoreError) as excinfo: + QdrantVectorStore.from_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=5), + collection_name=collection_name, + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + expected_message = "collection is configured for dense vectors " + "with 10 dimensions. 
Selected embeddings are 5-dimensional" + assert expected_message in str(excinfo.value) + + +@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False)) +@pytest.mark.parametrize( + ["first_vector_name", "second_vector_name"], + [ + ("", "custom-vector"), + ("custom-vector", ""), + ("my-first-vector", "my-second_vector"), + ], +) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes(sparse=False)) +def test_qdrant_from_texts_raises_error_on_different_vector_name( + location: str, + first_vector_name: str, + second_vector_name: str, + retrieval_mode: RetrievalMode, +) -> None: + """Test if Qdrant.from_texts raises an exception if vector name does not match""" + collection_name = uuid.uuid4().hex + QdrantVectorStore.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + location=location, + vector_name=first_vector_name, + retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + with pytest.raises(QdrantVectorStoreError) as excinfo: + QdrantVectorStore.from_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + location=location, + vector_name=second_vector_name, + retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + expected_message = "does not contain dense vector named" + assert expected_message in str(excinfo.value) + + +@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False)) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes(sparse=False)) +def test_qdrant_from_texts_raises_error_on_different_distance( + location: str, vector_name: str, retrieval_mode: RetrievalMode +) -> None: + """Test if Qdrant.from_texts raises an exception if distance does not match""" + collection_name = uuid.uuid4().hex + QdrantVectorStore.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(), + collection_name=collection_name, + location=location, + vector_name=vector_name, + distance=models.Distance.COSINE, + retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + with pytest.raises(QdrantVectorStoreError) as excinfo: + QdrantVectorStore.from_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(), + collection_name=collection_name, + location=location, + vector_name=vector_name, + distance=models.Distance.EUCLID, + retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + expected_message = "configured for COSINE similarity, but requested EUCLID" + assert expected_message in str(excinfo.value) + + +@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False)) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +@pytest.mark.parametrize( + "sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"] +) +def test_qdrant_from_texts_recreates_collection_on_force_recreate( + location: str, + vector_name: str, + retrieval_mode: RetrievalMode, + sparse_vector_name: str, +) -> None: + collection_name = uuid.uuid4().hex + vec_store = QdrantVectorStore.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + 
sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + vec_store = QdrantVectorStore.from_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=5), + collection_name=collection_name, + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + force_recreate=True, + ) + + assert 2 == vec_store.client.count(collection_name).count + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("content_payload_key", [QdrantVectorStore.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize( + "metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "bar"] +) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +@pytest.mark.parametrize( + "sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"] +) +def test_qdrant_from_texts_stores_metadatas( + location: str, + content_payload_key: str, + metadata_payload_key: str, + vector_name: str, + retrieval_mode: RetrievalMode, + sparse_vector_name: str, +) -> None: + """Test end to end construction and search.""" + texts = ["fabrin", "barizda"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = QdrantVectorStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=location, + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + output = docsearch.similarity_search("fabrin", k=1) + assert_documents_equals( + output, [Document(page_content="fabrin", metadata={"page": 0})] + ) + + +@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False)) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes(sparse=False)) +@pytest.mark.parametrize( + "sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"] +) +def test_from_texts_passed_optimizers_config_and_on_disk_payload( + location: str, + vector_name: str, + retrieval_mode: RetrievalMode, + sparse_vector_name: str, +) -> None: + collection_name = uuid.uuid4().hex + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + optimizers_config = models.OptimizersConfigDiff(memmap_threshold=1000) + vec_store = QdrantVectorStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + collection_create_options={ + "on_disk_payload": True, + "optimizers_config": optimizers_config, + }, + vector_params={ + "on_disk": True, + }, + collection_name=collection_name, + location=location, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_vector_name=sparse_vector_name, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + collection_info = vec_store.client.get_collection(collection_name) + assert collection_info.config.params.vectors[vector_name].on_disk is True # type: ignore + assert collection_info.config.optimizer_config.memmap_threshold == 1000 + assert collection_info.config.params.on_disk_payload is True diff --git a/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_mmr.py b/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_mmr.py new file mode 100644 index 00000000000..78f784a2c47 --- /dev/null +++ 
b/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_mmr.py
@@ -0,0 +1,116 @@
+import pytest  # type: ignore[import-not-found]
+from langchain_core.documents import Document
+from qdrant_client import models
+
+from langchain_qdrant import QdrantVectorStore, RetrievalMode
+from langchain_qdrant.qdrant import QdrantVectorStoreError
+from tests.integration_tests.common import (
+    ConsistentFakeEmbeddings,
+    ConsistentFakeSparseEmbeddings,
+    assert_documents_equals,
+)
+from tests.integration_tests.fixtures import qdrant_locations, retrieval_modes
+
+
+# MMR is supported when dense embeddings are available
+# i.e. in Dense and Hybrid retrieval modes
+@pytest.mark.parametrize("location", qdrant_locations())
+@pytest.mark.parametrize(
+    "content_payload_key", [QdrantVectorStore.CONTENT_KEY, "test_content"]
+)
+@pytest.mark.parametrize(
+    "metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "test_metadata"]
+)
+@pytest.mark.parametrize("retrieval_mode", retrieval_modes(sparse=False))
+@pytest.mark.parametrize("vector_name", ["", "my-vector"])
+def test_qdrant_mmr_search(
+    location: str,
+    content_payload_key: str,
+    metadata_payload_key: str,
+    retrieval_mode: RetrievalMode,
+    vector_name: str,
+) -> None:
+    """Test end to end construction and MMR search."""
+    filter = models.Filter(
+        must=[
+            models.FieldCondition(
+                key=f"{metadata_payload_key}.page",
+                match=models.MatchValue(
+                    value=2,
+                ),
+            ),
+        ],
+    )
+
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = QdrantVectorStore.from_texts(
+        texts,
+        ConsistentFakeEmbeddings(),
+        metadatas=metadatas,
+        content_payload_key=content_payload_key,
+        metadata_payload_key=metadata_payload_key,
+        location=location,
+        retrieval_mode=retrieval_mode,
+        vector_name=vector_name,
+        distance=models.Distance.EUCLID,
+        sparse_embedding=ConsistentFakeSparseEmbeddings(),
+    )
+    output = docsearch.max_marginal_relevance_search(
+        "foo", k=2, fetch_k=3, lambda_mult=0.0
+    )
+    assert_documents_equals(
+        output,
+        [
+            Document(page_content="foo", metadata={"page": 0}),
+            Document(page_content="baz", metadata={"page": 2}),
+        ],
+    )
+
+    output = docsearch.max_marginal_relevance_search(
+        "foo", k=2, fetch_k=3, lambda_mult=0.0, filter=filter
+    )
+    assert_documents_equals(
+        output,
+        [Document(page_content="baz", metadata={"page": 2})],
+    )
+
+
+# MMR shouldn't work with only sparse retrieval mode
+@pytest.mark.parametrize("location", qdrant_locations())
+@pytest.mark.parametrize(
+    "content_payload_key", [QdrantVectorStore.CONTENT_KEY, "test_content"]
+)
+@pytest.mark.parametrize(
+    "metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "test_metadata"]
+)
+@pytest.mark.parametrize("retrieval_mode", retrieval_modes(dense=False, hybrid=False))
+@pytest.mark.parametrize("vector_name", ["", "my-vector"])
+def test_invalid_qdrant_mmr_with_sparse(
+    location: str,
+    content_payload_key: str,
+    metadata_payload_key: str,
+    retrieval_mode: RetrievalMode,
+    vector_name: str,
+) -> None:
+    """Test that MMR search raises an exception in sparse-only retrieval mode."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = QdrantVectorStore.from_texts(
+        texts,
+        ConsistentFakeEmbeddings(),
+        metadatas=metadatas,
+        content_payload_key=content_payload_key,
+        metadata_payload_key=metadata_payload_key,
+        location=location,
+        retrieval_mode=retrieval_mode,
+        vector_name=vector_name,
+        distance=models.Distance.EUCLID,
+        sparse_embedding=ConsistentFakeSparseEmbeddings(),
+    )
+
+    with
pytest.raises(QdrantVectorStoreError) as excinfo: + docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3, lambda_mult=0.0) + + expected_message = "does not contain dense vector named" + assert expected_message in str(excinfo.value) diff --git a/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_search.py b/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_search.py new file mode 100644 index 00000000000..9525c5c17d1 --- /dev/null +++ b/libs/partners/qdrant/tests/integration_tests/qdrant_vector_store/test_search.py @@ -0,0 +1,278 @@ +import pytest +from langchain_core.documents import Document +from qdrant_client import models + +from langchain_qdrant import QdrantVectorStore, RetrievalMode +from tests.integration_tests.common import ( + ConsistentFakeEmbeddings, + ConsistentFakeSparseEmbeddings, + assert_documents_equals, +) +from tests.integration_tests.fixtures import qdrant_locations, retrieval_modes + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +@pytest.mark.parametrize("batch_size", [1, 64]) +def test_similarity_search( + location: str, + vector_name: str, + retrieval_mode: RetrievalMode, + batch_size: int, +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = QdrantVectorStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=location, + batch_size=batch_size, + vector_name=vector_name, + retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + output = docsearch.similarity_search("foo", k=1) + assert_documents_equals(actual=output, expected=[Document(page_content="foo")]) + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("content_payload_key", [QdrantVectorStore.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize( + "metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "bar"] +) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +@pytest.mark.parametrize("batch_size", [1, 64]) +def test_similarity_search_by_vector( + location: str, + content_payload_key: str, + metadata_payload_key: str, + vector_name: str, + batch_size: int, +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = QdrantVectorStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=location, + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + vector_name=vector_name, + ) + embeddings = ConsistentFakeEmbeddings().embed_query("foo") + output = docsearch.similarity_search_by_vector(embeddings, k=1) + assert_documents_equals(output, [Document(page_content="foo")]) + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize( + "metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "bar"] +) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +def test_similarity_search_filters( + location: str, + metadata_payload_key: str, + retrieval_mode: RetrievalMode, +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = QdrantVectorStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=location, + metadata_payload_key=metadata_payload_key, + 
retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + qdrant_filter = models.Filter( + must=[ + models.FieldCondition( + key=f"{metadata_payload_key}.page", match=models.MatchValue(value=1) + ) + ] + ) + output = docsearch.similarity_search("foo", k=1, filter=qdrant_filter) + + assert_documents_equals( + actual=output, + expected=[ + Document( + page_content="bar", + metadata={"page": 1, "metadata": {"page": 2, "pages": [3, -1]}}, + ) + ], + ) + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +def test_similarity_relevance_search_no_threshold( + location: str, + vector_name: str, +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = QdrantVectorStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=location, + vector_name=vector_name, + ) + output = docsearch.similarity_search_with_relevance_scores( + "foo", k=3, score_threshold=None + ) + assert len(output) == 3 + for i in range(len(output)): + assert round(output[i][1], 2) >= 0 + assert round(output[i][1], 2) <= 1 + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +def test_relevance_search_with_threshold( + location: str, + vector_name: str, +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = QdrantVectorStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=location, + vector_name=vector_name, + ) + + score_threshold = 0.99 + kwargs = {"score_threshold": score_threshold} + output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs) + assert len(output) == 1 + assert all([score >= score_threshold for _, score in output]) + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("content_payload_key", [QdrantVectorStore.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize( + "metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "bar"] +) +@pytest.mark.parametrize("vector_name", ["", "my-vector"]) +def test_relevance_search_with_threshold_and_filter( + location: str, + content_payload_key: str, + metadata_payload_key: str, + vector_name: str, +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = QdrantVectorStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=location, + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + vector_name=vector_name, + ) + score_threshold = 0.99 # for almost exact match + negative_filter = models.Filter( + must=[ + models.FieldCondition( + key=f"{metadata_payload_key}.page", match=models.MatchValue(value=1) + ) + ] + ) + kwargs = {"filter": negative_filter, "score_threshold": score_threshold} + output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs) + assert len(output) == 0 + positive_filter = models.Filter( + must=[ + models.FieldCondition( + key=f"{metadata_payload_key}.page", match=models.MatchValue(value=0) + ) + ] + ) 
+ kwargs = {"filter": positive_filter, "score_threshold": score_threshold} + output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs) + assert len(output) == 1 + assert all([score >= score_threshold for _, score in output]) + + +@pytest.mark.parametrize("location", qdrant_locations()) +@pytest.mark.parametrize("content_payload_key", [QdrantVectorStore.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize( + "metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "bar"] +) +@pytest.mark.parametrize("retrieval_mode", retrieval_modes()) +def test_similarity_search_filters_with_qdrant_filters( + location: str, + content_payload_key: str, + metadata_payload_key: str, + retrieval_mode: RetrievalMode, +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "details": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = QdrantVectorStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=location, + metadatas=metadatas, + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + retrieval_mode=retrieval_mode, + sparse_embedding=ConsistentFakeSparseEmbeddings(), + ) + + qdrant_filter = models.Filter( + must=[ + models.FieldCondition( + key=content_payload_key, match=models.MatchValue(value="bar") + ), + models.FieldCondition( + key=f"{metadata_payload_key}.page", + match=models.MatchValue(value=1), + ), + models.FieldCondition( + key=f"{metadata_payload_key}.details.page", + match=models.MatchValue(value=2), + ), + models.FieldCondition( + key=f"{metadata_payload_key}.details.pages", + match=models.MatchAny(any=[3]), + ), + ] + ) + output = docsearch.similarity_search("foo", k=1, filter=qdrant_filter) + assert_documents_equals( + actual=output, + expected=[ + Document( + page_content="bar", + metadata={"page": 1, "details": {"page": 2, "pages": [3, -1]}}, + ) + ], + )