From 1ff5b6702524f732129551bf527ec40108ad2188 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Sat, 15 Jul 2023 15:33:26 +0200 Subject: [PATCH] Implement async API for Qdrant vector store (#7704) Inspired by #5550, I implemented full async API support in Qdrant. The docs were extended to mention the existence of asynchronous operations in Langchain. I also used that chance to restructure the tests of Qdrant and provided a suite of tests for the async version. Async API requires the GRPC protocol to be enabled. Thus, it doesn't work on local mode yet, but we're considering including the support to be consistent. --- .../data_connection/vectorstores/index.mdx | 8 + .../data_connection/vectorstores/async.mdx | 89 ++ langchain/vectorstores/qdrant.py | 850 ++++++++++++++++-- poetry.lock | 33 +- pyproject.toml | 2 +- .../vectorstores/qdrant/__init__.py | 0 .../vectorstores/qdrant/async/__init__.py | 0 .../vectorstores/qdrant/async/common.py | 10 + .../qdrant/async/test_add_texts.py | 121 +++ .../qdrant/async/test_from_texts.py | 247 +++++ .../async/test_max_marginal_relevance.py | 46 + .../qdrant/async/test_similarity_search.py | 286 ++++++ .../vectorstores/qdrant/fixtures.py | 0 .../vectorstores/qdrant/test_add_texts.py | 132 +++ .../vectorstores/qdrant/test_delete.py | 1 + .../qdrant/test_embedding_interface.py | 59 ++ .../vectorstores/qdrant/test_from_texts.py | 252 ++++++ .../qdrant/test_max_marginal_relevance.py | 40 + .../qdrant/test_similarity_search.py | 275 ++++++ .../vectorstores/test_qdrant.py | 685 -------------- 20 files changed, 2361 insertions(+), 775 deletions(-) create mode 100644 docs/snippets/modules/data_connection/vectorstores/async.mdx create mode 100644 tests/integration_tests/vectorstores/qdrant/__init__.py create mode 100644 tests/integration_tests/vectorstores/qdrant/async/__init__.py create mode 100644 tests/integration_tests/vectorstores/qdrant/async/common.py create mode 100644 
tests/integration_tests/vectorstores/qdrant/async/test_add_texts.py create mode 100644 tests/integration_tests/vectorstores/qdrant/async/test_from_texts.py create mode 100644 tests/integration_tests/vectorstores/qdrant/async/test_max_marginal_relevance.py create mode 100644 tests/integration_tests/vectorstores/qdrant/async/test_similarity_search.py create mode 100644 tests/integration_tests/vectorstores/qdrant/fixtures.py create mode 100644 tests/integration_tests/vectorstores/qdrant/test_add_texts.py create mode 100644 tests/integration_tests/vectorstores/qdrant/test_delete.py create mode 100644 tests/integration_tests/vectorstores/qdrant/test_embedding_interface.py create mode 100644 tests/integration_tests/vectorstores/qdrant/test_from_texts.py create mode 100644 tests/integration_tests/vectorstores/qdrant/test_max_marginal_relevance.py create mode 100644 tests/integration_tests/vectorstores/qdrant/test_similarity_search.py delete mode 100644 tests/integration_tests/vectorstores/test_qdrant.py diff --git a/docs/docs_skeleton/docs/modules/data_connection/vectorstores/index.mdx b/docs/docs_skeleton/docs/modules/data_connection/vectorstores/index.mdx index 3618605db67..442c1ca77a6 100644 --- a/docs/docs_skeleton/docs/modules/data_connection/vectorstores/index.mdx +++ b/docs/docs_skeleton/docs/modules/data_connection/vectorstores/index.mdx @@ -15,3 +15,11 @@ This walkthrough showcases basic functionality related to VectorStores. A key pa import GetStarted from "@snippets/modules/data_connection/vectorstores/get_started.mdx" + +## Asynchronous operations + +Vector stores are usually run as a separate service that requires some IO operations, and therefore they might be called asynchronously. That gives performance benefits as you don't waste time waiting for responses from external services. That might also be important if you work with an asynchronous framework, such as [FastAPI](https://fastapi.tiangolo.com/). 
+ +import AsyncVectorStore from "@snippets/modules/data_connection/vectorstores/async.mdx" + + \ No newline at end of file diff --git a/docs/snippets/modules/data_connection/vectorstores/async.mdx b/docs/snippets/modules/data_connection/vectorstores/async.mdx new file mode 100644 index 00000000000..0463bbe931f --- /dev/null +++ b/docs/snippets/modules/data_connection/vectorstores/async.mdx @@ -0,0 +1,89 @@ +LangChain supports async operations on vector stores. All the methods can be called using their async counterparts, which carry the prefix `a` (for `async`). + +`Qdrant` is a vector store that supports all the async operations, so it is used in this walkthrough. + +```bash +pip install qdrant-client +``` + +```python +from langchain.vectorstores import Qdrant +``` + +### Create a vector store asynchronously + +```python +db = await Qdrant.afrom_documents(documents, embeddings, url="http://localhost:6333") +``` + +### Similarity search + +```python +query = "What did the president say about Ketanji Brown Jackson" +docs = await db.asimilarity_search(query) +print(docs[0].page_content) +``` + + + +``` + Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. + + Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. + + One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. + + And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 
+``` + + + +### Similarity search by vector + +```python +embedding_vector = embeddings.embed_query(query) +docs = await db.asimilarity_search_by_vector(embedding_vector) +``` + +## Maximum marginal relevance search (MMR) + +Maximal marginal relevance optimizes for similarity to the query AND diversity among selected documents. It is also supported in the async API. + +```python +query = "What did the president say about Ketanji Brown Jackson" +found_docs = await db.amax_marginal_relevance_search(query, k=2, fetch_k=10) +for i, doc in enumerate(found_docs): + print(f"{i + 1}.", doc.page_content, "\n") +``` + + + +``` +1. Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. + +Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. + +One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. + +And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. + +2. We can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. + +I recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. + +They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. + +Officer Mora was 27 years old. + +Officer Rivera was 22. + +Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. 
+ +I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. + +I’ve worked on these issues a long time. + +I know what works: Investing in crime preventionand community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety. +``` + + diff --git a/langchain/vectorstores/qdrant.py b/langchain/vectorstores/qdrant.py index 0271396b187..7832c8adf3f 100644 --- a/langchain/vectorstores/qdrant.py +++ b/langchain/vectorstores/qdrant.py @@ -10,6 +10,7 @@ from typing import ( Any, Callable, Dict, + Generator, Iterable, List, Optional, @@ -27,6 +28,7 @@ from langchain.vectorstores import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance if TYPE_CHECKING: + from qdrant_client import grpc # noqa from qdrant_client.conversions import common_types from qdrant_client.http import models as rest @@ -142,37 +144,54 @@ class Qdrant(VectorStore): Returns: List of ids from adding the texts into the vectorstore. """ - from qdrant_client.http import models as rest + added_ids = [] + for batch_ids, points in self._generate_rest_batches( + texts, metadatas, ids, batch_size + ): + self.client.upsert( + collection_name=self.collection_name, + points=points, + ) + added_ids.extend(batch_ids) + + return added_ids + + async def aadd_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[Sequence[str]] = None, + batch_size: int = 64, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + ids: + Optional list of ids to associate with the texts. Ids have to be + uuid-like strings. + batch_size: + How many vectors upload per-request. 
+ Default: 64 + + Returns: + List of ids from adding the texts into the vectorstore. + """ + from qdrant_client import grpc # noqa + from qdrant_client.conversions.conversion import RestToGrpc added_ids = [] - texts_iterator = iter(texts) - metadatas_iterator = iter(metadatas or []) - ids_iterator = iter(ids or [uuid.uuid4().hex for _ in iter(texts)]) - while batch_texts := list(islice(texts_iterator, batch_size)): - # Take the corresponding metadata and id for each text in a batch - batch_metadatas = list(islice(metadatas_iterator, batch_size)) or None - batch_ids = list(islice(ids_iterator, batch_size)) - - # Generate the embeddings for all the texts in a batch - batch_embeddings = self._embed_texts(batch_texts) - if self.vector_name is not None: - batch_embeddings = { # type: ignore[assignment] - self.vector_name: batch_embeddings - } - - points = rest.Batch.construct( - ids=batch_ids, - vectors=batch_embeddings, - payloads=self._build_payloads( - batch_texts, - batch_metadatas, - self.content_payload_key, - self.metadata_payload_key, - ), + for batch_ids, points in self._generate_rest_batches( + texts, metadatas, ids, batch_size + ): + await self.client.async_grpc_points.Upsert( + grpc.UpsertPoints( + collection_name=self.collection_name, + points=[RestToGrpc.convert_point_struct(point) for point in points], + ) ) - - self.client.upsert(collection_name=self.collection_name, points=points) - added_ids.extend(batch_ids) return added_ids @@ -232,6 +251,24 @@ class Qdrant(VectorStore): ) return list(map(itemgetter(0), results)) + async def asimilarity_search( + self, + query: str, + k: int = 4, + filter: Optional[MetadataFilter] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to query. + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Filter by metadata. Defaults to None. + Returns: + List of Documents most similar to the query. 
+ """ + results = await self.asimilarity_search_with_score(query, k, filter) + return list(map(itemgetter(0), results)) + def similarity_search_with_score( self, query: str, @@ -273,9 +310,7 @@ class Qdrant(VectorStore): - 'all' - query all replicas, and return values present in all replicas Returns: - List of documents most similar to the query text and cosine - distance in float for each. - Lower score represents more similarity. + List of documents most similar to the query text and distance for each. """ return self.similarity_search_with_score_by_vector( self._embed_query(query), @@ -288,6 +323,60 @@ class Qdrant(VectorStore): **kwargs, ) + async def asimilarity_search_with_score( + self, + query: str, + k: int = 4, + filter: Optional[MetadataFilter] = None, + search_params: Optional[common_types.SearchParams] = None, + offset: int = 0, + score_threshold: Optional[float] = None, + consistency: Optional[common_types.ReadConsistency] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Filter by metadata. Defaults to None. + search_params: Additional search params + offset: + Offset of the first result to return. + May be used to paginate results. + Note: large offset values may cause performance issues. + score_threshold: + Define a minimal score threshold for the result. + If defined, less similar results will not be returned. + Score of the returned result might be higher or smaller than the + threshold depending on the Distance function used. + E.g. for cosine similarity only higher scores will be returned. + consistency: + Read consistency of the search. Defines how many replicas should be + queried before returning the result. 
+ Values: + - int - number of replicas to query, values should present in all + queried replicas + - 'majority' - query all replicas, but return values present in the + majority of replicas + - 'quorum' - query the majority of replicas, return values present in + all of them + - 'all' - query all replicas, and return values present in all replicas + + Returns: + List of documents most similar to the query text and distance for each. + """ + return await self.asimilarity_search_with_score_by_vector( + self._embed_query(query), + k, + filter=filter, + search_params=search_params, + offset=offset, + score_threshold=score_threshold, + consistency=consistency, + **kwargs, + ) + def similarity_search_by_vector( self, embedding: List[float], @@ -343,6 +432,61 @@ class Qdrant(VectorStore): ) return list(map(itemgetter(0), results)) + async def asimilarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[MetadataFilter] = None, + search_params: Optional[common_types.SearchParams] = None, + offset: int = 0, + score_threshold: Optional[float] = None, + consistency: Optional[common_types.ReadConsistency] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding vector to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Filter by metadata. Defaults to None. + search_params: Additional search params + offset: + Offset of the first result to return. + May be used to paginate results. + Note: large offset values may cause performance issues. + score_threshold: + Define a minimal score threshold for the result. + If defined, less similar results will not be returned. + Score of the returned result might be higher or smaller than the + threshold depending on the Distance function used. + E.g. for cosine similarity only higher scores will be returned. + consistency: + Read consistency of the search. 
Defines how many replicas should be + queried before returning the result. + Values: + - int - number of replicas to query, values should present in all + queried replicas + - 'majority' - query all replicas, but return values present in the + majority of replicas + - 'quorum' - query the majority of replicas, return values present in + all of them + - 'all' - query all replicas, and return values present in all replicas + + Returns: + List of Documents most similar to the query. + """ + results = await self.asimilarity_search_with_score_by_vector( + embedding, + k, + filter=filter, + search_params=search_params, + offset=offset, + score_threshold=score_threshold, + consistency=consistency, + **kwargs, + ) + return list(map(itemgetter(0), results)) + def similarity_search_with_score_by_vector( self, embedding: List[float], @@ -384,9 +528,7 @@ class Qdrant(VectorStore): - 'all' - query all replicas, and return values present in all replicas Returns: - List of documents most similar to the query text and cosine - distance in float for each. - Lower score represents more similarity. + List of documents most similar to the query text and distance for each. """ if filter is not None and isinstance(filter, dict): warnings.warn( @@ -426,49 +568,93 @@ class Qdrant(VectorStore): for result in results ] - def _select_relevance_score_fn(self) -> Callable[[float], float]: - """ - The 'correct' relevance function - may differ depending on a few things, including: - - the distance / similarity metric used by the VectorStore - - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) - - embedding dimensionality - - etc. 
- """ - - if self.distance_strategy == "COSINE": - return self._cosine_relevance_score_fn - elif self.distance_strategy == "DOT": - return self._max_inner_product_relevance_score_fn - elif self.distance_strategy == "EUCLID": - return self._euclidean_relevance_score_fn - else: - raise ValueError( - "Unknown distance strategy, must be cosine, " - "max_inner_product, or euclidean" - ) - - def _similarity_search_with_relevance_scores( + async def asimilarity_search_with_score_by_vector( self, - query: str, + embedding: List[float], k: int = 4, + filter: Optional[MetadataFilter] = None, + search_params: Optional[common_types.SearchParams] = None, + offset: int = 0, + score_threshold: Optional[float] = None, + consistency: Optional[common_types.ReadConsistency] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores in the range [0, 1]. - - 0 is dissimilar, 1 is most similar. + """Return docs most similar to embedding vector. Args: - query: input text + embedding: Embedding vector to look up documents similar to. k: Number of Documents to return. Defaults to 4. - **kwargs: kwargs to be passed to similarity search. Should include: - score_threshold: Optional, a floating point value between 0 to 1 to - filter the resulting set of retrieved docs + filter: Filter by metadata. Defaults to None. + search_params: Additional search params + offset: + Offset of the first result to return. + May be used to paginate results. + Note: large offset values may cause performance issues. + score_threshold: + Define a minimal score threshold for the result. + If defined, less similar results will not be returned. + Score of the returned result might be higher or smaller than the + threshold depending on the Distance function used. + E.g. for cosine similarity only higher scores will be returned. + consistency: + Read consistency of the search. Defines how many replicas should be + queried before returning the result. 
+ Values: + - int - number of replicas to query, values should present in all + queried replicas + - 'majority' - query all replicas, but return values present in the + majority of replicas + - 'quorum' - query the majority of replicas, return values present in + all of them + - 'all' - query all replicas, and return values present in all replicas Returns: - List of Tuples of (doc, similarity_score) + List of documents most similar to the query text and distance for each. """ - return self.similarity_search_with_score(query, k, **kwargs) + from qdrant_client import grpc # noqa + from qdrant_client.conversions.conversion import RestToGrpc + from qdrant_client.http import models as rest + + if filter is not None and isinstance(filter, dict): + warnings.warn( + "Using dict as a `filter` is deprecated. Please use qdrant-client " + "filters directly: " + "https://qdrant.tech/documentation/concepts/filtering/", + DeprecationWarning, + ) + qdrant_filter = self._qdrant_filter_from_dict(filter) + else: + qdrant_filter = filter + + if qdrant_filter is not None and isinstance(qdrant_filter, rest.Filter): + qdrant_filter = RestToGrpc.convert_filter(qdrant_filter) + + response = await self.client.async_grpc_points.Search( + grpc.SearchPoints( + collection_name=self.collection_name, + vector_name=self.vector_name, + vector=embedding, + filter=qdrant_filter, + params=search_params, + limit=k, + offset=offset, + with_payload=grpc.WithPayloadSelector(enable=True), + with_vectors=grpc.WithVectorsSelector(enable=False), + score_threshold=score_threshold, + read_consistency=consistency, + **kwargs, + ) + ) + + return [ + ( + self._document_from_scored_point_grpc( + result, self.content_payload_key, self.metadata_payload_key + ), + result.score, + ) + for result in response.result + ] def max_marginal_relevance_search( self, @@ -496,7 +682,65 @@ class Qdrant(VectorStore): List of Documents selected by maximal marginal relevance. 
""" query_embedding = self._embed_query(query) - query_vector = query_embedding + return self.max_marginal_relevance_search_by_vector( + query_embedding, k, fetch_k, lambda_mult, **kwargs + ) + + async def amax_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + Defaults to 20. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. + """ + query_embedding = self._embed_query(query) + return await self.amax_marginal_relevance_search_by_vector( + query_embedding, k, fetch_k, lambda_mult, **kwargs + ) + + def max_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. 
+ """ + query_vector = embedding if self.vector_name is not None: query_vector = (self.vector_name, query_vector) # type: ignore[assignment] @@ -514,7 +758,7 @@ class Qdrant(VectorStore): for result in results ] mmr_selected = maximal_marginal_relevance( - np.array(query_embedding), embeddings, k=k, lambda_mult=lambda_mult + np.array(embedding), embeddings, k=k, lambda_mult=lambda_mult ) return [ self._document_from_scored_point( @@ -523,6 +767,118 @@ class Qdrant(VectorStore): for i in mmr_selected ] + async def amax_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + Defaults to 20. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance (documents only, + without scores). + """ + results = await self.amax_marginal_relevance_search_with_score_by_vector( + embedding, k, fetch_k, lambda_mult, **kwargs + ) + return list(map(itemgetter(0), results)) + + async def amax_marginal_relevance_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs selected using the maximal marginal relevance. + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. 
Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + Defaults to 20. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance and distance for + each. + """ + from qdrant_client import grpc # noqa + from qdrant_client.conversions.conversion import GrpcToRest + + response = await self.client.async_grpc_points.Search( + grpc.SearchPoints( + collection_name=self.collection_name, + vector_name=self.vector_name, + vector=embedding, + with_payload=grpc.WithPayloadSelector(enable=True), + with_vectors=grpc.WithVectorsSelector(enable=True), + limit=fetch_k, + ) + ) + results = [ + GrpcToRest.convert_vectors(result.vectors) for result in response.result + ] + embeddings: List[List[float]] = [ + result.get(self.vector_name) # type: ignore + if isinstance(result, dict) + else result + for result in results + ] + mmr_selected: List[int] = maximal_marginal_relevance( + np.array(embedding), + embeddings, + k=k, + lambda_mult=lambda_mult, + ) + return [ + ( + self._document_from_scored_point_grpc( + response.result[i], + self.content_payload_key, + self.metadata_payload_key, + ), + response.result[i].score, + ) + for i in mmr_selected + ] + + def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: + """Delete by vector ID or other criteria. + + Args: + ids: List of ids to delete. + **kwargs: Other keyword arguments that subclasses might use. + + Returns: + Optional[bool]: True if deletion is successful, + False otherwise, None if not implemented. 
+ """ + from qdrant_client.http import models as rest + + result = self.client.delete( + collection_name=self.collection_name, + points_selector=ids, + ) + return result.status == rest.UpdateStatus.COMPLETED + @classmethod def from_texts( cls: Type[Qdrant], @@ -662,6 +1018,255 @@ class Qdrant(VectorStore): embeddings = OpenAIEmbeddings() qdrant = Qdrant.from_texts(texts, embeddings, "localhost") """ + qdrant = cls._construct_instance( + texts, + embedding, + metadatas, + ids, + location, + url, + port, + grpc_port, + prefer_grpc, + https, + api_key, + prefix, + timeout, + host, + path, + collection_name, + distance_func, + content_payload_key, + metadata_payload_key, + vector_name, + batch_size, + shard_number, + replication_factor, + write_consistency_factor, + on_disk_payload, + hnsw_config, + optimizers_config, + wal_config, + quantization_config, + init_from, + force_recreate, + **kwargs, + ) + qdrant.add_texts(texts, metadatas, ids, batch_size) + return qdrant + + @classmethod + async def afrom_texts( + cls: Type[Qdrant], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + ids: Optional[Sequence[str]] = None, + location: Optional[str] = None, + url: Optional[str] = None, + port: Optional[int] = 6333, + grpc_port: int = 6334, + prefer_grpc: bool = False, + https: Optional[bool] = None, + api_key: Optional[str] = None, + prefix: Optional[str] = None, + timeout: Optional[float] = None, + host: Optional[str] = None, + path: Optional[str] = None, + collection_name: Optional[str] = None, + distance_func: str = "Cosine", + content_payload_key: str = CONTENT_KEY, + metadata_payload_key: str = METADATA_KEY, + vector_name: Optional[str] = VECTOR_NAME, + batch_size: int = 64, + shard_number: Optional[int] = None, + replication_factor: Optional[int] = None, + write_consistency_factor: Optional[int] = None, + on_disk_payload: Optional[bool] = None, + hnsw_config: Optional[common_types.HnswConfigDiff] = None, + optimizers_config: 
Optional[common_types.OptimizersConfigDiff] = None, + wal_config: Optional[common_types.WalConfigDiff] = None, + quantization_config: Optional[common_types.QuantizationConfig] = None, + init_from: Optional[common_types.InitFrom] = None, + force_recreate: bool = False, + **kwargs: Any, + ) -> Qdrant: + """Construct Qdrant wrapper from a list of texts. + + Args: + texts: A list of texts to be indexed in Qdrant. + embedding: A subclass of `Embeddings`, responsible for text vectorization. + metadatas: + An optional list of metadata. If provided it has to be of the same + length as a list of texts. + ids: + Optional list of ids to associate with the texts. Ids have to be + uuid-like strings. + location: + If `:memory:` - use in-memory Qdrant instance. + If `str` - use it as a `url` parameter. + If `None` - fallback to relying on `host` and `port` parameters. + url: either host or str of "Optional[scheme], host, Optional[port], + Optional[prefix]". Default: `None` + port: Port of the REST API interface. Default: 6333 + grpc_port: Port of the gRPC interface. Default: 6334 + prefer_grpc: + If true - use gRPC interface whenever possible in custom methods. + Default: False + https: If true - use HTTPS(SSL) protocol. Default: None + api_key: API key for authentication in Qdrant Cloud. Default: None + prefix: + If not None - add prefix to the REST URL path. + Example: service/v1 will result in + http://localhost:6333/service/v1/{qdrant-endpoint} for REST API. + Default: None + timeout: + Timeout for REST and gRPC API requests. + Default: 5.0 seconds for REST and unlimited for gRPC + host: + Host name of Qdrant service. If url and host are None, set to + 'localhost'. Default: None + path: + Path in which the vectors will be stored while using local mode. + Default: None + collection_name: + Name of the Qdrant collection to be used. If not provided, + it will be created randomly. Default: None + distance_func: + Distance function. One of: "Cosine" / "Euclid" / "Dot". 
+ Default: "Cosine" + content_payload_key: + A payload key used to store the content of the document. + Default: "page_content" + metadata_payload_key: + A payload key used to store the metadata of the document. + Default: "metadata" + vector_name: + Name of the vector to be used internally in Qdrant. + Default: None + batch_size: + How many vectors upload per-request. + Default: 64 + shard_number: Number of shards in collection. Default is 1, minimum is 1. + replication_factor: + Replication factor for collection. Default is 1, minimum is 1. + Defines how many copies of each shard will be created. + Have effect only in distributed mode. + write_consistency_factor: + Write consistency factor for collection. Default is 1, minimum is 1. + Defines how many replicas should apply the operation for us to consider + it successful. Increasing this number will make the collection more + resilient to inconsistencies, but will also make it fail if not enough + replicas are available. + Does not have any performance impact. + Have effect only in distributed mode. + on_disk_payload: + If true - point`s payload will not be stored in memory. + It will be read from the disk every time it is requested. + This setting saves RAM by (slightly) increasing the response time. + Note: those payload values that are involved in filtering and are + indexed - remain in RAM. + hnsw_config: Params for HNSW index + optimizers_config: Params for optimizer + wal_config: Params for Write-Ahead-Log + quantization_config: + Params for quantization, if None - quantization will be disabled + init_from: + Use data stored in another collection to initialize this collection + force_recreate: + Force recreating the collection + **kwargs: + Additional arguments passed directly into REST client initialization + + This is a user-friendly interface that: + 1. Creates embeddings, one for each text + 2. 
Initializes the Qdrant database as an in-memory docstore by default + (and overridable to a remote docstore) + 3. Adds the text embeddings to the Qdrant database + + This is intended to be a quick way to get started. + + Example: + .. code-block:: python + + from langchain import Qdrant + from langchain.embeddings import OpenAIEmbeddings + embeddings = OpenAIEmbeddings() + qdrant = await Qdrant.afrom_texts(texts, embeddings, "localhost") + """ + qdrant = cls._construct_instance( + texts, + embedding, + metadatas, + ids, + location, + url, + port, + grpc_port, + prefer_grpc, + https, + api_key, + prefix, + timeout, + host, + path, + collection_name, + distance_func, + content_payload_key, + metadata_payload_key, + vector_name, + batch_size, + shard_number, + replication_factor, + write_consistency_factor, + on_disk_payload, + hnsw_config, + optimizers_config, + wal_config, + quantization_config, + init_from, + force_recreate, + **kwargs, + ) + await qdrant.aadd_texts(texts, metadatas, ids, batch_size) + return qdrant + + @classmethod + def _construct_instance( + cls: Type[Qdrant], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + ids: Optional[Sequence[str]] = None, + location: Optional[str] = None, + url: Optional[str] = None, + port: Optional[int] = 6333, + grpc_port: int = 6334, + prefer_grpc: bool = False, + https: Optional[bool] = None, + api_key: Optional[str] = None, + prefix: Optional[str] = None, + timeout: Optional[float] = None, + host: Optional[str] = None, + path: Optional[str] = None, + collection_name: Optional[str] = None, + distance_func: str = "Cosine", + content_payload_key: str = CONTENT_KEY, + metadata_payload_key: str = METADATA_KEY, + vector_name: Optional[str] = VECTOR_NAME, + batch_size: int = 64, + shard_number: Optional[int] = None, + replication_factor: Optional[int] = None, + write_consistency_factor: Optional[int] = None, + on_disk_payload: Optional[bool] = None, + hnsw_config: 
Optional[common_types.HnswConfigDiff] = None, + optimizers_config: Optional[common_types.OptimizersConfigDiff] = None, + wal_config: Optional[common_types.WalConfigDiff] = None, + quantization_config: Optional[common_types.QuantizationConfig] = None, + init_from: Optional[common_types.InitFrom] = None, + force_recreate: bool = False, + **kwargs: Any, + ) -> Qdrant: try: import qdrant_client except ImportError: @@ -669,7 +1274,6 @@ class Qdrant(VectorStore): "Could not import qdrant-client python package. " "Please install it with `pip install qdrant-client`." ) - from grpc import RpcError from qdrant_client.http import models as rest from qdrant_client.http.exceptions import UnexpectedResponse @@ -677,10 +1281,8 @@ class Qdrant(VectorStore): # Just do a single quick embedding to get vector size partial_embeddings = embedding.embed_documents(texts[:1]) vector_size = len(partial_embeddings[0]) - collection_name = collection_name or uuid.uuid4().hex distance_func = distance_func.upper() - client = qdrant_client.QdrantClient( location=location, url=url, @@ -695,7 +1297,6 @@ class Qdrant(VectorStore): path=path, **kwargs, ) - try: # Skip any validation in case of forced collection recreate. if force_recreate: @@ -786,7 +1387,6 @@ class Qdrant(VectorStore): init_from=init_from, timeout=timeout, # type: ignore[arg-type] ) - qdrant = cls( client=client, collection_name=collection_name, @@ -796,9 +1396,52 @@ class Qdrant(VectorStore): distance_strategy=distance_func, vector_name=vector_name, ) - qdrant.add_texts(texts, metadatas, ids, batch_size) return qdrant + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. 
+ """ + + if self.distance_strategy == "COSINE": + return self._cosine_relevance_score_fn + elif self.distance_strategy == "DOT": + return self._max_inner_product_relevance_score_fn + elif self.distance_strategy == "EUCLID": + return self._euclidean_relevance_score_fn + else: + raise ValueError( + "Unknown distance strategy, must be cosine, " + "max_inner_product, or euclidean" + ) + + def _similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and relevance scores in the range [0, 1]. + + 0 is dissimilar, 1 is most similar. + + Args: + query: input text + k: Number of Documents to return. Defaults to 4. + **kwargs: kwargs to be passed to similarity search. Should include: + score_threshold: Optional, a floating point value between 0 to 1 to + filter the resulting set of retrieved docs + + Returns: + List of Tuples of (doc, similarity_score) + """ + return self.similarity_search_with_score(query, k, **kwargs) + @classmethod def _build_payloads( cls, @@ -836,6 +1479,21 @@ class Qdrant(VectorStore): metadata=scored_point.payload.get(metadata_payload_key) or {}, ) + @classmethod + def _document_from_scored_point_grpc( + cls, + scored_point: Any, + content_payload_key: str, + metadata_payload_key: str, + ) -> Document: + from qdrant_client.conversions.conversion import grpc_to_payload + + payload = grpc_to_payload(scored_point.payload) + return Document( + page_content=payload[content_payload_key], + metadata=payload.get(metadata_payload_key) or {}, + ) + def _build_condition(self, key: str, value: Any) -> List[rest.FieldCondition]: from qdrant_client.http import models as rest @@ -922,3 +1580,45 @@ class Qdrant(VectorStore): raise ValueError("Neither of embeddings or embedding_function is set") return embeddings + + def _generate_rest_batches( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[Sequence[str]] = None, + batch_size: int = 
64, + ) -> Generator[Tuple[List[str], List[rest.PointStruct]], None, None]: + from qdrant_client.http import models as rest + + texts_iterator = iter(texts) + metadatas_iterator = iter(metadatas or []) + ids_iterator = iter(ids or [uuid.uuid4().hex for _ in iter(texts)]) + while batch_texts := list(islice(texts_iterator, batch_size)): + # Take the corresponding metadata and id for each text in a batch + batch_metadatas = list(islice(metadatas_iterator, batch_size)) or None + batch_ids = list(islice(ids_iterator, batch_size)) + + # Generate the embeddings for all the texts in a batch + batch_embeddings = self._embed_texts(batch_texts) + + points = [ + rest.PointStruct( + id=point_id, + vector=vector + if self.vector_name is None + else {self.vector_name: vector}, + payload=payload, + ) + for point_id, vector, payload in zip( + batch_ids, + batch_embeddings, + self._build_payloads( + batch_texts, + batch_metadatas, + self.content_payload_key, + self.metadata_payload_key, + ), + ) + ] + + yield batch_ids, points diff --git a/poetry.lock b/poetry.lock index de94226cb45..7ba1681d6da 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. 
[[package]] name = "absl-py" @@ -641,12 +641,16 @@ category = "main" optional = true python-versions = ">=3.7" files = [ + {file = "awadb-0.3.6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:d90318d2d388aa1bb740b0b7e641cb7da00e6ab5700ce97564163c88a1927ed4"}, {file = "awadb-0.3.6-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6154f73aab9996aefe8c8f8bf754f7182d109d6b60302c9f31666c7f50cc7aca"}, {file = "awadb-0.3.6-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:9d7e9dff353517595ecc8c9395a2367acdcfc83c68a64dd4785c8d366eed3f40"}, + {file = "awadb-0.3.6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f6d10d1e885fa1d64eeb8ffda2de470c3a7508d57a9489213b8649bcddcd31e"}, {file = "awadb-0.3.6-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:475af75d2ffbbe970999d93fbabdf7281797390c66fe852f6a6989e706b90c94"}, {file = "awadb-0.3.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:304be1de63daec1555f0fe9de9a18cdf16a467687a35a6ccf3405cd400fefb48"}, {file = "awadb-0.3.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:176cc27d1afc4aad758515d5f8fb435f555c9ba827a9e84d6f28b1c6ac568965"}, + {file = "awadb-0.3.6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:36138b754c990143d0314fd7a9293c96f7ba549860244bda728e3f51b73e0f6e"}, {file = "awadb-0.3.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:211d7f6b0f7c3c3d7518d424f0f3dfac5f45f9e5d7bbf397fdae861ff0dc46fd"}, + {file = "awadb-0.3.6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:b1f9e9a7ba2fa58bce55fcca784d5b3e159712962aaee2156f6317c5993f4277"}, {file = "awadb-0.3.6-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:b935ab4ffaa3bcbcc9a381fce91ace5940143b527ffdb467dd4bc630cd94afab"}, ] @@ -4378,6 +4382,7 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"}, + {file = "jsonpointer-2.4.tar.gz", hash = 
"sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"}, ] [[package]] @@ -9031,14 +9036,14 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""} [[package]] name = "qdrant-client" -version = "1.1.7" +version = "1.3.1" description = "Client library for the Qdrant vector search engine" category = "main" optional = true python-versions = ">=3.7,<3.12" files = [ - {file = "qdrant_client-1.1.7-py3-none-any.whl", hash = "sha256:4f5d883660b8193840d8982919ab813a0470ace9a7ff46ee730f909841be5319"}, - {file = "qdrant_client-1.1.7.tar.gz", hash = "sha256:686d86934bec2ebb70676fc0650c9a44a9e552e0149124ca5a22ee8533879deb"}, + {file = "qdrant_client-1.3.1-py3-none-any.whl", hash = "sha256:9640855585d1f532094e342f07e0f2ef00652a60fc5d903c92ca3989a1e86318"}, + {file = "qdrant_client-1.3.1.tar.gz", hash = "sha256:a999358b10e611d71b4b04c6ded36a6cfc963e56b4c3f99d9c1a603ca524a82e"}, ] [package.dependencies] @@ -9048,7 +9053,7 @@ httpx = {version = ">=0.14.0", extras = ["http2"]} numpy = {version = ">=1.21", markers = "python_version >= \"3.8\""} portalocker = ">=2.7.0,<3.0.0" pydantic = ">=1.8,<2.0" -typing-extensions = ">=4.0.0,<5.0.0" +typing-extensions = ">=4.0.0,<4.6.0" urllib3 = ">=1.26.14,<2.0.0" [[package]] @@ -11440,7 +11445,7 @@ files = [ ] [package.dependencies] -accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\" or extra == \"torch\""} +accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\""} filelock = "*" huggingface-hub = ">=0.14.1,<1.0" numpy = ">=1.17" @@ -11642,14 +11647,14 @@ files = [ [[package]] name = "typing-extensions" -version = "4.6.3" +version = "4.5.0" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.6.3-py3-none-any.whl", hash = "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26"}, - {file = 
"typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"}, + {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, + {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, ] [[package]] @@ -12697,15 +12702,15 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "libdeeplake", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "octoai-sdk", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "spacy", "steamship", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] -azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"] +all = ["anthropic", "clarifai", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", 
"transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "marqo", "pymongo", "weaviate-client", "redis", "google-api-python-client", "google-auth", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "libdeeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "langkit", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml", "requests-toolbelt", "neo4j", "openlm", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "momento", "singlestoredb", "tigrisdb", "nebula3-python", "awadb", "esprima", "octoai-sdk", "rdflib"] +azure = ["azure-identity", "azure-cosmos", "openai", "azure-core", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-search-documents"] clarifai = ["clarifai"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "zep-python"] +extended-testing = ["beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "jq", "pdfminer-six", "pgvector", "pypdf", "pymupdf", "pypdfium2", "tqdm", "lxml", "atlassian-python-api", "beautifulsoup4", "pandas", "telethon", "psychicapi", "zep-python", "gql", "requests-toolbelt", "html2text", "py-trello", "scikit-learn", "streamlit", "pyspark", "openai", "sympy", "rapidfuzz"] javascript = ["esprima"] -llms = ["anthropic", "clarifai", 
"cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers"] +llms = ["anthropic", "clarifai", "cohere", "openai", "openllm", "openlm", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] text-helpers = ["chardet"] @@ -12713,4 +12718,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "f8f94ad19dd8f96637f6ffe64401b780ea9e7985543a7c9da31c41c55e94ab0f" +content-hash = "7c3eeaa43dead997a66d01a3ba3799656d216d20011329f6a14fcf653cc658b7" diff --git a/pyproject.toml b/pyproject.toml index 56628cb22c1..1e750b88e44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ google-api-python-client = {version = "2.70.0", optional = true} google-auth = {version = "^2.18.1", optional = true} wolframalpha = {version = "5.0.0", optional = true} anthropic = {version = "^0.3", optional = true} -qdrant-client = {version = "^1.1.2", optional = true, python = ">=3.8.1,<3.12"} +qdrant-client = {version = "^1.3.1", optional = true, python = ">=3.8.1,<3.12"} dataclasses-json = "^0.5.7" tensorflow-text = {version = "^2.11.0", optional = true, python = "^3.10, <3.12"} tenacity = "^8.1.0" diff --git a/tests/integration_tests/vectorstores/qdrant/__init__.py b/tests/integration_tests/vectorstores/qdrant/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration_tests/vectorstores/qdrant/async/__init__.py b/tests/integration_tests/vectorstores/qdrant/async/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration_tests/vectorstores/qdrant/async/common.py b/tests/integration_tests/vectorstores/qdrant/async/common.py new file mode 100644 index 00000000000..065dddd0a6b --- /dev/null +++ b/tests/integration_tests/vectorstores/qdrant/async/common.py @@ -0,0 +1,10 @@ +def qdrant_is_not_running() -> bool: + """Check if Qdrant is not 
running.""" + import requests + + try: + response = requests.get("http://localhost:6333", timeout=10.0) + response_json = response.json() + return response_json.get("title") != "qdrant - vector search engine" + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): + return True diff --git a/tests/integration_tests/vectorstores/qdrant/async/test_add_texts.py b/tests/integration_tests/vectorstores/qdrant/async/test_add_texts.py new file mode 100644 index 00000000000..ddc7ccef857 --- /dev/null +++ b/tests/integration_tests/vectorstores/qdrant/async/test_add_texts.py @@ -0,0 +1,121 @@ +from typing import Optional + +import pytest +from qdrant_client.http import models as rest + +from langchain.vectorstores import Qdrant +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, +) + +from .common import qdrant_is_not_running + +# Skipping all the tests in the module if Qdrant is not running on localhost. +pytestmark = pytest.mark.skipif( + qdrant_is_not_running(), reason="Qdrant server is not running" +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("batch_size", [1, 64]) +async def test_qdrant_aadd_texts_returns_all_ids(batch_size: int) -> None: + """Test end to end Qdrant.aadd_texts returns unique ids.""" + docsearch: Qdrant = Qdrant.from_texts( + ["foobar"], + ConsistentFakeEmbeddings(), + batch_size=batch_size, + ) + + ids = await docsearch.aadd_texts(["foo", "bar", "baz"]) + assert 3 == len(ids) + assert 3 == len(set(ids)) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_qdrant_aadd_texts_stores_duplicated_texts( + vector_name: Optional[str], +) -> None: + """Test end to end Qdrant.aadd_texts stores duplicated texts separately.""" + from qdrant_client import QdrantClient + from qdrant_client.http import models as rest + + client = QdrantClient() + collection_name = "test" + vectors_config = rest.VectorParams(size=10, distance=rest.Distance.COSINE) 
+ if vector_name is not None: + vectors_config = {vector_name: vectors_config} # type: ignore[assignment] + client.recreate_collection(collection_name, vectors_config=vectors_config) + + vec_store = Qdrant( + client, + collection_name, + embeddings=ConsistentFakeEmbeddings(), + vector_name=vector_name, + ) + ids = await vec_store.aadd_texts(["abc", "abc"], [{"a": 1}, {"a": 2}]) + + assert 2 == len(set(ids)) + assert 2 == client.count(collection_name).count + + +@pytest.mark.asyncio +@pytest.mark.parametrize("batch_size", [1, 64]) +async def test_qdrant_aadd_texts_stores_ids(batch_size: int) -> None: + """Test end to end Qdrant.aadd_texts stores provided ids.""" + from qdrant_client import QdrantClient + + ids = [ + "fa38d572-4c31-4579-aedc-1960d79df6df", + "cdc1aa36-d6ab-4fb2-8a94-56674fd27484", + ] + + client = QdrantClient() + collection_name = "test" + client.recreate_collection( + collection_name, + vectors_config=rest.VectorParams(size=10, distance=rest.Distance.COSINE), + ) + + vec_store = Qdrant(client, collection_name, ConsistentFakeEmbeddings()) + returned_ids = await vec_store.aadd_texts( + ["abc", "def"], ids=ids, batch_size=batch_size + ) + + assert all(first == second for first, second in zip(ids, returned_ids)) + assert 2 == client.count(collection_name).count + stored_ids = [point.id for point in client.scroll(collection_name)[0]] + assert set(ids) == set(stored_ids) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("vector_name", ["custom-vector"]) +async def test_qdrant_aadd_texts_stores_embeddings_as_named_vectors( + vector_name: str, +) -> None: + """Test end to end Qdrant.aadd_texts stores named vectors if name is provided.""" + from qdrant_client import QdrantClient + + collection_name = "test" + + client = QdrantClient() + client.recreate_collection( + collection_name, + vectors_config={ + vector_name: rest.VectorParams(size=10, distance=rest.Distance.COSINE) + }, + ) + + vec_store = Qdrant( + client, + collection_name, + 
ConsistentFakeEmbeddings(), + vector_name=vector_name, + ) + await vec_store.aadd_texts(["lorem", "ipsum", "dolor", "sit", "amet"]) + + assert 5 == client.count(collection_name).count + assert all( + vector_name in point.vector # type: ignore[operator] + for point in client.scroll(collection_name, with_vectors=True)[0] + ) diff --git a/tests/integration_tests/vectorstores/qdrant/async/test_from_texts.py b/tests/integration_tests/vectorstores/qdrant/async/test_from_texts.py new file mode 100644 index 00000000000..4aac43e69fa --- /dev/null +++ b/tests/integration_tests/vectorstores/qdrant/async/test_from_texts.py @@ -0,0 +1,247 @@ +import uuid +from typing import Optional + +import pytest + +from langchain.schema import Document +from langchain.vectorstores import Qdrant +from langchain.vectorstores.qdrant import QdrantException +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, +) + +from .common import qdrant_is_not_running + +# Skipping all the tests in the module if Qdrant is not running on localhost. 
+pytestmark = pytest.mark.skipif( + qdrant_is_not_running(), reason="Qdrant server is not running" +) + + +@pytest.mark.asyncio +async def test_qdrant_from_texts_stores_duplicated_texts() -> None: + """Test end to end Qdrant.afrom_texts stores duplicated texts separately.""" + from qdrant_client import QdrantClient + + collection_name = uuid.uuid4().hex + + await Qdrant.afrom_texts( + ["abc", "abc"], + ConsistentFakeEmbeddings(), + collection_name=collection_name, + ) + + client = QdrantClient() + assert 2 == client.count(collection_name).count + + +@pytest.mark.asyncio +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_qdrant_from_texts_stores_ids( + batch_size: int, vector_name: Optional[str] +) -> None: + """Test end to end Qdrant.afrom_texts stores provided ids.""" + from qdrant_client import QdrantClient + + collection_name = uuid.uuid4().hex + ids = [ + "fa38d572-4c31-4579-aedc-1960d79df6df", + "cdc1aa36-d6ab-4fb2-8a94-56674fd27484", + ] + await Qdrant.afrom_texts( + ["abc", "def"], + ConsistentFakeEmbeddings(), + ids=ids, + collection_name=collection_name, + batch_size=batch_size, + vector_name=vector_name, + ) + + client = QdrantClient() + assert 2 == client.count(collection_name).count + stored_ids = [point.id for point in client.scroll(collection_name)[0]] + assert set(ids) == set(stored_ids) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("vector_name", ["custom-vector"]) +async def test_qdrant_from_texts_stores_embeddings_as_named_vectors( + vector_name: str, +) -> None: + """Test end to end Qdrant.afrom_texts stores named vectors if name is provided.""" + from qdrant_client import QdrantClient + + collection_name = uuid.uuid4().hex + + await Qdrant.afrom_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(), + collection_name=collection_name, + vector_name=vector_name, + ) + + client = QdrantClient() + assert 5 == 
client.count(collection_name).count + assert all( + vector_name in point.vector # type: ignore[operator] + for point in client.scroll(collection_name, with_vectors=True)[0] + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("vector_name", [None, "custom-vector"]) +async def test_qdrant_from_texts_reuses_same_collection( + vector_name: Optional[str], +) -> None: + """Test if Qdrant.afrom_texts reuses the same collection""" + from qdrant_client import QdrantClient + + collection_name = uuid.uuid4().hex + embeddings = ConsistentFakeEmbeddings() + + await Qdrant.afrom_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + embeddings, + collection_name=collection_name, + vector_name=vector_name, + ) + + await Qdrant.afrom_texts( + ["foo", "bar"], + embeddings, + collection_name=collection_name, + vector_name=vector_name, + ) + + client = QdrantClient() + assert 7 == client.count(collection_name).count + + +@pytest.mark.asyncio +@pytest.mark.parametrize("vector_name", [None, "custom-vector"]) +async def test_qdrant_from_texts_raises_error_on_different_dimensionality( + vector_name: Optional[str], +) -> None: + """Test if Qdrant.afrom_texts raises an exception if dimensionality does not + match""" + collection_name = uuid.uuid4().hex + + await Qdrant.afrom_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + vector_name=vector_name, + ) + + with pytest.raises(QdrantException): + await Qdrant.afrom_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=5), + collection_name=collection_name, + vector_name=vector_name, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ["first_vector_name", "second_vector_name"], + [ + (None, "custom-vector"), + ("custom-vector", None), + ("my-first-vector", "my-second_vector"), + ], +) +async def test_qdrant_from_texts_raises_error_on_different_vector_name( + first_vector_name: Optional[str], + second_vector_name: Optional[str], +) 
-> None: + """Test if Qdrant.afrom_texts raises an exception if vector name does not match""" + collection_name = uuid.uuid4().hex + + await Qdrant.afrom_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + vector_name=first_vector_name, + ) + + with pytest.raises(QdrantException): + await Qdrant.afrom_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=5), + collection_name=collection_name, + vector_name=second_vector_name, + ) + + +@pytest.mark.asyncio +async def test_qdrant_from_texts_raises_error_on_different_distance() -> None: + """Test if Qdrant.afrom_texts raises an exception if distance does not match""" + collection_name = uuid.uuid4().hex + + await Qdrant.afrom_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + distance_func="Cosine", + ) + + with pytest.raises(QdrantException): + await Qdrant.afrom_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=5), + collection_name=collection_name, + distance_func="Euclid", + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("vector_name", [None, "custom-vector"]) +async def test_qdrant_from_texts_recreates_collection_on_force_recreate( + vector_name: Optional[str], +) -> None: + """Test if Qdrant.afrom_texts recreates the collection even if config mismatches""" + from qdrant_client import QdrantClient + + collection_name = uuid.uuid4().hex + + await Qdrant.afrom_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + vector_name=vector_name, + ) + + await Qdrant.afrom_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=5), + collection_name=collection_name, + vector_name=vector_name, + force_recreate=True, + ) + + client = QdrantClient() + assert 2 == client.count(collection_name).count + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) +async def test_qdrant_from_texts_stores_metadatas( + batch_size: int, content_payload_key: str, metadata_payload_key: str +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = await Qdrant.afrom_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + ) + output = await docsearch.asimilarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": 0})] diff --git a/tests/integration_tests/vectorstores/qdrant/async/test_max_marginal_relevance.py b/tests/integration_tests/vectorstores/qdrant/async/test_max_marginal_relevance.py new file mode 100644 index 00000000000..b40f65e126a --- /dev/null +++ b/tests/integration_tests/vectorstores/qdrant/async/test_max_marginal_relevance.py @@ -0,0 +1,46 @@ +from typing import Optional + +import pytest + +from langchain.schema import Document +from langchain.vectorstores import Qdrant +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, +) + +from .common import qdrant_is_not_running + +# Skipping all the tests in the module if Qdrant is not running on localhost. 
+pytestmark = pytest.mark.skipif( + qdrant_is_not_running(), reason="Qdrant server is not running" +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "test_content"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "test_metadata"]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_qdrant_max_marginal_relevance_search( + batch_size: int, + content_payload_key: str, + metadata_payload_key: str, + vector_name: Optional[str], +) -> None: + """Test end to end construction and MMR search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + vector_name=vector_name, + ) + output = await docsearch.amax_marginal_relevance_search("foo", k=2, fetch_k=3) + assert output == [ + Document(page_content="foo", metadata={"page": 0}), + Document(page_content="baz", metadata={"page": 2}), + ] diff --git a/tests/integration_tests/vectorstores/qdrant/async/test_similarity_search.py b/tests/integration_tests/vectorstores/qdrant/async/test_similarity_search.py new file mode 100644 index 00000000000..7833d5ead9b --- /dev/null +++ b/tests/integration_tests/vectorstores/qdrant/async/test_similarity_search.py @@ -0,0 +1,286 @@ +from typing import Optional + +import numpy as np +import pytest +from qdrant_client.http import models as rest + +from langchain.schema import Document +from langchain.vectorstores import Qdrant +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, +) + +from .common import qdrant_is_not_running + +# Skipping all the tests in the module if Qdrant is not running on localhost.
+pytestmark = pytest.mark.skipif( + qdrant_is_not_running(), reason="Qdrant server is not running" +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_qdrant_similarity_search( + batch_size: int, + content_payload_key: str, + metadata_payload_key: str, + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + vector_name=vector_name, + ) + output = await docsearch.asimilarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_qdrant_similarity_search_by_vector( + batch_size: int, + content_payload_key: str, + metadata_payload_key: str, + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + vector_name=vector_name, + ) + embeddings = ConsistentFakeEmbeddings().embed_query("foo") + output = await docsearch.asimilarity_search_by_vector(embeddings, k=1) + assert output == [Document(page_content="foo")] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("batch_size", [1, 64]) 
+@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_qdrant_similarity_search_with_score_by_vector( + batch_size: int, + content_payload_key: str, + metadata_payload_key: str, + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + vector_name=vector_name, + ) + embeddings = ConsistentFakeEmbeddings().embed_query("foo") + output = await docsearch.asimilarity_search_with_score_by_vector(embeddings, k=1) + assert len(output) == 1 + document, score = output[0] + assert document == Document(page_content="foo") + assert score >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_qdrant_similarity_search_filters( + batch_size: int, vector_name: Optional[str] +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + batch_size=batch_size, + vector_name=vector_name, + ) + + output = await docsearch.asimilarity_search( + "foo", k=1, filter={"page": 1, "metadata": {"page": 2, "pages": [3]}} + ) + assert output == [ + Document( + page_content="bar", + metadata={"page": 1, "metadata": {"page": 2, "pages": [3, -1]}}, + ) + ] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_qdrant_similarity_search_with_relevance_score_no_threshold( + vector_name: 
Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + vector_name=vector_name, + ) + output = await docsearch.asimilarity_search_with_relevance_scores( + "foo", k=3, score_threshold=None + ) + assert len(output) == 3 + for i in range(len(output)): + assert round(output[i][1], 2) >= 0 + assert round(output[i][1], 2) <= 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_qdrant_similarity_search_with_relevance_score_with_threshold( + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + vector_name=vector_name, + ) + + score_threshold = 0.98 + kwargs = {"score_threshold": score_threshold} + output = await docsearch.asimilarity_search_with_relevance_scores( + "foo", k=3, **kwargs + ) + assert len(output) == 1 + assert all([score >= score_threshold for _, score in output]) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_similarity_search_with_relevance_score_with_threshold_and_filter( + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + vector_name=vector_name, + ) + score_threshold = 0.99 # for almost exact match + # test negative filter condition + 
negative_filter = {"page": 1, "metadata": {"page": 2, "pages": [3]}} + kwargs = {"filter": negative_filter, "score_threshold": score_threshold} + output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs) + assert len(output) == 0 + # test positive filter condition + positive_filter = {"page": 0, "metadata": {"page": 1, "pages": [2]}} + kwargs = {"filter": positive_filter, "score_threshold": score_threshold} + output = await docsearch.asimilarity_search_with_relevance_scores( + "foo", k=3, **kwargs + ) + assert len(output) == 1 + assert all([score >= score_threshold for _, score in output]) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def test_qdrant_similarity_search_filters_with_qdrant_filters( + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "details": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + vector_name=vector_name, + ) + + qdrant_filter = rest.Filter( + must=[ + rest.FieldCondition( + key="metadata.page", + match=rest.MatchValue(value=1), + ), + rest.FieldCondition( + key="metadata.details.page", + match=rest.MatchValue(value=2), + ), + rest.FieldCondition( + key="metadata.details.pages", + match=rest.MatchAny(any=[3]), + ), + ] + ) + output = await docsearch.asimilarity_search("foo", k=1, filter=qdrant_filter) + assert output == [ + Document( + page_content="bar", + metadata={"page": 1, "details": {"page": 2, "pages": [3, -1]}}, + ) + ] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +async def 
test_qdrant_similarity_search_with_relevance_scores( + batch_size: int, + content_payload_key: str, + metadata_payload_key: str, + vector_name: str, +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + vector_name=vector_name, + ) + output = await docsearch.asimilarity_search_with_relevance_scores("foo", k=3) + + assert all( + (1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output + ) diff --git a/tests/integration_tests/vectorstores/qdrant/fixtures.py b/tests/integration_tests/vectorstores/qdrant/fixtures.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration_tests/vectorstores/qdrant/test_add_texts.py b/tests/integration_tests/vectorstores/qdrant/test_add_texts.py new file mode 100644 index 00000000000..4afd823161f --- /dev/null +++ b/tests/integration_tests/vectorstores/qdrant/test_add_texts.py @@ -0,0 +1,132 @@ +from typing import Optional + +import pytest +from qdrant_client.http import models as rest + +from langchain.schema import Document +from langchain.vectorstores import Qdrant +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, +) + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_add_documents_extends_existing_collection( + batch_size: int, vector_name: Optional[str] +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch: Qdrant = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=":memory:", + batch_size=batch_size, + vector_name=vector_name, + ) + + new_texts = ["foobar", "foobaz"] + docsearch.add_documents( + [Document(page_content=content) for content in new_texts], 
batch_size=batch_size + ) + output = docsearch.similarity_search("foobar", k=1) + # ConsistentFakeEmbeddings return the same query embedding as the first document + # embedding computed in `embedding.embed_documents`. Thus, "foo" embedding is the + # same as "foobar" embedding + assert output == [Document(page_content="foobar")] + + +@pytest.mark.parametrize("batch_size", [1, 64]) +def test_qdrant_add_texts_returns_all_ids(batch_size: int) -> None: + """Test end to end Qdrant.add_texts returns unique ids.""" + docsearch: Qdrant = Qdrant.from_texts( + ["foobar"], + ConsistentFakeEmbeddings(), + location=":memory:", + batch_size=batch_size, + ) + + ids = docsearch.add_texts(["foo", "bar", "baz"]) + assert 3 == len(ids) + assert 3 == len(set(ids)) + + +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_add_texts_stores_duplicated_texts(vector_name: Optional[str]) -> None: + """Test end to end Qdrant.add_texts stores duplicated texts separately.""" + from qdrant_client import QdrantClient + from qdrant_client.http import models as rest + + client = QdrantClient(":memory:") + collection_name = "test" + vectors_config = rest.VectorParams(size=10, distance=rest.Distance.COSINE) + if vector_name is not None: + vectors_config = {vector_name: vectors_config} # type: ignore[assignment] + client.recreate_collection(collection_name, vectors_config=vectors_config) + + vec_store = Qdrant( + client, + collection_name, + embeddings=ConsistentFakeEmbeddings(), + vector_name=vector_name, + ) + ids = vec_store.add_texts(["abc", "abc"], [{"a": 1}, {"a": 2}]) + + assert 2 == len(set(ids)) + assert 2 == client.count(collection_name).count + + +@pytest.mark.parametrize("batch_size", [1, 64]) +def test_qdrant_add_texts_stores_ids(batch_size: int) -> None: + """Test end to end Qdrant.add_texts stores provided ids.""" + from qdrant_client import QdrantClient + + ids = [ + "fa38d572-4c31-4579-aedc-1960d79df6df", + "cdc1aa36-d6ab-4fb2-8a94-56674fd27484", + ] + + 
client = QdrantClient(":memory:") + collection_name = "test" + client.recreate_collection( + collection_name, + vectors_config=rest.VectorParams(size=10, distance=rest.Distance.COSINE), + ) + + vec_store = Qdrant(client, collection_name, ConsistentFakeEmbeddings()) + returned_ids = vec_store.add_texts(["abc", "def"], ids=ids, batch_size=batch_size) + + assert all(first == second for first, second in zip(ids, returned_ids)) + assert 2 == client.count(collection_name).count + stored_ids = [point.id for point in client.scroll(collection_name)[0]] + assert set(ids) == set(stored_ids) + + +@pytest.mark.parametrize("vector_name", ["custom-vector"]) +def test_qdrant_add_texts_stores_embeddings_as_named_vectors(vector_name: str) -> None: + """Test end to end Qdrant.add_texts stores named vectors if name is provided.""" + from qdrant_client import QdrantClient + + collection_name = "test" + + client = QdrantClient(":memory:") + client.recreate_collection( + collection_name, + vectors_config={ + vector_name: rest.VectorParams(size=10, distance=rest.Distance.COSINE) + }, + ) + + vec_store = Qdrant( + client, + collection_name, + ConsistentFakeEmbeddings(), + vector_name=vector_name, + ) + vec_store.add_texts(["lorem", "ipsum", "dolor", "sit", "amet"]) + + assert 5 == client.count(collection_name).count + assert all( + vector_name in point.vector # type: ignore[operator] + for point in client.scroll(collection_name, with_vectors=True)[0] + ) diff --git a/tests/integration_tests/vectorstores/qdrant/test_delete.py b/tests/integration_tests/vectorstores/qdrant/test_delete.py new file mode 100644 index 00000000000..6804fcf8866 --- /dev/null +++ b/tests/integration_tests/vectorstores/qdrant/test_delete.py @@ -0,0 +1 @@ +# TODO: implement tests for delete diff --git a/tests/integration_tests/vectorstores/qdrant/test_embedding_interface.py b/tests/integration_tests/vectorstores/qdrant/test_embedding_interface.py new file mode 100644 index 00000000000..5b3d64bae88 --- /dev/null +++ 
b/tests/integration_tests/vectorstores/qdrant/test_embedding_interface.py @@ -0,0 +1,59 @@ +from typing import Callable, Optional + +import pytest + +from langchain.embeddings.base import Embeddings +from langchain.vectorstores import Qdrant +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, +) + + +@pytest.mark.parametrize( + ["embeddings", "embedding_function"], + [ + (ConsistentFakeEmbeddings(), None), + (ConsistentFakeEmbeddings().embed_query, None), + (None, ConsistentFakeEmbeddings().embed_query), + ], +) +def test_qdrant_embedding_interface( + embeddings: Optional[Embeddings], embedding_function: Optional[Callable] +) -> None: + """Test Qdrant may accept different types for embeddings.""" + from qdrant_client import QdrantClient + + client = QdrantClient(":memory:") + collection_name = "test" + + Qdrant( + client, + collection_name, + embeddings=embeddings, + embedding_function=embedding_function, + ) + + +@pytest.mark.parametrize( + ["embeddings", "embedding_function"], + [ + (ConsistentFakeEmbeddings(), ConsistentFakeEmbeddings().embed_query), + (None, None), + ], +) +def test_qdrant_embedding_interface_raises_value_error( + embeddings: Optional[Embeddings], embedding_function: Optional[Callable] +) -> None: + """Test Qdrant requires only one method for embeddings.""" + from qdrant_client import QdrantClient + + client = QdrantClient(":memory:") + collection_name = "test" + + with pytest.raises(ValueError): + Qdrant( + client, + collection_name, + embeddings=embeddings, + embedding_function=embedding_function, + ) diff --git a/tests/integration_tests/vectorstores/qdrant/test_from_texts.py b/tests/integration_tests/vectorstores/qdrant/test_from_texts.py new file mode 100644 index 00000000000..03aeed59e39 --- /dev/null +++ b/tests/integration_tests/vectorstores/qdrant/test_from_texts.py @@ -0,0 +1,252 @@ +import tempfile +from typing import Optional + +import pytest + +from langchain.schema import Document +from 
langchain.vectorstores import Qdrant +from langchain.vectorstores.qdrant import QdrantException +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, +) + + +def test_qdrant_from_texts_stores_duplicated_texts() -> None: + """Test end to end Qdrant.from_texts stores duplicated texts separately.""" + from qdrant_client import QdrantClient + + collection_name = "test" + + with tempfile.TemporaryDirectory() as tmpdir: + vec_store = Qdrant.from_texts( + ["abc", "abc"], + ConsistentFakeEmbeddings(), + collection_name=collection_name, + path=str(tmpdir), + ) + del vec_store + + client = QdrantClient(path=str(tmpdir)) + assert 2 == client.count(collection_name).count + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_from_texts_stores_ids( + batch_size: int, vector_name: Optional[str] +) -> None: + """Test end to end Qdrant.from_texts stores provided ids.""" + from qdrant_client import QdrantClient + + collection_name = "test" + with tempfile.TemporaryDirectory() as tmpdir: + ids = [ + "fa38d572-4c31-4579-aedc-1960d79df6df", + "cdc1aa36-d6ab-4fb2-8a94-56674fd27484", + ] + vec_store = Qdrant.from_texts( + ["abc", "def"], + ConsistentFakeEmbeddings(), + ids=ids, + collection_name=collection_name, + path=str(tmpdir), + batch_size=batch_size, + vector_name=vector_name, + ) + del vec_store + + client = QdrantClient(path=str(tmpdir)) + assert 2 == client.count(collection_name).count + stored_ids = [point.id for point in client.scroll(collection_name)[0]] + assert set(ids) == set(stored_ids) + + +@pytest.mark.parametrize("vector_name", ["custom-vector"]) +def test_qdrant_from_texts_stores_embeddings_as_named_vectors(vector_name: str) -> None: + """Test end to end Qdrant.from_texts stores named vectors if name is provided.""" + from qdrant_client import QdrantClient + + collection_name = "test" + with tempfile.TemporaryDirectory() as tmpdir: + vec_store = 
Qdrant.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(), + collection_name=collection_name, + path=str(tmpdir), + vector_name=vector_name, + ) + del vec_store + + client = QdrantClient(path=str(tmpdir)) + assert 5 == client.count(collection_name).count + assert all( + vector_name in point.vector # type: ignore[operator] + for point in client.scroll(collection_name, with_vectors=True)[0] + ) + + +@pytest.mark.parametrize("vector_name", [None, "custom-vector"]) +def test_qdrant_from_texts_reuses_same_collection(vector_name: Optional[str]) -> None: + """Test if Qdrant.from_texts reuses the same collection""" + from qdrant_client import QdrantClient + + collection_name = "test" + embeddings = ConsistentFakeEmbeddings() + with tempfile.TemporaryDirectory() as tmpdir: + vec_store = Qdrant.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + embeddings, + collection_name=collection_name, + path=str(tmpdir), + vector_name=vector_name, + ) + del vec_store + + vec_store = Qdrant.from_texts( + ["foo", "bar"], + embeddings, + collection_name=collection_name, + path=str(tmpdir), + vector_name=vector_name, + ) + del vec_store + + client = QdrantClient(path=str(tmpdir)) + assert 7 == client.count(collection_name).count + + +@pytest.mark.parametrize("vector_name", [None, "custom-vector"]) +def test_qdrant_from_texts_raises_error_on_different_dimensionality( + vector_name: Optional[str], +) -> None: + """Test if Qdrant.from_texts raises an exception if dimensionality does not match""" + collection_name = "test" + with tempfile.TemporaryDirectory() as tmpdir: + vec_store = Qdrant.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + path=str(tmpdir), + vector_name=vector_name, + ) + del vec_store + + with pytest.raises(QdrantException): + Qdrant.from_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=5), + 
collection_name=collection_name, + path=str(tmpdir), + vector_name=vector_name, + ) + + +@pytest.mark.parametrize( + ["first_vector_name", "second_vector_name"], + [ + (None, "custom-vector"), + ("custom-vector", None), + ("my-first-vector", "my-second_vector"), + ], +) +def test_qdrant_from_texts_raises_error_on_different_vector_name( + first_vector_name: Optional[str], + second_vector_name: Optional[str], +) -> None: + """Test if Qdrant.from_texts raises an exception if vector name does not match""" + collection_name = "test" + with tempfile.TemporaryDirectory() as tmpdir: + vec_store = Qdrant.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + path=str(tmpdir), + vector_name=first_vector_name, + ) + del vec_store + + with pytest.raises(QdrantException): + Qdrant.from_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=5), + collection_name=collection_name, + path=str(tmpdir), + vector_name=second_vector_name, + ) + + +def test_qdrant_from_texts_raises_error_on_different_distance() -> None: + """Test if Qdrant.from_texts raises an exception if distance does not match""" + collection_name = "test" + with tempfile.TemporaryDirectory() as tmpdir: + vec_store = Qdrant.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + path=str(tmpdir), + distance_func="Cosine", + ) + del vec_store + + with pytest.raises(QdrantException): + Qdrant.from_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=5), + collection_name=collection_name, + path=str(tmpdir), + distance_func="Euclid", + ) + + +@pytest.mark.parametrize("vector_name", [None, "custom-vector"]) +def test_qdrant_from_texts_recreates_collection_on_force_recreate( + vector_name: Optional[str], +) -> None: + """Test if Qdrant.from_texts recreates the collection even if config mismatches""" + from qdrant_client import 
QdrantClient + + collection_name = "test" + with tempfile.TemporaryDirectory() as tmpdir: + vec_store = Qdrant.from_texts( + ["lorem", "ipsum", "dolor", "sit", "amet"], + ConsistentFakeEmbeddings(dimensionality=10), + collection_name=collection_name, + path=str(tmpdir), + vector_name=vector_name, + ) + del vec_store + + vec_store = Qdrant.from_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(dimensionality=5), + collection_name=collection_name, + path=str(tmpdir), + vector_name=vector_name, + force_recreate=True, + ) + del vec_store + + client = QdrantClient(path=str(tmpdir)) + assert 2 == client.count(collection_name).count + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) +def test_qdrant_from_texts_stores_metadatas( + batch_size: int, content_payload_key: str, metadata_payload_key: str +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=":memory:", + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": 0})] diff --git a/tests/integration_tests/vectorstores/qdrant/test_max_marginal_relevance.py b/tests/integration_tests/vectorstores/qdrant/test_max_marginal_relevance.py new file mode 100644 index 00000000000..5008dc1c7fc --- /dev/null +++ b/tests/integration_tests/vectorstores/qdrant/test_max_marginal_relevance.py @@ -0,0 +1,40 @@ +from typing import Optional + +import pytest + +from langchain.schema import Document +from langchain.vectorstores import Qdrant +from tests.integration_tests.vectorstores.fake_embeddings import ( + 
ConsistentFakeEmbeddings, +) + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "test_content"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "test_metadata"]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +@pytest.mark.skip(reason="Qdrant local behaves differently from Qdrant server") +def test_qdrant_max_marginal_relevance_search( + batch_size: int, + content_payload_key: str, + metadata_payload_key: str, + vector_name: Optional[str], +) -> None: + """Test end to end construction and MRR search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=":memory:", + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + vector_name=vector_name, + ) + output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3) + assert output == [ + Document(page_content="foo", metadata={"page": 0}), + Document(page_content="baz", metadata={"page": 2}), + ] diff --git a/tests/integration_tests/vectorstores/qdrant/test_similarity_search.py b/tests/integration_tests/vectorstores/qdrant/test_similarity_search.py new file mode 100644 index 00000000000..cfd413fbef2 --- /dev/null +++ b/tests/integration_tests/vectorstores/qdrant/test_similarity_search.py @@ -0,0 +1,275 @@ +from typing import Optional + +import numpy as np +import pytest +from qdrant_client.http import models as rest + +from langchain.schema import Document +from langchain.vectorstores import Qdrant +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, +) + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) 
+@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_similarity_search( + batch_size: int, + content_payload_key: str, + metadata_payload_key: str, + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=":memory:", + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + vector_name=vector_name, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_similarity_search_by_vector( + batch_size: int, + content_payload_key: str, + metadata_payload_key: str, + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=":memory:", + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + vector_name=vector_name, + ) + embeddings = ConsistentFakeEmbeddings().embed_query("foo") + output = docsearch.similarity_search_by_vector(embeddings, k=1) + assert output == [Document(page_content="foo")] + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_similarity_search_with_score_by_vector( + batch_size: int, + content_payload_key: str, + metadata_payload_key: str, + vector_name: 
Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=":memory:", + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + vector_name=vector_name, + ) + embeddings = ConsistentFakeEmbeddings().embed_query("foo") + output = docsearch.similarity_search_with_score_by_vector(embeddings, k=1) + assert len(output) == 1 + document, score = output[0] + assert document == Document(page_content="foo") + assert score >= 0 + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_similarity_search_filters( + batch_size: int, vector_name: Optional[str] +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=":memory:", + batch_size=batch_size, + vector_name=vector_name, + ) + + output = docsearch.similarity_search( + "foo", k=1, filter={"page": 1, "metadata": {"page": 2, "pages": [3]}} + ) + assert output == [ + Document( + page_content="bar", + metadata={"page": 1, "metadata": {"page": 2, "pages": [3, -1]}}, + ) + ] + + +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_similarity_search_with_relevance_score_no_threshold( + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=":memory:", + vector_name=vector_name, + ) + output = 
docsearch.similarity_search_with_relevance_scores( + "foo", k=3, score_threshold=None + ) + assert len(output) == 3 + for i in range(len(output)): + assert round(output[i][1], 2) >= 0 + assert round(output[i][1], 2) <= 1 + + +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_similarity_search_with_relevance_score_with_threshold( + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=":memory:", + vector_name=vector_name, + ) + + score_threshold = 0.98 + kwargs = {"score_threshold": score_threshold} + output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs) + assert len(output) == 1 + assert all([score >= score_threshold for _, score in output]) + + +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_similarity_search_with_relevance_score_with_threshold_and_filter( + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=":memory:", + vector_name=vector_name, + ) + score_threshold = 0.99 # for almost exact match + # test negative filter condition + negative_filter = {"page": 1, "metadata": {"page": 2, "pages": [3]}} + kwargs = {"filter": negative_filter, "score_threshold": score_threshold} + output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs) + assert len(output) == 0 + # test positive filter condition + positive_filter = {"page": 0, "metadata": {"page": 1, "pages": [2]}} + kwargs = {"filter": 
positive_filter, "score_threshold": score_threshold} + output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs) + assert len(output) == 1 + assert all([score >= score_threshold for _, score in output]) + + +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_similarity_search_filters_with_qdrant_filters( + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [ + {"page": i, "details": {"page": i + 1, "pages": [i + 2, -1]}} + for i in range(len(texts)) + ] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + location=":memory:", + vector_name=vector_name, + ) + + qdrant_filter = rest.Filter( + must=[ + rest.FieldCondition( + key="metadata.page", + match=rest.MatchValue(value=1), + ), + rest.FieldCondition( + key="metadata.details.page", + match=rest.MatchValue(value=2), + ), + rest.FieldCondition( + key="metadata.details.pages", + match=rest.MatchAny(any=[3]), + ), + ] + ) + output = docsearch.similarity_search("foo", k=1, filter=qdrant_filter) + assert output == [ + Document( + page_content="bar", + metadata={"page": 1, "details": {"page": 2, "pages": [3, -1]}}, + ) + ] + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) +@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) +@pytest.mark.parametrize("vector_name", [None, "my-vector"]) +def test_qdrant_similarity_search_with_relevance_scores( + batch_size: int, + content_payload_key: str, + metadata_payload_key: str, + vector_name: Optional[str], +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=":memory:", + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + 
batch_size=batch_size, + vector_name=vector_name, + ) + output = docsearch.similarity_search_with_relevance_scores("foo", k=3) + + assert all( + (1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output + ) diff --git a/tests/integration_tests/vectorstores/test_qdrant.py b/tests/integration_tests/vectorstores/test_qdrant.py deleted file mode 100644 index 5a9686ab631..00000000000 --- a/tests/integration_tests/vectorstores/test_qdrant.py +++ /dev/null @@ -1,685 +0,0 @@ -"""Test Qdrant functionality.""" -import tempfile -from typing import Callable, Optional - -import numpy as np -import pytest -from qdrant_client.http import models as rest - -from langchain.docstore.document import Document -from langchain.embeddings.base import Embeddings -from langchain.vectorstores import Qdrant -from langchain.vectorstores.qdrant import QdrantException -from tests.integration_tests.vectorstores.fake_embeddings import ( - ConsistentFakeEmbeddings, -) - - -@pytest.mark.parametrize("batch_size", [1, 64]) -@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) -@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) -def test_qdrant_similarity_search( - batch_size: int, content_payload_key: str, metadata_payload_key: str -) -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - docsearch = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - location=":memory:", - content_payload_key=content_payload_key, - metadata_payload_key=metadata_payload_key, - batch_size=batch_size, - ) - output = docsearch.similarity_search("foo", k=1) - assert output == [Document(page_content="foo")] - - -@pytest.mark.parametrize("batch_size", [1, 64]) -@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) -@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) -def test_qdrant_similarity_search_by_vector( - batch_size: int, content_payload_key: str, 
metadata_payload_key: str -) -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - docsearch = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - location=":memory:", - content_payload_key=content_payload_key, - metadata_payload_key=metadata_payload_key, - batch_size=batch_size, - ) - embeddings = ConsistentFakeEmbeddings().embed_query("foo") - output = docsearch.similarity_search_by_vector(embeddings, k=1) - assert output == [Document(page_content="foo")] - - -@pytest.mark.parametrize("batch_size", [1, 64]) -@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) -@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) -def test_qdrant_similarity_search_with_score_by_vector( - batch_size: int, content_payload_key: str, metadata_payload_key: str -) -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - docsearch = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - location=":memory:", - content_payload_key=content_payload_key, - metadata_payload_key=metadata_payload_key, - batch_size=batch_size, - ) - embeddings = ConsistentFakeEmbeddings().embed_query("foo") - output = docsearch.similarity_search_with_score_by_vector(embeddings, k=1) - assert len(output) == 1 - document, score = output[0] - assert document == Document(page_content="foo") - assert score >= 0 - - -@pytest.mark.parametrize("batch_size", [1, 64]) -@pytest.mark.parametrize("vector_name", [None, "my-vector"]) -def test_qdrant_add_documents(batch_size: int, vector_name: Optional[str]) -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - docsearch: Qdrant = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - location=":memory:", - batch_size=batch_size, - vector_name=vector_name, - ) - - new_texts = ["foobar", "foobaz"] - docsearch.add_documents( - [Document(page_content=content) for content in new_texts], 
batch_size=batch_size - ) - output = docsearch.similarity_search("foobar", k=1) - # StatefulFakeEmbeddings return the same query embedding as the first document - # embedding computed in `embedding.embed_documents`. Thus, "foo" embedding is the - # same as "foobar" embedding - assert output == [Document(page_content="foobar")] or output == [ - Document(page_content="foo") - ] - - -@pytest.mark.parametrize("batch_size", [1, 64]) -def test_qdrant_add_texts_returns_all_ids(batch_size: int) -> None: - """Test end to end Qdrant.add_texts returns unique ids.""" - docsearch: Qdrant = Qdrant.from_texts( - ["foobar"], - ConsistentFakeEmbeddings(), - location=":memory:", - batch_size=batch_size, - ) - - ids = docsearch.add_texts(["foo", "bar", "baz"]) - assert 3 == len(ids) - assert 3 == len(set(ids)) - - -@pytest.mark.parametrize("batch_size", [1, 64]) -@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) -@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) -def test_qdrant_with_metadatas( - batch_size: int, content_payload_key: str, metadata_payload_key: str -) -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [{"page": i} for i in range(len(texts))] - docsearch = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - metadatas=metadatas, - location=":memory:", - content_payload_key=content_payload_key, - metadata_payload_key=metadata_payload_key, - batch_size=batch_size, - ) - output = docsearch.similarity_search("foo", k=1) - assert output == [Document(page_content="foo", metadata={"page": 0})] - - -@pytest.mark.parametrize("batch_size", [1, 64]) -def test_qdrant_similarity_search_filters(batch_size: int) -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [ - {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} - for i in range(len(texts)) - ] - docsearch = Qdrant.from_texts( - texts, - 
ConsistentFakeEmbeddings(), - metadatas=metadatas, - location=":memory:", - batch_size=batch_size, - ) - - output = docsearch.similarity_search( - "foo", k=1, filter={"page": 1, "metadata": {"page": 2, "pages": [3]}} - ) - assert output == [ - Document( - page_content="bar", - metadata={"page": 1, "metadata": {"page": 2, "pages": [3, -1]}}, - ) - ] - - -@pytest.mark.parametrize("vector_name", [None, "my-vector"]) -def test_qdrant_similarity_search_with_relevance_score_no_threshold( - vector_name: Optional[str], -) -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [ - {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} - for i in range(len(texts)) - ] - docsearch = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - metadatas=metadatas, - location=":memory:", - vector_name=vector_name, - ) - output = docsearch.similarity_search_with_relevance_scores( - "foo", k=3, score_threshold=None - ) - assert len(output) == 3 - for i in range(len(output)): - assert round(output[i][1], 2) >= 0 - assert round(output[i][1], 2) <= 1 - - -def test_qdrant_similarity_search_with_relevance_score_with_threshold() -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [ - {"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}} - for i in range(len(texts)) - ] - docsearch = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - metadatas=metadatas, - location=":memory:", - ) - - score_threshold = 0.98 - kwargs = {"score_threshold": score_threshold} - output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs) - assert len(output) == 1 - assert all([score >= score_threshold for _, score in output]) - - -def test_qdrant_similarity_search_with_relevance_score_with_threshold_and_filter() -> ( - None -): - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [ - {"page": i, "metadata": {"page": i + 1, 
"pages": [i + 2, -1]}} - for i in range(len(texts)) - ] - docsearch = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - metadatas=metadatas, - location=":memory:", - ) - score_threshold = 0.99 # for almost exact match - # test negative filter condition - negative_filter = {"page": 1, "metadata": {"page": 2, "pages": [3]}} - kwargs = {"filter": negative_filter, "score_threshold": score_threshold} - output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs) - assert len(output) == 0 - # test positive filter condition - positive_filter = {"page": 0, "metadata": {"page": 1, "pages": [2]}} - kwargs = {"filter": positive_filter, "score_threshold": score_threshold} - output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs) - assert len(output) == 1 - assert all([score >= score_threshold for _, score in output]) - - -def test_qdrant_similarity_search_filters_with_qdrant_filters() -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [ - {"page": i, "details": {"page": i + 1, "pages": [i + 2, -1]}} - for i in range(len(texts)) - ] - docsearch = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - metadatas=metadatas, - location=":memory:", - ) - - qdrant_filter = rest.Filter( - must=[ - rest.FieldCondition( - key="metadata.page", - match=rest.MatchValue(value=1), - ), - rest.FieldCondition( - key="metadata.details.page", - match=rest.MatchValue(value=2), - ), - rest.FieldCondition( - key="metadata.details.pages", - match=rest.MatchAny(any=[3]), - ), - ] - ) - output = docsearch.similarity_search("foo", k=1, filter=qdrant_filter) - assert output == [ - Document( - page_content="bar", - metadata={"page": 1, "details": {"page": 2, "pages": [3, -1]}}, - ) - ] - - -@pytest.mark.parametrize("batch_size", [1, 64]) -@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "test_content"]) -@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, 
"test_metadata"]) -@pytest.mark.parametrize("vector_name", [None, "my-vector"]) -def test_qdrant_max_marginal_relevance_search( - batch_size: int, - content_payload_key: str, - metadata_payload_key: str, - vector_name: Optional[str], -) -> None: - """Test end to end construction and MRR search.""" - texts = ["foo", "bar", "baz"] - metadatas = [{"page": i} for i in range(len(texts))] - docsearch = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - metadatas=metadatas, - location=":memory:", - content_payload_key=content_payload_key, - metadata_payload_key=metadata_payload_key, - batch_size=batch_size, - vector_name=vector_name, - ) - output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3) - assert output == [ - Document(page_content="foo", metadata={"page": 0}), - Document(page_content="bar", metadata={"page": 1}), - ] - - -@pytest.mark.parametrize( - ["embeddings", "embedding_function"], - [ - (ConsistentFakeEmbeddings(), None), - (ConsistentFakeEmbeddings().embed_query, None), - (None, ConsistentFakeEmbeddings().embed_query), - ], -) -def test_qdrant_embedding_interface( - embeddings: Optional[Embeddings], embedding_function: Optional[Callable] -) -> None: - """Test Qdrant may accept different types for embeddings.""" - from qdrant_client import QdrantClient - - client = QdrantClient(":memory:") - collection_name = "test" - - Qdrant( - client, - collection_name, - embeddings=embeddings, - embedding_function=embedding_function, - ) - - -@pytest.mark.parametrize( - ["embeddings", "embedding_function"], - [ - (ConsistentFakeEmbeddings(), ConsistentFakeEmbeddings().embed_query), - (None, None), - ], -) -def test_qdrant_embedding_interface_raises_value_error( - embeddings: Optional[Embeddings], embedding_function: Optional[Callable] -) -> None: - """Test Qdrant requires only one method for embeddings.""" - from qdrant_client import QdrantClient - - client = QdrantClient(":memory:") - collection_name = "test" - - with pytest.raises(ValueError): 
- Qdrant( - client, - collection_name, - embeddings=embeddings, - embedding_function=embedding_function, - ) - - -@pytest.mark.parametrize("vector_name", [None, "my-vector"]) -def test_qdrant_add_texts_stores_duplicated_texts(vector_name: Optional[str]) -> None: - """Test end to end Qdrant.add_texts stores duplicated texts separately.""" - from qdrant_client import QdrantClient - from qdrant_client.http import models as rest - - client = QdrantClient(":memory:") - collection_name = "test" - vectors_config = rest.VectorParams(size=10, distance=rest.Distance.COSINE) - if vector_name is not None: - vectors_config = {vector_name: vectors_config} # type: ignore[assignment] - client.recreate_collection(collection_name, vectors_config=vectors_config) - - vec_store = Qdrant( - client, - collection_name, - embeddings=ConsistentFakeEmbeddings(), - vector_name=vector_name, - ) - ids = vec_store.add_texts(["abc", "abc"], [{"a": 1}, {"a": 2}]) - - assert 2 == len(set(ids)) - assert 2 == client.count(collection_name).count - - -def test_qdrant_from_texts_stores_duplicated_texts() -> None: - """Test end to end Qdrant.from_texts stores duplicated texts separately.""" - from qdrant_client import QdrantClient - - with tempfile.TemporaryDirectory() as tmpdir: - vec_store = Qdrant.from_texts( - ["abc", "abc"], - ConsistentFakeEmbeddings(), - collection_name="test", - path=str(tmpdir), - ) - del vec_store - - client = QdrantClient(path=str(tmpdir)) - assert 2 == client.count("test").count - - -@pytest.mark.parametrize("batch_size", [1, 64]) -@pytest.mark.parametrize("vector_name", [None, "my-vector"]) -def test_qdrant_from_texts_stores_ids( - batch_size: int, vector_name: Optional[str] -) -> None: - """Test end to end Qdrant.from_texts stores provided ids.""" - from qdrant_client import QdrantClient - - with tempfile.TemporaryDirectory() as tmpdir: - ids = [ - "fa38d572-4c31-4579-aedc-1960d79df6df", - "cdc1aa36-d6ab-4fb2-8a94-56674fd27484", - ] - vec_store = Qdrant.from_texts( - 
["abc", "def"], - ConsistentFakeEmbeddings(), - ids=ids, - collection_name="test", - path=str(tmpdir), - batch_size=batch_size, - vector_name=vector_name, - ) - del vec_store - - client = QdrantClient(path=str(tmpdir)) - assert 2 == client.count("test").count - stored_ids = [point.id for point in client.scroll("test")[0]] - assert set(ids) == set(stored_ids) - - -@pytest.mark.parametrize("batch_size", [1, 64]) -def test_qdrant_add_texts_stores_ids(batch_size: int) -> None: - """Test end to end Qdrant.add_texts stores provided ids.""" - from qdrant_client import QdrantClient - - ids = [ - "fa38d572-4c31-4579-aedc-1960d79df6df", - "cdc1aa36-d6ab-4fb2-8a94-56674fd27484", - ] - - client = QdrantClient(":memory:") - collection_name = "test" - client.recreate_collection( - collection_name, - vectors_config=rest.VectorParams(size=10, distance=rest.Distance.COSINE), - ) - - vec_store = Qdrant(client, "test", ConsistentFakeEmbeddings()) - returned_ids = vec_store.add_texts(["abc", "def"], ids=ids, batch_size=batch_size) - - assert all(first == second for first, second in zip(ids, returned_ids)) - assert 2 == client.count("test").count - stored_ids = [point.id for point in client.scroll("test")[0]] - assert set(ids) == set(stored_ids) - - -@pytest.mark.parametrize("vector_name", ["custom-vector"]) -def test_qdrant_from_texts_stores_embeddings_as_named_vectors(vector_name: str) -> None: - """Test end to end Qdrant.from_texts stores named vectors if name is provided.""" - from qdrant_client import QdrantClient - - collection_name = "test" - with tempfile.TemporaryDirectory() as tmpdir: - vec_store = Qdrant.from_texts( - ["lorem", "ipsum", "dolor", "sit", "amet"], - ConsistentFakeEmbeddings(), - collection_name=collection_name, - path=str(tmpdir), - vector_name=vector_name, - ) - del vec_store - - client = QdrantClient(path=str(tmpdir)) - assert 5 == client.count("test").count - assert all( - vector_name in point.vector # type: ignore[operator] - for point in 
client.scroll(collection_name, with_vectors=True)[0] - ) - - -@pytest.mark.parametrize("vector_name", ["custom-vector"]) -def test_qdrant_add_texts_stores_embeddings_as_named_vectors(vector_name: str) -> None: - """Test end to end Qdrant.add_texts stores named vectors if name is provided.""" - from qdrant_client import QdrantClient - - collection_name = "test" - - client = QdrantClient(":memory:") - client.recreate_collection( - collection_name, - vectors_config={ - vector_name: rest.VectorParams(size=10, distance=rest.Distance.COSINE) - }, - ) - - vec_store = Qdrant( - client, - collection_name, - ConsistentFakeEmbeddings(), - vector_name=vector_name, - ) - vec_store.add_texts(["lorem", "ipsum", "dolor", "sit", "amet"]) - - assert 5 == client.count("test").count - assert all( - vector_name in point.vector # type: ignore[operator] - for point in client.scroll(collection_name, with_vectors=True)[0] - ) - - -@pytest.mark.parametrize("batch_size", [1, 64]) -@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"]) -@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"]) -def test_qdrant_similarity_search_with_relevance_scores( - batch_size: int, content_payload_key: str, metadata_payload_key: str -) -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - docsearch = Qdrant.from_texts( - texts, - ConsistentFakeEmbeddings(), - location=":memory:", - content_payload_key=content_payload_key, - metadata_payload_key=metadata_payload_key, - batch_size=batch_size, - ) - output = docsearch.similarity_search_with_relevance_scores("foo", k=3) - - assert all( - (1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output - ) - - -@pytest.mark.parametrize("vector_name", [None, "custom-vector"]) -def test_qdrant_from_texts_reuses_same_collection(vector_name: Optional[str]) -> None: - """Test if Qdrant.from_texts reuses the same collection""" - from qdrant_client import QdrantClient - - 
collection_name = "test" - embeddings = ConsistentFakeEmbeddings() - with tempfile.TemporaryDirectory() as tmpdir: - vec_store = Qdrant.from_texts( - ["lorem", "ipsum", "dolor", "sit", "amet"], - embeddings, - collection_name=collection_name, - path=str(tmpdir), - vector_name=vector_name, - ) - del vec_store - - vec_store = Qdrant.from_texts( - ["foo", "bar"], - embeddings, - collection_name=collection_name, - path=str(tmpdir), - vector_name=vector_name, - ) - del vec_store - - client = QdrantClient(path=str(tmpdir)) - assert 7 == client.count(collection_name).count - - -@pytest.mark.parametrize("vector_name", [None, "custom-vector"]) -def test_qdrant_from_texts_raises_error_on_different_dimensionality( - vector_name: Optional[str], -) -> None: - """Test if Qdrant.from_texts raises an exception if dimensionality does not match""" - collection_name = "test" - with tempfile.TemporaryDirectory() as tmpdir: - vec_store = Qdrant.from_texts( - ["lorem", "ipsum", "dolor", "sit", "amet"], - ConsistentFakeEmbeddings(dimensionality=10), - collection_name=collection_name, - path=str(tmpdir), - vector_name=vector_name, - ) - del vec_store - - with pytest.raises(QdrantException): - Qdrant.from_texts( - ["foo", "bar"], - ConsistentFakeEmbeddings(dimensionality=5), - collection_name=collection_name, - path=str(tmpdir), - vector_name=vector_name, - ) - - -@pytest.mark.parametrize( - ["first_vector_name", "second_vector_name"], - [ - (None, "custom-vector"), - ("custom-vector", None), - ("my-first-vector", "my-second_vector"), - ], -) -def test_qdrant_from_texts_raises_error_on_different_vector_name( - first_vector_name: Optional[str], - second_vector_name: Optional[str], -) -> None: - """Test if Qdrant.from_texts raises an exception if vector name does not match""" - collection_name = "test" - with tempfile.TemporaryDirectory() as tmpdir: - vec_store = Qdrant.from_texts( - ["lorem", "ipsum", "dolor", "sit", "amet"], - ConsistentFakeEmbeddings(dimensionality=10), - 
collection_name=collection_name, - path=str(tmpdir), - vector_name=first_vector_name, - ) - del vec_store - - with pytest.raises(QdrantException): - Qdrant.from_texts( - ["foo", "bar"], - ConsistentFakeEmbeddings(dimensionality=5), - collection_name=collection_name, - path=str(tmpdir), - vector_name=second_vector_name, - ) - - -def test_qdrant_from_texts_raises_error_on_different_distance() -> None: - """Test if Qdrant.from_texts raises an exception if distance does not match""" - collection_name = "test" - with tempfile.TemporaryDirectory() as tmpdir: - vec_store = Qdrant.from_texts( - ["lorem", "ipsum", "dolor", "sit", "amet"], - ConsistentFakeEmbeddings(dimensionality=10), - collection_name=collection_name, - path=str(tmpdir), - distance_func="Cosine", - ) - del vec_store - - with pytest.raises(QdrantException): - Qdrant.from_texts( - ["foo", "bar"], - ConsistentFakeEmbeddings(dimensionality=5), - collection_name=collection_name, - path=str(tmpdir), - distance_func="Euclid", - ) - - -@pytest.mark.parametrize("vector_name", [None, "custom-vector"]) -def test_qdrant_from_texts_recreates_collection_on_force_recreate( - vector_name: Optional[str], -) -> None: - """Test if Qdrant.from_texts recreates the collection even if config mismatches""" - from qdrant_client import QdrantClient - - collection_name = "test" - with tempfile.TemporaryDirectory() as tmpdir: - vec_store = Qdrant.from_texts( - ["lorem", "ipsum", "dolor", "sit", "amet"], - ConsistentFakeEmbeddings(dimensionality=10), - collection_name=collection_name, - path=str(tmpdir), - vector_name=vector_name, - ) - del vec_store - - vec_store = Qdrant.from_texts( - ["foo", "bar"], - ConsistentFakeEmbeddings(dimensionality=5), - collection_name=collection_name, - path=str(tmpdir), - vector_name=vector_name, - force_recreate=True, - ) - del vec_store - - client = QdrantClient(path=str(tmpdir)) - assert 2 == client.count(collection_name).count