From 0f45ac4088126746575d40959dabaadd787faeb6 Mon Sep 17 00:00:00 2001 From: Aayush Kataria Date: Tue, 23 Jul 2024 16:59:23 -0700 Subject: [PATCH] LangChain Community: VectorStores: Azure Cosmos DB Filtered Vector Search (#24087) Thank you for contributing to LangChain! - This PR adds vector search filtering for Azure Cosmos DB Mongo vCore and NoSQL. - [ ] **PR message**: ***Delete this entire checklist*** and replace with - **Description:** a description of the change - **Issue:** the issue # it fixes, if applicable - **Dependencies:** any dependencies required for this change - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out! - [ ] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. 
--- .../vectorstores/azure_cosmos_db.py | 101 ++++++++++++----- .../vectorstores/azure_cosmos_db_no_sql.py | 102 +++++++++++++++--- .../vectorstores/test_azure_cosmos_db.py | 2 +- .../test_azure_cosmos_db_no_sql.py | 59 ++++++++++ 4 files changed, 222 insertions(+), 42 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db.py index 559296c71a6..92a450bd86f 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db.py @@ -306,6 +306,27 @@ class AzureCosmosDBVectorSearch(VectorStore): } return command + def create_filter_index( + self, + property_to_filter: str, + index_name: str, + ) -> dict[str, Any]: + command = { + "createIndexes": self._collection.name, + "indexes": [ + { + "key": {property_to_filter: 1}, + "name": index_name, + } + ], + } + # retrieve the database object + current_database = self._collection.database + + # invoke the command from the database object + create_index_responses: dict[str, Any] = current_database.command(command) + return create_index_responses + def add_texts( self, texts: Iterable[str], @@ -345,7 +366,7 @@ class AzureCosmosDBVectorSearch(VectorStore): # Embed and create the documents embeddings = self._embedding.embed_documents(texts) to_insert = [ - {self._text_key: t, self._embedding_key: embedding, **m} + {self._text_key: t, self._embedding_key: embedding, "metadata": m} for t, m, embedding in zip(texts, metadatas, embeddings) ] # insert the documents in Cosmos DB @@ -397,8 +418,10 @@ class AzureCosmosDBVectorSearch(VectorStore): embeddings: List[float], k: int = 4, kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF, + pre_filter: Optional[Dict] = None, ef_search: int = 40, score_threshold: float = 0.0, + with_embedding: bool = False, ) -> List[Tuple[Document, float]]: """Returns a list of documents with their scores 
@@ -422,9 +445,11 @@ class AzureCosmosDBVectorSearch(VectorStore): """ pipeline: List[dict[str, Any]] = [] if kind == CosmosDBVectorSearchType.VECTOR_IVF: - pipeline = self._get_pipeline_vector_ivf(embeddings, k) + pipeline = self._get_pipeline_vector_ivf(embeddings, k, pre_filter) elif kind == CosmosDBVectorSearchType.VECTOR_HNSW: - pipeline = self._get_pipeline_vector_hnsw(embeddings, k, ef_search) + pipeline = self._get_pipeline_vector_hnsw( + embeddings, k, ef_search, pre_filter + ) cursor = self._collection.aggregate(pipeline) @@ -433,28 +458,32 @@ class AzureCosmosDBVectorSearch(VectorStore): score = res.pop("similarityScore") if score < score_threshold: continue - document_object_field = ( - res.pop("document") - if kind == CosmosDBVectorSearchType.VECTOR_IVF - else res - ) + document_object_field = res.pop("document") text = document_object_field.pop(self._text_key) - docs.append( - (Document(page_content=text, metadata=document_object_field), score) - ) + metadata = document_object_field.pop("metadata") + if with_embedding: + metadata[self._embedding_key] = document_object_field.pop( + self._embedding_key + ) + + docs.append((Document(page_content=text, metadata=metadata), score)) return docs def _get_pipeline_vector_ivf( - self, embeddings: List[float], k: int = 4 + self, embeddings: List[float], k: int = 4, pre_filter: Optional[Dict] = None ) -> List[dict[str, Any]]: + params = { + "vector": embeddings, + "path": self._embedding_key, + "k": k, + } + if pre_filter: + params["filter"] = pre_filter + pipeline: List[dict[str, Any]] = [ { "$search": { - "cosmosSearch": { - "vector": embeddings, - "path": self._embedding_key, - "k": k, - }, + "cosmosSearch": params, "returnStoredSource": True, } }, @@ -468,17 +497,25 @@ class AzureCosmosDBVectorSearch(VectorStore): return pipeline def _get_pipeline_vector_hnsw( - self, embeddings: List[float], k: int = 4, ef_search: int = 40 + self, + embeddings: List[float], + k: int = 4, + ef_search: int = 40, + pre_filter: 
Optional[Dict] = None, ) -> List[dict[str, Any]]: + params = { + "vector": embeddings, + "path": self._embedding_key, + "k": k, + "efSearch": ef_search, + } + if pre_filter: + params["filter"] = pre_filter + pipeline: List[dict[str, Any]] = [ { "$search": { - "cosmosSearch": { - "vector": embeddings, - "path": self._embedding_key, - "k": k, - "efSearch": ef_search, - }, + "cosmosSearch": params, } }, { @@ -495,16 +532,20 @@ class AzureCosmosDBVectorSearch(VectorStore): query: str, k: int = 4, kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF, + pre_filter: Optional[Dict] = None, ef_search: int = 40, score_threshold: float = 0.0, + with_embedding: bool = False, ) -> List[Tuple[Document, float]]: embeddings = self._embedding.embed_query(query) docs = self._similarity_search_with_score( embeddings=embeddings, k=k, kind=kind, + pre_filter=pre_filter, ef_search=ef_search, score_threshold=score_threshold, + with_embedding=with_embedding, ) return docs @@ -513,16 +554,20 @@ class AzureCosmosDBVectorSearch(VectorStore): query: str, k: int = 4, kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF, + pre_filter: Optional[Dict] = None, ef_search: int = 40, score_threshold: float = 0.0, + with_embedding: bool = False, **kwargs: Any, ) -> List[Document]: docs_and_scores = self.similarity_search_with_score( query, k=k, kind=kind, + pre_filter=pre_filter, ef_search=ef_search, score_threshold=score_threshold, + with_embedding=with_embedding, ) return [doc for doc, _ in docs_and_scores] @@ -533,8 +578,10 @@ class AzureCosmosDBVectorSearch(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF, + pre_filter: Optional[Dict] = None, ef_search: int = 40, score_threshold: float = 0.0, + with_embedding: bool = False, **kwargs: Any, ) -> List[Document]: # Retrieves the docs with similarity scores @@ -543,8 +590,10 @@ class AzureCosmosDBVectorSearch(VectorStore): embedding, 
k=fetch_k, kind=kind, + pre_filter=pre_filter, ef_search=ef_search, score_threshold=score_threshold, + with_embedding=with_embedding, ) # Re-ranks the docs using MMR @@ -564,8 +613,10 @@ class AzureCosmosDBVectorSearch(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF, + pre_filter: Optional[Dict] = None, ef_search: int = 40, score_threshold: float = 0.0, + with_embedding: bool = False, **kwargs: Any, ) -> List[Document]: # compute the embeddings vector from the query string @@ -577,8 +628,10 @@ class AzureCosmosDBVectorSearch(VectorStore): fetch_k=fetch_k, lambda_mult=lambda_mult, kind=kind, + pre_filter=pre_filter, ef_search=ef_search, score_threshold=score_threshold, + with_embedding=with_embedding, ) return docs diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index 5be52fb02c7..8a671da921c 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -162,7 +162,12 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore): text_key = "text" to_insert = [ - {"id": str(uuid.uuid4()), text_key: t, self._embedding_key: embedding, **m} + { + "id": str(uuid.uuid4()), + text_key: t, + self._embedding_key: embedding, + "metadata": m, + } for t, m, embedding in zip(texts, metadatas, embeddings) ] # insert the documents in CosmosDB No Sql @@ -184,6 +189,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore): cosmos_database_properties: Dict[str, Any], database_name: str = "vectorSearchDB", container_name: str = "vectorSearchContainer", + create_container: bool = True, **kwargs: Any, ) -> AzureCosmosDBNoSqlVectorSearch: if kwargs: @@ -204,6 +210,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore): cosmos_database_properties=cosmos_database_properties, 
database_name=database_name, container_name=container_name, + create_container=create_container, ) @classmethod @@ -257,41 +264,83 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore): self, embeddings: List[float], k: int = 4, + pre_filter: Optional[Dict] = None, + with_embedding: bool = False, ) -> List[Tuple[Document, float]]: - query = ( - "SELECT TOP {} c.id, c.{}, c.text, VectorDistance(c.{}, {}) AS " - "SimilarityScore FROM c ORDER BY VectorDistance(c.{}, {})".format( - k, - self._embedding_key, - self._embedding_key, - embeddings, - self._embedding_key, - embeddings, - ) + query = "SELECT " + + # If limit_offset_clause is not specified, add TOP clause + if pre_filter is None or pre_filter.get("limit_offset_clause") is None: + query += "TOP @limit " + + query += ( + "c.id, c.{}, c.text, c.metadata, " + "VectorDistance(c.@embeddingKey, @embeddings) AS SimilarityScore FROM c" ) + + # Add where_clause if specified + if pre_filter is not None and pre_filter.get("where_clause") is not None: + query += " {}".format(pre_filter["where_clause"]) + + query += " ORDER BY VectorDistance(c.@embeddingKey, @embeddings)" + + # Add limit_offset_clause if specified + if pre_filter is not None and pre_filter.get("limit_offset_clause") is not None: + query += " {}".format(pre_filter["limit_offset_clause"]) + parameters = [ + {"name": "@limit", "value": k}, + {"name": "@embeddingKey", "value": self._embedding_key}, + {"name": "@embeddings", "value": embeddings}, + ] + docs_and_scores = [] + items = list( - self._container.query_items(query=query, enable_cross_partition_query=True) + self._container.query_items( + query=query, parameters=parameters, enable_cross_partition_query=True + ) ) for item in items: text = item["text"] + metadata = item["metadata"] score = item["SimilarityScore"] - docs_and_scores.append((Document(page_content=text, metadata=item), score)) + if with_embedding: + metadata[self._embedding_key] = item[self._embedding_key] + docs_and_scores.append( + 
(Document(page_content=text, metadata=metadata), score) + ) return docs_and_scores def similarity_search_with_score( self, query: str, k: int = 4, + pre_filter: Optional[Dict] = None, + with_embedding: bool = False, ) -> List[Tuple[Document, float]]: embeddings = self._embedding.embed_query(query) - docs_and_scores = self._similarity_search_with_score(embeddings=embeddings, k=k) + docs_and_scores = self._similarity_search_with_score( + embeddings=embeddings, + k=k, + pre_filter=pre_filter, + with_embedding=with_embedding, + ) return docs_and_scores def similarity_search( - self, query: str, k: int = 4, **kwargs: Any + self, + query: str, + k: int = 4, + pre_filter: Optional[Dict] = None, + with_embedding: bool = False, + **kwargs: Any, ) -> List[Document]: - docs_and_scores = self.similarity_search_with_score(query, k=k) + docs_and_scores = self.similarity_search_with_score( + query, + k=k, + pre_filter=pre_filter, + with_embedding=with_embedding, + ) return [doc for doc, _ in docs_and_scores] @@ -304,7 +353,18 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore): **kwargs: Any, ) -> List[Document]: # Retrieves the docs with similarity scores - docs = self._similarity_search_with_score(embeddings=embedding, k=fetch_k) + pre_filter = {} + with_embedding = False + if kwargs.get("pre_filter"): + pre_filter = kwargs["pre_filter"] + if kwargs.get("with_embedding"): + with_embedding = kwargs["with_embedding"] + docs = self._similarity_search_with_score( + embeddings=embedding, + k=fetch_k, + pre_filter=pre_filter, + with_embedding=with_embedding, + ) # Re-ranks the docs using MMR mmr_doc_indexes = maximal_marginal_relevance( @@ -326,6 +386,12 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore): **kwargs: Any, ) -> List[Document]: # compute the embeddings vector from the query string + pre_filter = {} + with_embedding = False + if kwargs.get("pre_filter"): + pre_filter = kwargs["pre_filter"] + if kwargs.get("with_embedding"): + with_embedding = kwargs["with_embedding"] embeddings = 
self._embedding.embed_query(query) docs = self.max_marginal_relevance_search_by_vector( @@ -333,5 +399,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore): k=k, fetch_k=fetch_k, lambda_mult=lambda_mult, + pre_filter=pre_filter, + with_embedding=with_embedding, ) return docs diff --git a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py index b76bba231a2..7555a6bd560 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py +++ b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py @@ -25,7 +25,7 @@ model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002") INDEX_NAME = "langchain-test-index" INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw" NAMESPACE = "langchain_test_db.langchain_test_collection" -CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "") +CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "") DB_NAME, COLLECTION_NAME = NAMESPACE.split(".") num_lists = 3 diff --git a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db_no_sql.py b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db_no_sql.py index 73659e72215..c8a8f87a599 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db_no_sql.py +++ b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db_no_sql.py @@ -104,6 +104,7 @@ class TestAzureCosmosDBNoSqlVectorSearch: ), indexing_policy=get_vector_indexing_policy("flat"), cosmos_container_properties={"partition_key": partition_key}, + cosmos_database_properties={}, ) sleep(1) # waits for Cosmos DB to save contents to the collection @@ -139,6 +140,7 @@ class TestAzureCosmosDBNoSqlVectorSearch: ), 
indexing_policy=get_vector_indexing_policy("flat"), cosmos_container_properties={"partition_key": partition_key}, + cosmos_database_properties={}, ) sleep(1) # waits for Cosmos DB to save contents to the collection @@ -154,3 +156,60 @@ class TestAzureCosmosDBNoSqlVectorSearch: assert output2 assert output2[0].page_content != "Dogs are tough." safe_delete_database(cosmos_client) + + def test_from_documents_cosine_distance_with_filtering( + self, + cosmos_client: Any, + partition_key: Any, + azure_openai_embeddings: OpenAIEmbeddings, + ) -> None: + """Test end to end construction and search.""" + documents = [ + Document(page_content="Dogs are tough.", metadata={"a": 1}), + Document(page_content="Cats have fluff.", metadata={"a": 1}), + Document(page_content="What is a sandwich?", metadata={"c": 1}), + Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}), + ] + + store = AzureCosmosDBNoSqlVectorSearch.from_documents( + documents, + azure_openai_embeddings, + cosmos_client=cosmos_client, + database_name=database_name, + container_name=container_name, + vector_embedding_policy=get_vector_embedding_policy( + "cosine", "float32", 400 + ), + indexing_policy=get_vector_indexing_policy("flat"), + cosmos_container_properties={"partition_key": partition_key}, + cosmos_database_properties={}, + ) + sleep(1) # waits for Cosmos DB to save contents to the collection + + output = store.similarity_search("Dogs", k=4) + assert len(output) == 4 + assert output[0].page_content == "Dogs are tough." + assert output[0].metadata["a"] == 1 + + pre_filter = { + "where_clause": "WHERE c.metadata.a=1", + } + output = store.similarity_search( + "Dogs", k=4, pre_filter=pre_filter, with_embedding=True + ) + + assert len(output) == 2 + assert output[0].page_content == "Dogs are tough." 
+ assert output[0].metadata["a"] == 1 + + pre_filter = { + "where_clause": "WHERE c.metadata.a=1", + "limit_offset_clause": "OFFSET 0 LIMIT 1", + } + + output = store.similarity_search("Dogs", k=4, pre_filter=pre_filter) + + assert len(output) == 1 + assert output[0].page_content == "Dogs are tough." + assert output[0].metadata["a"] == 1 + safe_delete_database(cosmos_client)