From 0f45ac4088126746575d40959dabaadd787faeb6 Mon Sep 17 00:00:00 2001
From: Aayush Kataria <aayushkataria3011@gmail.com>
Date: Tue, 23 Jul 2024 16:59:23 -0700
Subject: [PATCH] LangChain Community: VectorStores: Azure Cosmos DB Filtered
 Vector Search (#24087)

Thank you for contributing to LangChain!

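- This PR adds filtered vector search for Azure Cosmos DB Mongo vCore
and NoSQL: the search methods accept an optional `pre_filter` and a
`with_embedding` flag, and document metadata is now stored under a
nested `metadata` field.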


- [ ] **PR message**: ***Delete this entire checklist*** and replace
with
    - **Description:** a description of the change
    - **Issue:** the issue # it fixes, if applicable
    - **Dependencies:** any dependencies required for this change
- **Twitter handle:** if your PR gets announced, and you'd like a
mention, we'll gladly shout you out!


- [ ] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.


- [ ] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
---
 .../vectorstores/azure_cosmos_db.py           | 101 ++++++++++++-----
 .../vectorstores/azure_cosmos_db_no_sql.py    | 102 +++++++++++++++---
 .../vectorstores/test_azure_cosmos_db.py      |   2 +-
 .../test_azure_cosmos_db_no_sql.py            |  59 ++++++++++
 4 files changed, 222 insertions(+), 42 deletions(-)

diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db.py
index 559296c71a6..92a450bd86f 100644
--- a/libs/community/langchain_community/vectorstores/azure_cosmos_db.py
+++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db.py
@@ -306,6 +306,27 @@ class AzureCosmosDBVectorSearch(VectorStore):
         }
         return command
 
+    def create_filter_index(
+        self,
+        property_to_filter: str,
+        index_name: str,
+    ) -> dict[str, Any]:
+        command = {
+            "createIndexes": self._collection.name,
+            "indexes": [
+                {
+                    "key": {property_to_filter: 1},
+                    "name": index_name,
+                }
+            ],
+        }
+        # retrieve the database object
+        current_database = self._collection.database
+
+        # invoke the command from the database object
+        create_index_responses: dict[str, Any] = current_database.command(command)
+        return create_index_responses
+
     def add_texts(
         self,
         texts: Iterable[str],
@@ -345,7 +366,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
         # Embed and create the documents
         embeddings = self._embedding.embed_documents(texts)
         to_insert = [
-            {self._text_key: t, self._embedding_key: embedding, **m}
+            {self._text_key: t, self._embedding_key: embedding, "metadata": m}
             for t, m, embedding in zip(texts, metadatas, embeddings)
         ]
         # insert the documents in Cosmos DB
@@ -397,8 +418,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
         embeddings: List[float],
         k: int = 4,
         kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
+        pre_filter: Optional[Dict] = None,
         ef_search: int = 40,
         score_threshold: float = 0.0,
+        with_embedding: bool = False,
     ) -> List[Tuple[Document, float]]:
         """Returns a list of documents with their scores
 
@@ -422,9 +445,11 @@ class AzureCosmosDBVectorSearch(VectorStore):
         """
         pipeline: List[dict[str, Any]] = []
         if kind == CosmosDBVectorSearchType.VECTOR_IVF:
-            pipeline = self._get_pipeline_vector_ivf(embeddings, k)
+            pipeline = self._get_pipeline_vector_ivf(embeddings, k, pre_filter)
         elif kind == CosmosDBVectorSearchType.VECTOR_HNSW:
-            pipeline = self._get_pipeline_vector_hnsw(embeddings, k, ef_search)
+            pipeline = self._get_pipeline_vector_hnsw(
+                embeddings, k, ef_search, pre_filter
+            )
 
         cursor = self._collection.aggregate(pipeline)
 
@@ -433,28 +458,32 @@ class AzureCosmosDBVectorSearch(VectorStore):
             score = res.pop("similarityScore")
             if score < score_threshold:
                 continue
-            document_object_field = (
-                res.pop("document")
-                if kind == CosmosDBVectorSearchType.VECTOR_IVF
-                else res
-            )
+            document_object_field = res.pop("document")
             text = document_object_field.pop(self._text_key)
-            docs.append(
-                (Document(page_content=text, metadata=document_object_field), score)
-            )
+            metadata = document_object_field.pop("metadata")
+            if with_embedding:
+                metadata[self._embedding_key] = document_object_field.pop(
+                    self._embedding_key
+                )
+
+            docs.append((Document(page_content=text, metadata=metadata), score))
         return docs
 
     def _get_pipeline_vector_ivf(
-        self, embeddings: List[float], k: int = 4
+        self, embeddings: List[float], k: int = 4, pre_filter: Optional[Dict] = None
     ) -> List[dict[str, Any]]:
+        params = {
+            "vector": embeddings,
+            "path": self._embedding_key,
+            "k": k,
+        }
+        if pre_filter:
+            params["filter"] = pre_filter
+
         pipeline: List[dict[str, Any]] = [
             {
                 "$search": {
-                    "cosmosSearch": {
-                        "vector": embeddings,
-                        "path": self._embedding_key,
-                        "k": k,
-                    },
+                    "cosmosSearch": params,
                     "returnStoredSource": True,
                 }
             },
@@ -468,17 +497,25 @@ class AzureCosmosDBVectorSearch(VectorStore):
         return pipeline
 
     def _get_pipeline_vector_hnsw(
-        self, embeddings: List[float], k: int = 4, ef_search: int = 40
+        self,
+        embeddings: List[float],
+        k: int = 4,
+        ef_search: int = 40,
+        pre_filter: Optional[Dict] = None,
     ) -> List[dict[str, Any]]:
+        params = {
+            "vector": embeddings,
+            "path": self._embedding_key,
+            "k": k,
+            "efSearch": ef_search,
+        }
+        if pre_filter:
+            params["filter"] = pre_filter
+
         pipeline: List[dict[str, Any]] = [
             {
                 "$search": {
-                    "cosmosSearch": {
-                        "vector": embeddings,
-                        "path": self._embedding_key,
-                        "k": k,
-                        "efSearch": ef_search,
-                    },
+                    "cosmosSearch": params,
                 }
             },
             {
@@ -495,16 +532,20 @@ class AzureCosmosDBVectorSearch(VectorStore):
         query: str,
         k: int = 4,
         kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
+        pre_filter: Optional[Dict] = None,
         ef_search: int = 40,
         score_threshold: float = 0.0,
+        with_embedding: bool = False,
     ) -> List[Tuple[Document, float]]:
         embeddings = self._embedding.embed_query(query)
         docs = self._similarity_search_with_score(
             embeddings=embeddings,
             k=k,
             kind=kind,
+            pre_filter=pre_filter,
             ef_search=ef_search,
             score_threshold=score_threshold,
+            with_embedding=with_embedding,
         )
         return docs
 
@@ -513,16 +554,20 @@ class AzureCosmosDBVectorSearch(VectorStore):
         query: str,
         k: int = 4,
         kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
+        pre_filter: Optional[Dict] = None,
         ef_search: int = 40,
         score_threshold: float = 0.0,
+        with_embedding: bool = False,
         **kwargs: Any,
     ) -> List[Document]:
         docs_and_scores = self.similarity_search_with_score(
             query,
             k=k,
             kind=kind,
+            pre_filter=pre_filter,
             ef_search=ef_search,
             score_threshold=score_threshold,
+            with_embedding=with_embedding,
         )
         return [doc for doc, _ in docs_and_scores]
 
@@ -533,8 +578,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
         fetch_k: int = 20,
         lambda_mult: float = 0.5,
         kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
+        pre_filter: Optional[Dict] = None,
         ef_search: int = 40,
         score_threshold: float = 0.0,
+        with_embedding: bool = False,
         **kwargs: Any,
     ) -> List[Document]:
         # Retrieves the docs with similarity scores
@@ -543,8 +590,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
             embedding,
             k=fetch_k,
             kind=kind,
+            pre_filter=pre_filter,
             ef_search=ef_search,
             score_threshold=score_threshold,
+            with_embedding=with_embedding,
         )
 
         # Re-ranks the docs using MMR
@@ -564,8 +613,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
         fetch_k: int = 20,
         lambda_mult: float = 0.5,
         kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
+        pre_filter: Optional[Dict] = None,
         ef_search: int = 40,
         score_threshold: float = 0.0,
+        with_embedding: bool = False,
         **kwargs: Any,
     ) -> List[Document]:
         # compute the embeddings vector from the query string
@@ -577,8 +628,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
             fetch_k=fetch_k,
             lambda_mult=lambda_mult,
             kind=kind,
+            pre_filter=pre_filter,
             ef_search=ef_search,
             score_threshold=score_threshold,
+            with_embedding=with_embedding,
         )
         return docs
 
diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py
index 5be52fb02c7..8a671da921c 100644
--- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py
+++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py
@@ -162,7 +162,12 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
         text_key = "text"
 
         to_insert = [
-            {"id": str(uuid.uuid4()), text_key: t, self._embedding_key: embedding, **m}
+            {
+                "id": str(uuid.uuid4()),
+                text_key: t,
+                self._embedding_key: embedding,
+                "metadata": m,
+            }
             for t, m, embedding in zip(texts, metadatas, embeddings)
         ]
         # insert the documents in CosmosDB No Sql
@@ -184,6 +189,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
         cosmos_database_properties: Dict[str, Any],
         database_name: str = "vectorSearchDB",
         container_name: str = "vectorSearchContainer",
+        create_container: bool = True,
         **kwargs: Any,
     ) -> AzureCosmosDBNoSqlVectorSearch:
         if kwargs:
@@ -204,6 +210,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
             cosmos_database_properties=cosmos_database_properties,
             database_name=database_name,
             container_name=container_name,
+            create_container=create_container,
         )
 
     @classmethod
@@ -257,41 +264,83 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
         self,
         embeddings: List[float],
         k: int = 4,
+        pre_filter: Optional[Dict] = None,
+        with_embedding: bool = False,
     ) -> List[Tuple[Document, float]]:
-        query = (
-            "SELECT TOP {} c.id, c.{}, c.text, VectorDistance(c.{}, {}) AS "
-            "SimilarityScore FROM c ORDER BY VectorDistance(c.{}, {})".format(
-                k,
-                self._embedding_key,
-                self._embedding_key,
-                embeddings,
-                self._embedding_key,
-                embeddings,
-            )
+        query = "SELECT "
+
+        # If limit_offset_clause is not specified, add TOP clause
+        if pre_filter is None or pre_filter.get("limit_offset_clause") is None:
+            query += "TOP @limit "
+
+        # A property path is an identifier and cannot be bound as a parameter.
+        embedding_path = "c.{}".format(self._embedding_key)
+        distance = "VectorDistance({}, @embeddings)".format(embedding_path)
+        query += "c.id, {}, c.text, c.metadata, {} AS SimilarityScore FROM c".format(
+            embedding_path, distance
+        )
+
+        # Add where_clause if specified
+        if pre_filter is not None and pre_filter.get("where_clause") is not None:
+            query += " {}".format(pre_filter["where_clause"])
+        query += " ORDER BY {}".format(distance)
+
+        # Add limit_offset_clause if specified
+        if pre_filter is not None and pre_filter.get("limit_offset_clause") is not None:
+            query += " {}".format(pre_filter["limit_offset_clause"])
+        parameters = [
+            {"name": "@limit", "value": k},
+            {"name": "@embeddings", "value": embeddings},
+        ]
+
         docs_and_scores = []
+
         items = list(
-            self._container.query_items(query=query, enable_cross_partition_query=True)
+            self._container.query_items(
+                query=query, parameters=parameters, enable_cross_partition_query=True
+            )
         )
         for item in items:
             text = item["text"]
+            metadata = item["metadata"]
             score = item["SimilarityScore"]
-            docs_and_scores.append((Document(page_content=text, metadata=item), score))
+            if with_embedding:
+                metadata[self._embedding_key] = item[self._embedding_key]
+            docs_and_scores.append(
+                (Document(page_content=text, metadata=metadata), score)
+            )
         return docs_and_scores
 
     def similarity_search_with_score(
         self,
         query: str,
         k: int = 4,
+        pre_filter: Optional[Dict] = None,
+        with_embedding: bool = False,
     ) -> List[Tuple[Document, float]]:
         embeddings = self._embedding.embed_query(query)
-        docs_and_scores = self._similarity_search_with_score(embeddings=embeddings, k=k)
+        docs_and_scores = self._similarity_search_with_score(
+            embeddings=embeddings,
+            k=k,
+            pre_filter=pre_filter,
+            with_embedding=with_embedding,
+        )
         return docs_and_scores
 
     def similarity_search(
-        self, query: str, k: int = 4, **kwargs: Any
+        self,
+        query: str,
+        k: int = 4,
+        pre_filter: Optional[Dict] = None,
+        with_embedding: bool = False,
+        **kwargs: Any,
     ) -> List[Document]:
-        docs_and_scores = self.similarity_search_with_score(query, k=k)
+        docs_and_scores = self.similarity_search_with_score(
+            query,
+            k=k,
+            pre_filter=pre_filter,
+            with_embedding=with_embedding,
+        )
 
         return [doc for doc, _ in docs_and_scores]
 
@@ -304,7 +353,18 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
         **kwargs: Any,
     ) -> List[Document]:
         # Retrieves the docs with similarity scores
-        docs = self._similarity_search_with_score(embeddings=embedding, k=fetch_k)
+        # Both options are optional and arrive via **kwargs; .get() avoids
+        # the KeyError that direct indexing raised when a caller omitted them.
+        # Falsy values fall back to the defaults: an empty filter and no
+        # embedding copied into the returned metadata.
+        pre_filter = kwargs.get("pre_filter") or {}
+        with_embedding = kwargs.get("with_embedding") or False
+        docs = self._similarity_search_with_score(
+            embeddings=embedding,
+            k=fetch_k,
+            pre_filter=pre_filter,
+            with_embedding=with_embedding,
+        )
 
         # Re-ranks the docs using MMR
         mmr_doc_indexes = maximal_marginal_relevance(
@@ -326,6 +386,12 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
         **kwargs: Any,
     ) -> List[Document]:
         # compute the embeddings vector from the query string
+        # Both options are optional and arrive via **kwargs; .get() avoids
+        # the KeyError that direct indexing raised when a caller omitted them.
+        # Falsy values fall back to the defaults: an empty filter and no
+        # embedding copied into the returned metadata.
+        pre_filter = kwargs.get("pre_filter") or {}
+        with_embedding = kwargs.get("with_embedding") or False
         embeddings = self._embedding.embed_query(query)
 
         docs = self.max_marginal_relevance_search_by_vector(
@@ -333,5 +399,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
             k=k,
             fetch_k=fetch_k,
             lambda_mult=lambda_mult,
+            pre_filter=pre_filter,
+            with_embedding=with_embedding,
         )
         return docs
diff --git a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py
index b76bba231a2..7555a6bd560 100644
--- a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py
@@ -25,7 +25,7 @@ model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
 INDEX_NAME = "langchain-test-index"
 INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw"
 NAMESPACE = "langchain_test_db.langchain_test_collection"
-CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
+CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
 DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
 
 num_lists = 3
diff --git a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db_no_sql.py b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db_no_sql.py
index 73659e72215..c8a8f87a599 100644
--- a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db_no_sql.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db_no_sql.py
@@ -104,6 +104,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
             ),
             indexing_policy=get_vector_indexing_policy("flat"),
             cosmos_container_properties={"partition_key": partition_key},
+            cosmos_database_properties={},
         )
         sleep(1)  # waits for Cosmos DB to save contents to the collection
 
@@ -139,6 +140,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
             ),
             indexing_policy=get_vector_indexing_policy("flat"),
             cosmos_container_properties={"partition_key": partition_key},
+            cosmos_database_properties={},
         )
         sleep(1)  # waits for Cosmos DB to save contents to the collection
 
@@ -154,3 +156,60 @@ class TestAzureCosmosDBNoSqlVectorSearch:
         assert output2
         assert output2[0].page_content != "Dogs are tough."
         safe_delete_database(cosmos_client)
+
+    def test_from_documents_cosine_distance_with_filtering(
+        self,
+        cosmos_client: Any,
+        partition_key: Any,
+        azure_openai_embeddings: OpenAIEmbeddings,
+    ) -> None:
+        """Test end to end construction and search."""
+        documents = [
+            Document(page_content="Dogs are tough.", metadata={"a": 1}),
+            Document(page_content="Cats have fluff.", metadata={"a": 1}),
+            Document(page_content="What is a sandwich?", metadata={"c": 1}),
+            Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
+        ]
+
+        store = AzureCosmosDBNoSqlVectorSearch.from_documents(
+            documents,
+            azure_openai_embeddings,
+            cosmos_client=cosmos_client,
+            database_name=database_name,
+            container_name=container_name,
+            vector_embedding_policy=get_vector_embedding_policy(
+                "cosine", "float32", 400
+            ),
+            indexing_policy=get_vector_indexing_policy("flat"),
+            cosmos_container_properties={"partition_key": partition_key},
+            cosmos_database_properties={},
+        )
+        sleep(1)  # waits for Cosmos DB to save contents to the collection
+
+        output = store.similarity_search("Dogs", k=4)
+        assert len(output) == 4
+        assert output[0].page_content == "Dogs are tough."
+        assert output[0].metadata["a"] == 1
+
+        pre_filter = {
+            "where_clause": "WHERE c.metadata.a=1",
+        }
+        output = store.similarity_search(
+            "Dogs", k=4, pre_filter=pre_filter, with_embedding=True
+        )
+
+        assert len(output) == 2
+        assert output[0].page_content == "Dogs are tough."
+        assert output[0].metadata["a"] == 1
+
+        pre_filter = {
+            "where_clause": "WHERE c.metadata.a=1",
+            "limit_offset_clause": "OFFSET 0 LIMIT 1",
+        }
+
+        output = store.similarity_search("Dogs", k=4, pre_filter=pre_filter)
+
+        assert len(output) == 1
+        assert output[0].page_content == "Dogs are tough."
+        assert output[0].metadata["a"] == 1
+        safe_delete_database(cosmos_client)