Community: Azure CosmosDB No Sql Vector Store: Full Text and Hybrid Search Support (#28716)

Thank you for contributing to LangChain! - Added [full text](https://learn.microsoft.com/en-us/azure/cosmos-db/gen-ai/full-text-search) and [hybrid search](https://learn.microsoft.com/en-us/azure/cosmos-db/gen-ai/hybrid-search) support for the Azure CosmosDB NoSql Vector Store - Added a new enum called CosmosDBQueryType which supports the following values: - VECTOR = "vector" - FULL_TEXT_SEARCH = "full_text_search" - FULL_TEXT_RANK = "full_text_rank" - HYBRID = "hybrid" - Users now need to provide this query_type to the similarity_search method so that the vector store makes the correct query API call. - Added a couple of workarounds, because the FULL_TEXT_RANK and HYBRID query functions do not support parameterized queries right now. I have added TODOs in place, and will remove these workarounds by the end of January. - Added the necessary test cases and updated the documentation notebook. - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Erick Friis <erickfriis@gmail.com>
2025-06-26 16:43:35 +00:00 · 2024-12-15 13:26:32 -08:00 · 2024-12-15 13:26:32 -08:00 · d417e4b372
commit d417e4b372
parent 4c1871d9a8
4 changed files with 1269 additions and 170 deletions
--- a/docs/docs/integrations/vectorstores/azure_cosmos_db_no_sql.ipynb
+++ b/docs/docs/integrations/vectorstores/azure_cosmos_db_no_sql.ipynb
--- a/libs/community/langchain_community/vectorstores/azure_cosmos_db.py
+++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db.py
@ -131,6 +131,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
            connection_string: The MongoDB vCore instance connection string
            namespace: The namespace (database.collection)
            embedding: The embedding utility
            application_name: The user agent for telemetry
            **kwargs: Dynamic keyword arguments
        Returns:
--- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py
+++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py
@ -2,17 +2,42 @@ from __future__ import annotations
 import uuid
 import warnings
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
 import numpy as np
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_core.vectorstores import VectorStore
 from pydantic import BaseModel, Field
 from langchain_community.vectorstores.utils import maximal_marginal_relevance
 if TYPE_CHECKING:
-    from azure.cosmos.cosmos_client import CosmosClient
+    from azure.cosmos import CosmosClient
    from azure.identity import DefaultAzureCredential
 USER_AGENT = ("LangChain-CDBNoSql-VectorStore-Python",)
 class Condition(BaseModel):
    """A single filter condition used to build the WHERE clause of a query."""

    # Name of the item property the condition applies to; the where-clause
    # builder interpolates it into the query as ``c.<property>``.
    property: str
    # Key into the where-clause operator map (for example "$eq" or
    # "$full_text_contains"); unknown keys are rejected with a ValueError.
    operator: str
    # Value to compare against. Operators whose name contains "full_text"
    # require a string, which is split on whitespace into search terms.
    value: Any
 class PreFilter(BaseModel):
    """A group of conditions applied as a WHERE clause before ranking."""

    # Conditions to render; each one becomes a clause of the WHERE statement.
    conditions: List[Condition] = Field(default_factory=list)
    # Optional key into the where-clause operator map (for example "$and" or
    # "$or") whose SQL form is used to join the rendered clauses.
    logical_operator: Optional[str] = None
 class CosmosDBQueryType(str, Enum):
    """Kind of query the vector store issues against the Cosmos DB container.

    The members are plain strings (the class mixes in ``str``), so they compare
    equal to their string values.
    """

    # Results ordered by VectorDistance against the query embedding.
    VECTOR = "vector"
    # No ORDER BY is added; filtering comes only from the pre_filter.
    FULL_TEXT_SEARCH = "full_text_search"
    # Results ordered by ``RANK FullTextScore`` over the query's terms.
    FULL_TEXT_RANK = "full_text_rank"
    # Results ordered by ``RANK RRF`` of FullTextScore and VectorDistance.
    HYBRID = "hybrid"
 class AzureCosmosDBNoSqlVectorSearch(VectorStore):
@ -21,8 +46,11 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
    To use, you should have both:
        - the ``azure-cosmos`` python package installed
-    You can read more about vector search using AzureCosmosDBNoSQL here:
+    You can read more about vector search, full text search
    and hybrid search using AzureCosmosDBNoSQL here:
    https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/vector-search
    https://learn.microsoft.com/en-us/azure/cosmos-db/gen-ai/full-text-search
    https://learn.microsoft.com/en-us/azure/cosmos-db/gen-ai/hybrid-search
    """
    def __init__(
@ -34,9 +62,14 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
        indexing_policy: Dict[str, Any],
        cosmos_container_properties: Dict[str, Any],
        cosmos_database_properties: Dict[str, Any],
        full_text_policy: Optional[Dict[str, Any]] = None,
        database_name: str = "vectorSearchDB",
        container_name: str = "vectorSearchContainer",
        text_key: str = "text",
        embedding_key: str = "embedding",
        metadata_key: str = "metadata",
        create_container: bool = True,
        full_text_search_enabled: bool = False,
    ):
        """
        Constructor for AzureCosmosDBNoSqlVectorSearch
@ -47,30 +80,42 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
            container_name: Name of the container to be created.
            embedding: Text embedding model to use.
            vector_embedding_policy: Vector Embedding Policy for the container.
            full_text_policy: Full Text Policy for the container.
            indexing_policy: Indexing Policy for the container.
            cosmos_container_properties: Container Properties for the container.
            cosmos_database_properties: Database Properties for the container.
            text_key: Text key to use for text property which will be
                      embedded in the data schema.
            embedding_key: Embedding key to use for vector embedding.
            metadata_key: Metadata key to use for data schema.
            create_container: Set to true if the container does not exist.
            full_text_search_enabled: Set to true if the full text search is enabled.
        """
        self._cosmos_client = cosmos_client
        self._database_name = database_name
        self._container_name = container_name
        self._embedding = embedding
        self._vector_embedding_policy = vector_embedding_policy
        self._full_text_policy = full_text_policy
        self._indexing_policy = indexing_policy
        self._cosmos_container_properties = cosmos_container_properties
        self._cosmos_database_properties = cosmos_database_properties
        self._text_key = text_key
        self._embedding_key = embedding_key
        self._metadata_key = metadata_key
        self._create_container = create_container
        self._full_text_search_enabled = full_text_search_enabled
        if self._create_container:
            if (
-                indexing_policy["vectorIndexes"] is None
+                self._indexing_policy["vectorIndexes"] is None
-                or len(indexing_policy["vectorIndexes"]) == 0
+                or len(self._indexing_policy["vectorIndexes"]) == 0
            ):
                raise ValueError(
                    "vectorIndexes cannot be null or empty in the indexing_policy."
                )
            if (
-                vector_embedding_policy is None
+                self._vector_embedding_policy is None
                or len(vector_embedding_policy["vectorEmbeddings"]) == 0
            ):
                raise ValueError(
@ -81,6 +126,23 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
                raise ValueError(
                    "partition_key cannot be null or empty for a container."
                )
            if self._full_text_search_enabled:
                if (
                    self._indexing_policy["fullTextIndexes"] is None
                    or len(self._indexing_policy["fullTextIndexes"]) == 0
                ):
                    raise ValueError(
                        "fullTextIndexes cannot be null or empty in the "
                        "indexing_policy if full text search is enabled."
                    )
                if (
                    self._full_text_policy is None
                    or len(self._full_text_policy["fullTextPaths"]) == 0
                ):
                    raise ValueError(
                        "fullTextPaths cannot be null or empty in the "
                        "full_text_policy if full text search is enabled."
                    )
        # Create the database if it already doesn't exist
        self._database = self._cosmos_client.create_database_if_not_exists(
@ -116,12 +178,9 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
            session_token=self._cosmos_container_properties.get("session_token"),
            initial_headers=self._cosmos_container_properties.get("initial_headers"),
            vector_embedding_policy=self._vector_embedding_policy,
            full_text_policy=self._full_text_policy,
        )
        self._embedding_key = self._vector_embedding_policy["vectorEmbeddings"][0][
            "path"
        ][1:]
    def add_texts(
        self,
        texts: Iterable[str],
@ -187,9 +246,14 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
        indexing_policy: Dict[str, Any],
        cosmos_container_properties: Dict[str, Any],
        cosmos_database_properties: Dict[str, Any],
        full_text_policy: Optional[Dict[str, Any]] = None,
        database_name: str = "vectorSearchDB",
        container_name: str = "vectorSearchContainer",
        text_key: str = "text",
        embedding_key: str = "embedding",
        metadata_key: str = "metadata",
        create_container: bool = True,
        full_text_search_enabled: bool = False,
        **kwargs: Any,
    ) -> AzureCosmosDBNoSqlVectorSearch:
        if kwargs:
@ -205,12 +269,17 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
            embedding=embedding,
            cosmos_client=cosmos_client,
            vector_embedding_policy=vector_embedding_policy,
            full_text_policy=full_text_policy,
            indexing_policy=indexing_policy,
            cosmos_container_properties=cosmos_container_properties,
            cosmos_database_properties=cosmos_database_properties,
            database_name=database_name,
            container_name=container_name,
            text_key=text_key,
            embedding_key=embedding_key,
            metadata_key=metadata_key,
            create_container=create_container,
            full_text_search_enabled=full_text_search_enabled,
        )
    @classmethod
@ -242,6 +311,46 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
        )
        return vectorstore
    @classmethod
    def from_connection_string_and_aad(
        cls,
        connection_string: str,
        defaultAzureCredential: DefaultAzureCredential,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> AzureCosmosDBNoSqlVectorSearch:
        """Create a vector store using an AAD credential and add texts to it.

        Args:
            connection_string: Passed as the first argument to ``CosmosClient``.
            defaultAzureCredential: Credential passed to ``CosmosClient``.
            texts: Texts to add to the newly created store.
            embedding: Text embedding model to use.
            metadatas: Optional metadata dicts, one per text.
            **kwargs: Remaining store options, forwarded to ``_from_kwargs``.

        Returns:
            The vector store with ``texts`` already added.

        Raises:
            ImportError: If the ``azure-cosmos`` package is not installed.
        """
        # The module-level import of CosmosClient visible in this file is
        # guarded by TYPE_CHECKING, so the class has to be imported here to be
        # available when this method actually runs.
        try:
            from azure.cosmos import CosmosClient
        except ImportError as exc:
            raise ImportError(
                "Could not import the azure-cosmos python package. "
                "Please install it with `pip install azure-cosmos`."
            ) from exc

        cosmos_client = CosmosClient(
            connection_string, defaultAzureCredential, user_agent=USER_AGENT
        )
        kwargs["cosmos_client"] = cosmos_client
        vectorstore = cls._from_kwargs(embedding, **kwargs)
        vectorstore.add_texts(
            texts=texts,
            metadatas=metadatas,
        )
        return vectorstore
    @classmethod
    def from_connection_string_and_key(
        cls,
        connection_string: str,
        key: str,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> AzureCosmosDBNoSqlVectorSearch:
        """Create a vector store using an account key and add texts to it.

        Args:
            connection_string: Passed as the first argument to ``CosmosClient``.
            key: Account key passed to ``CosmosClient`` as the credential.
            texts: Texts to add to the newly created store.
            embedding: Text embedding model to use.
            metadatas: Optional metadata dicts, one per text.
            **kwargs: Remaining store options, forwarded to ``_from_kwargs``.

        Returns:
            The vector store with ``texts`` already added.

        Raises:
            ImportError: If the ``azure-cosmos`` package is not installed.
        """
        # The module-level import of CosmosClient visible in this file is
        # guarded by TYPE_CHECKING, so the class has to be imported here to be
        # available when this method actually runs.
        try:
            from azure.cosmos import CosmosClient
        except ImportError as exc:
            raise ImportError(
                "Could not import the azure-cosmos python package. "
                "Please install it with `pip install azure-cosmos`."
            ) from exc

        cosmos_client = CosmosClient(connection_string, key, user_agent=USER_AGENT)
        kwargs["cosmos_client"] = cosmos_client
        vectorstore = cls._from_kwargs(embedding, **kwargs)
        vectorstore.add_texts(
            texts=texts,
            metadatas=metadatas,
        )
        return vectorstore
    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        if ids is None:
            raise ValueError("No document ids provided to delete.")
@ -262,68 +371,141 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
    def _similarity_search_with_score(
        self,
        query_type: CosmosDBQueryType,
        embeddings: List[float],
        k: int = 4,
-        pre_filter: Optional[Dict] = None,
+        pre_filter: Optional[PreFilter] = None,
        with_embedding: bool = False,
        offset_limit: Optional[str] = None,
        *,
        projection_mapping: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
-        query = "SELECT "
+        query, parameters = self._construct_query(
-
+            k=k,
-        # If limit_offset_clause is not specified, add TOP clause
+            query_type=query_type,
-        if pre_filter is None or pre_filter.get("limit_offset_clause") is None:
+            embeddings=embeddings,
-            query += "TOP @limit "
+            pre_filter=pre_filter,
-
+            offset_limit=offset_limit,
-        query += (
+            projection_mapping=projection_mapping,
            "c.id, c[@embeddingKey], c.text, c.metadata, "
            "VectorDistance(c[@embeddingKey], @embeddings) AS SimilarityScore FROM c"
        )
-        # Add where_clause if specified
+        return self._execute_query(
-        if pre_filter is not None and pre_filter.get("where_clause") is not None:
+            query=query,
-            query += " {}".format(pre_filter["where_clause"])
+            query_type=query_type,
-
+            parameters=parameters,
-        query += " ORDER BY VectorDistance(c[@embeddingKey], @embeddings)"
+            with_embedding=with_embedding,
-
+            projection_mapping=projection_mapping,
        # Add limit_offset_clause if specified
        if pre_filter is not None and pre_filter.get("limit_offset_clause") is not None:
            query += " {}".format(pre_filter["limit_offset_clause"])
        parameters = [
            {"name": "@limit", "value": k},
            {"name": "@embeddingKey", "value": self._embedding_key},
            {"name": "@embeddings", "value": embeddings},
        ]
        docs_and_scores = []
        items = list(
            self._container.query_items(
                query=query, parameters=parameters, enable_cross_partition_query=True
        )
    def _full_text_search(
        self,
        query_type: CosmosDBQueryType,
        search_text: Optional[str] = None,
        k: int = 4,
        pre_filter: Optional[PreFilter] = None,
        offset_limit: Optional[str] = None,
        *,
        projection_mapping: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        query, parameters = self._construct_query(
            k=k,
            query_type=query_type,
            search_text=search_text,
            pre_filter=pre_filter,
            offset_limit=offset_limit,
            projection_mapping=projection_mapping,
        )
-        for item in items:
+
-            text = item["text"]
+        return self._execute_query(
-            metadata = item["metadata"]
+            query=query,
-            score = item["SimilarityScore"]
+            query_type=query_type,
-            if with_embedding:
+            parameters=parameters,
-                metadata[self._embedding_key] = item[self._embedding_key]
+            with_embedding=False,
-            docs_and_scores.append(
+            projection_mapping=projection_mapping,
-                (Document(page_content=text, metadata=metadata), score)
+        )
    def _hybrid_search_with_score(
        self,
        query_type: CosmosDBQueryType,
        embeddings: List[float],
        search_text: str,
        k: int = 4,
        pre_filter: Optional[PreFilter] = None,
        with_embedding: bool = False,
        offset_limit: Optional[str] = None,
        *,
        projection_mapping: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Build and run a hybrid (full text + vector) query.

        Args:
            query_type: Query type forwarded to the query builder and executor.
            embeddings: Embedding of the query text.
            search_text: Raw query text used for the full text part.
            k: Number of results requested.
            pre_filter: Optional conditions rendered into a WHERE clause.
            with_embedding: Whether to copy the stored embedding into the
                metadata of each returned document.
            offset_limit: Optional raw OFFSET/LIMIT clause.
            projection_mapping: Optional mapping of property name to alias.
            **kwargs: Accepted for signature compatibility; not used here.

        Returns:
            ``(document, score)`` tuples produced by ``_execute_query``.
        """
        query, parameters = self._construct_query(
            k=k,
            query_type=query_type,
            embeddings=embeddings,
            search_text=search_text,
            pre_filter=pre_filter,
            offset_limit=offset_limit,
            projection_mapping=projection_mapping,
        )
        # The original had a second, unreachable ``return docs_and_scores``
        # after this statement that referenced an undefined name; it is gone.
        return self._execute_query(
            query=query,
            query_type=query_type,
            parameters=parameters,
            with_embedding=with_embedding,
            projection_mapping=projection_mapping,
        )
    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        pre_filter: Optional[PreFilter] = None,
        with_embedding: bool = False,
        query_type: CosmosDBQueryType = CosmosDBQueryType.VECTOR,
        offset_limit: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Run a search of the requested type and return documents with scores.

        Args:
            query: Query text. It is embedded for VECTOR and HYBRID queries and
                used as the search text for FULL_TEXT_RANK and HYBRID queries.
            k: Number of results requested.
            pre_filter: Optional conditions rendered into a WHERE clause.
            with_embedding: Whether to copy the stored embedding into the
                metadata of each returned document (VECTOR and HYBRID only).
            query_type: Which kind of query to issue.
            offset_limit: Optional raw OFFSET/LIMIT clause.
            **kwargs: Forwarded to the type-specific search helper.

        Returns:
            ``(document, score)`` tuples; an empty list when ``query_type``
            matches none of the known types.
        """
        # The query is embedded only on the branches that use the vector, so
        # full text searches no longer trigger an embedding call.
        if query_type == CosmosDBQueryType.VECTOR:
            return self._similarity_search_with_score(
                query_type=query_type,
                embeddings=self._embedding.embed_query(query),
                k=k,
                pre_filter=pre_filter,
                with_embedding=with_embedding,
                offset_limit=offset_limit,
                **kwargs,
            )
        if query_type == CosmosDBQueryType.FULL_TEXT_SEARCH:
            # No search text is passed: matching comes from pre_filter only.
            return self._full_text_search(
                k=k,
                query_type=query_type,
                pre_filter=pre_filter,
                offset_limit=offset_limit,
                **kwargs,
            )
        if query_type == CosmosDBQueryType.FULL_TEXT_RANK:
            return self._full_text_search(
                search_text=query,
                k=k,
                query_type=query_type,
                pre_filter=pre_filter,
                offset_limit=offset_limit,
                **kwargs,
            )
        if query_type == CosmosDBQueryType.HYBRID:
            return self._hybrid_search_with_score(
                query_type=query_type,
                embeddings=self._embedding.embed_query(query),
                search_text=query,
                k=k,
                pre_filter=pre_filter,
                with_embedding=with_embedding,
                offset_limit=offset_limit,
                **kwargs,
            )
        return []
@ -331,15 +513,26 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
        self,
        query: str,
        k: int = 4,
-        pre_filter: Optional[Dict] = None,
+        pre_filter: Optional[PreFilter] = None,
        with_embedding: bool = False,
        query_type: CosmosDBQueryType = CosmosDBQueryType.VECTOR,
        offset_limit: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        if query_type not in CosmosDBQueryType.__members__.values():
            raise ValueError(
                f"Invalid query_type: {query_type}. "
                f"Expected one of: {', '.join(t.value for t in CosmosDBQueryType)}."
            )
        else:
            docs_and_scores = self.similarity_search_with_score(
                query,
                k=k,
                pre_filter=pre_filter,
                with_embedding=with_embedding,
                query_type=query_type,
                offset_limit=offset_limit,
                kwargs=kwargs,
            )
        return [doc for doc, _ in docs_and_scores]
@ -350,18 +543,20 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        query_type: CosmosDBQueryType = CosmosDBQueryType.VECTOR,
        pre_filter: Optional[PreFilter] = None,
        with_embedding: bool = False,
        **kwargs: Any,
    ) -> List[Document]:
        # Retrieves the docs with similarity scores
-        pre_filter = {}
+        # if kwargs["pre_filter"]:
-        with_embedding = False
+        #     pre_filter = kwargs["pre_filter"]
-        if kwargs["pre_filter"]:
+        # if kwargs["with_embedding"]:
-            pre_filter = kwargs["pre_filter"]
+        #     with_embedding = kwargs["with_embedding"]
        if kwargs["with_embedding"]:
            with_embedding = kwargs["with_embedding"]
        docs = self._similarity_search_with_score(
            embeddings=embedding,
            k=fetch_k,
            query_type=query_type,
            pre_filter=pre_filter,
            with_embedding=with_embedding,
        )
@ -383,15 +578,16 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        query_type: CosmosDBQueryType = CosmosDBQueryType.VECTOR,
        pre_filter: Optional[PreFilter] = None,
        with_embedding: bool = False,
        **kwargs: Any,
    ) -> List[Document]:
        # compute the embeddings vector from the query string
-        pre_filter = {}
+        # if kwargs["pre_filter"]:
-        with_embedding = False
+        #     pre_filter = kwargs["pre_filter"]
-        if kwargs["pre_filter"]:
+        # if kwargs["with_embedding"]:
-            pre_filter = kwargs["pre_filter"]
+        #     with_embedding = kwargs["with_embedding"]
        if kwargs["with_embedding"]:
            with_embedding = kwargs["with_embedding"]
        embeddings = self._embedding.embed_query(query)
        docs = self.max_marginal_relevance_search_by_vector(
@ -400,6 +596,266 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
            fetch_k=fetch_k,
            lambda_mult=lambda_mult,
            pre_filter=pre_filter,
            query_type=query_type,
            with_embedding=with_embedding,
        )
        return docs
    def _construct_query(
        self,
        k: int,
        query_type: CosmosDBQueryType,
        embeddings: Optional[List[float]] = None,
        search_text: Optional[str] = None,
        pre_filter: Optional[PreFilter] = None,
        offset_limit: Optional[str] = None,
        projection_mapping: Optional[Dict[str, Any]] = None,
    ) -> Tuple[str, List[Dict[str, Any]]]:
        """Build the SQL text and parameter list for a query.

        FULL_TEXT_RANK and HYBRID queries inline every value into the SQL text
        and return an empty parameter list; FULL_TEXT_SEARCH and VECTOR queries
        use named parameters produced by ``_build_parameters``.

        Args:
            k: Number of results requested (rendered as a TOP clause unless
                ``offset_limit`` is given).
            query_type: Which kind of query to build.
            embeddings: Query embedding; required for HYBRID queries.
            search_text: Query text; required for FULL_TEXT_RANK and HYBRID.
            pre_filter: Optional conditions rendered into a WHERE clause.
            offset_limit: Optional raw OFFSET/LIMIT clause appended verbatim.
            projection_mapping: Optional mapping of property name to alias.

        Returns:
            A ``(query, parameters)`` tuple.

        Raises:
            ValueError: If ``search_text`` is missing for FULL_TEXT_RANK or
                HYBRID, or ``embeddings`` is missing for HYBRID.
        """

        def quoted_terms(text: str) -> str:
            # The terms are inlined into the SQL text, so backslashes and
            # single quotes are escaped to keep each term one string literal.
            return ", ".join(
                "'{}'".format(term.replace("\\", "\\\\").replace("'", "\\'"))
                for term in text.split()
            )

        # TODO: Update the code to use parameters once parametrized queries
        #  are allowed for these query functions
        inline_values = query_type in (
            CosmosDBQueryType.FULL_TEXT_RANK,
            CosmosDBQueryType.HYBRID,
        )
        if offset_limit:
            # An explicit OFFSET/LIMIT clause replaces the TOP clause.
            query = "SELECT "
        elif inline_values:
            query = f"SELECT TOP {k} "
        else:
            query = "SELECT TOP @limit "

        query += self._generate_projection_fields(
            projection_mapping, query_type, embeddings
        )
        query += " FROM c "

        # Add where_clause if specified
        if pre_filter:
            query += f"{self._build_where_clause(pre_filter)}"

        if query_type == CosmosDBQueryType.FULL_TEXT_RANK:
            if search_text is None:
                raise ValueError(
                    "search text cannot be None for FULL_TEXT_RANK queries."
                )
            query += (
                f" ORDER BY RANK FullTextScore(c.{self._text_key}, \n"
                f"            [{quoted_terms(search_text)}])"
            )
        elif query_type == CosmosDBQueryType.VECTOR:
            query += " ORDER BY VectorDistance(c[@embeddingKey], @embeddings)"
        elif query_type == CosmosDBQueryType.HYBRID:
            if search_text is None:
                raise ValueError("search text cannot be None for HYBRID queries.")
            if embeddings is None:
                # Without this check the literal text "None" would be inlined
                # into the VectorDistance call.
                raise ValueError("embeddings cannot be None for HYBRID queries.")
            query += (
                f" ORDER BY RANK RRF(FullTextScore(c.{self._text_key}, \n"
                f"            [{quoted_terms(search_text)}]), \n"
                f"            VectorDistance(c.{self._embedding_key}, {embeddings}))"
            )

        # Add limit_offset_clause if specified
        if offset_limit is not None:
            query += f" {offset_limit}"

        # TODO: Remove this if check once parametrized queries
        #  are allowed for these query functions
        parameters: List[Dict[str, Any]] = []
        if query_type in (
            CosmosDBQueryType.FULL_TEXT_SEARCH,
            CosmosDBQueryType.VECTOR,
        ):
            parameters = self._build_parameters(
                k=k,
                query_type=query_type,
                embeddings=embeddings,
                projection_mapping=projection_mapping,
            )
        return query, parameters
    def _generate_projection_fields(
        self,
        projection_mapping: Optional[Dict[str, Any]],
        query_type: CosmosDBQueryType,
        embeddings: Optional[List[float]] = None,
    ) -> str:
        """Build the SELECT projection list for a query.

        Args:
            projection_mapping: Optional mapping of property name to alias.
                When empty or ``None``, id, text and metadata are projected
                under the aliases ``text`` and ``metadata``.
            query_type: Which kind of query the projection is for.
            embeddings: Query embedding, inlined into the text for HYBRID.

        Returns:
            The comma separated projection, without the ``SELECT`` keyword.
        """
        # TODO: Remove this if check once parametrized queries
        #  are allowed for these query functions
        if query_type in (
            CosmosDBQueryType.FULL_TEXT_RANK,
            CosmosDBQueryType.HYBRID,
        ):
            # These query types inline every name and value into the SQL text.
            if projection_mapping:
                projection = ", ".join(
                    f"c.{key} as {alias}" for key, alias in projection_mapping.items()
                )
            else:
                projection = (
                    f"c.id, c.{self._text_key} as text, "
                    f"c.{self._metadata_key} as metadata"
                )
            if query_type == CosmosDBQueryType.HYBRID:
                projection += (
                    f", c.{self._embedding_key} as embedding, "
                    f"VectorDistance(c.{self._embedding_key}, "
                    f"{embeddings}) as SimilarityScore"
                )
            return projection

        # FULL_TEXT_SEARCH and VECTOR reference property names through named
        # parameters.
        if projection_mapping:
            # Bracket access is ``c[@name]``; the previous ``c.[@name]`` form
            # did not match the syntax used by every other projection here.
            projection = ", ".join(
                f"c[@{key}] as {alias}" for key, alias in projection_mapping.items()
            )
        else:
            projection = "c.id, c[@textKey] as text, c[@metadataKey] as metadata"
        # HYBRID never reaches this point, so only VECTOR adds the score.
        if query_type == CosmosDBQueryType.VECTOR:
            projection += (
                ", c[@embeddingKey] as embedding, "
                "VectorDistance(c[@embeddingKey], "
                "@embeddings) as SimilarityScore"
            )
        return projection
    def _build_parameters(
        self,
        k: int,
        query_type: CosmosDBQueryType,
        embeddings: Optional[List[float]],
        search_terms: Optional[List[str]] = None,
        projection_mapping: Optional[Dict[str, Any]] = None,
    ) -> List[Dict[str, Any]]:
        """Build the named parameter list for a parameterized query.

        Args:
            k: Value bound to ``@limit``.
            query_type: Which kind of query the parameters are for.
            embeddings: Value bound to ``@embeddings`` for VECTOR and HYBRID.
            search_terms: Value bound to ``@searchTerms`` for FULL_TEXT_RANK
                and HYBRID.
            projection_mapping: Optional mapping of property name to alias;
                each property name is bound to a parameter named after it.

        Returns:
            A list of ``{"name": ..., "value": ...}`` dictionaries.
        """
        parameters: List[Dict[str, Any]] = [
            {"name": "@limit", "value": k},
            {"name": "@textKey", "value": self._text_key},
        ]

        if projection_mapping:
            parameters.extend(
                {"name": f"@{key}", "value": key} for key in projection_mapping
            )
        else:
            parameters.append({"name": "@metadataKey", "value": self._metadata_key})

        if query_type in (
            CosmosDBQueryType.FULL_TEXT_RANK,
            CosmosDBQueryType.HYBRID,
        ):
            parameters.append({"name": "@searchTerms", "value": search_terms})
        # Independent ``if`` rather than ``elif``: HYBRID needs both the search
        # terms above and the embedding parameters below, and with ``elif`` the
        # HYBRID half of this condition could never be reached.
        if query_type in (
            CosmosDBQueryType.VECTOR,
            CosmosDBQueryType.HYBRID,
        ):
            parameters.append({"name": "@embeddingKey", "value": self._embedding_key})
            parameters.append({"name": "@embeddings", "value": embeddings})

        return parameters
    def _build_where_clause(self, pre_filter: PreFilter) -> str:
        """Build a WHERE clause from the given pre_filter.

        Args:
            pre_filter: Conditions to render and the logical operator used to
                join them.

        Returns:
            The clause starting with ``WHERE``, or an empty string when the
            filter has no conditions.

        Raises:
            ValueError: If the logical operator or a condition operator is not
                in the operator map, or a full text operator is given a
                non-string value.
        """

        def quote(text: str) -> str:
            # Values are inlined into the SQL text, so backslashes and single
            # quotes are escaped to keep the value a single string literal.
            escaped = text.replace("\\", "\\\\").replace("'", "\\'")
            return f"'{escaped}'"

        def literal(raw: Any) -> str:
            if isinstance(raw, str):
                return quote(raw)
            if isinstance(raw, list):
                # e.g., for IN clauses; items are rendered with str() exactly
                # as before, so callers that pre-format items are unaffected.
                return f"({', '.join(map(str, raw))})"
            # bool is checked before the generic fallback because str(True)
            # would give "True", which is not a SQL boolean literal.
            if isinstance(raw, bool):
                return "true" if raw else "false"
            if raw is None:
                return "null"
            return str(raw)

        operator_map = self._where_clause_operator_map()

        if (
            pre_filter.logical_operator
            and pre_filter.logical_operator not in operator_map
        ):
            raise ValueError(
                f"unsupported logical_operator: {pre_filter.logical_operator}"
            )

        sql_logical_operator = operator_map.get(pre_filter.logical_operator or "", "")
        clauses = []

        for condition in pre_filter.conditions:
            if condition.operator not in operator_map:
                raise ValueError(f"Unsupported operator: {condition.operator}")

            sql_operator = operator_map[condition.operator]
            if "full_text" in condition.operator:
                if not isinstance(condition.value, str):
                    raise ValueError(
                        f"Expected a string for {condition.operator}, "
                        f"got {type(condition.value)}"
                    )
                search_terms = ", ".join(
                    quote(term) for term in condition.value.split()
                )
                clauses.append(
                    f"{sql_operator}(c.{condition.property}, {search_terms})"
                )
            else:
                # Previously only str and list values were handled: any other
                # type raised UnboundLocalError on the first condition or
                # silently reused the previous condition's value.
                value = literal(condition.value)
                clauses.append(f"c.{condition.property} {sql_operator} {value}")

        if not clauses:
            # A bare "WHERE" with nothing after it is not a valid clause.
            return ""
        # NOTE(review): with several conditions and no logical_operator the
        # clauses are joined by whitespace only, as before — confirm whether a
        # default such as AND is intended.
        return "WHERE " + f" {sql_logical_operator} ".join(clauses)
    def _execute_query(
        self,
        query: str,
        query_type: CosmosDBQueryType,
        parameters: List[Dict[str, Any]],
        with_embedding: bool,
        projection_mapping: Optional[Dict[str, Any]],
    ) -> List[Tuple[Document, float]]:
        """Run a query and convert the returned items into scored documents.

        Args:
            query: SQL text to execute.
            query_type: Which kind of query is being executed; only VECTOR and
                HYBRID results carry a ``SimilarityScore``.
            parameters: Named parameters for the query.
            with_embedding: Whether to copy the stored embedding into each
                document's metadata (VECTOR and HYBRID only).
            projection_mapping: Optional mapping of property name to alias
                that was used to build the projection.

        Returns:
            ``(document, score)`` tuples; the score is ``0.0`` for query types
            that do not project a similarity score.
        """
        # The projection built by ``_generate_projection_fields`` aliases the
        # text, metadata and embedding columns to "text", "metadata" and
        # "embedding" (or to the caller's alias when a projection mapping is
        # given), so results are read by alias first. The configured key name
        # is kept as a fallback so anything that worked before still works.
        if projection_mapping:
            text_alias = projection_mapping.get(self._text_key, self._text_key)
            metadata_alias = projection_mapping.get(
                self._metadata_key, self._metadata_key
            )
        else:
            text_alias = "text"
            metadata_alias = "metadata"
        embedding_alias = "embedding"

        scores_projected = query_type in (
            CosmosDBQueryType.VECTOR,
            CosmosDBQueryType.HYBRID,
        )

        docs_and_scores = []
        for item in self._container.query_items(
            query=query, parameters=parameters, enable_cross_partition_query=True
        ):
            text = item[text_alias] if text_alias in item else item[self._text_key]

            metadata = item.pop(metadata_alias, None)
            if metadata is None:
                metadata = item.pop(self._metadata_key, None)
            if metadata is None:
                metadata = {}

            score = 0.0

            if projection_mapping:
                for key, alias in projection_mapping.items():
                    # Text becomes the page content, and the metadata column
                    # was already popped above, so neither is copied again.
                    if key == self._text_key or key == self._metadata_key:
                        continue
                    metadata[alias] = item[alias]
            else:
                metadata["id"] = item["id"]

            if scores_projected:
                score = item["SimilarityScore"]
                if with_embedding:
                    metadata[self._embedding_key] = (
                        item[embedding_alias]
                        if embedding_alias in item
                        else item[self._embedding_key]
                    )
            docs_and_scores.append(
                (Document(page_content=text, metadata=metadata), score)
            )
        return docs_and_scores
    def _where_clause_operator_map(self) -> Dict[str, str]:
        """Return the mapping from filter operator keys to their SQL form."""
        comparison = {
            "$eq": "=",
            "$ne": "!=",
            "$lt": "<",
            "$lte": "<=",
            "$gt": ">",
            "$gte": ">=",
        }
        arithmetic = {
            "$add": "+",
            "$sub": "-",
            "$mul": "*",
            "$div": "/",
            "$mod": "%",
        }
        logical = {
            "$or": "OR",
            "$and": "AND",
            "$not": "NOT",
        }
        string_ops = {"$concat": "||"}
        bitwise = {
            "$bit_or": "|",
            "$bit_and": "&",
            "$bit_xor": "^",
            "$bit_lshift": "<<",
            "$bit_rshift": ">>",
            "$bit_zerofill_rshift": ">>>",
        }
        # Full text entries are function names rather than infix operators.
        full_text = {
            "$full_text_contains": "FullTextContains",
            "$full_text_contains_all": "FullTextContainsAll",
            "$full_text_contains_any": "FullTextContainsAny",
        }
        return {
            **comparison,
            **arithmetic,
            **logical,
            **string_ops,
            **bitwise,
            **full_text,
        }
--- a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db_no_sql.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db_no_sql.py
@ -3,7 +3,7 @@
 import logging
 import os
 from time import sleep
-from typing import Any
+from typing import Any, Dict, List, Tuple
 import pytest
 from langchain_core.documents import Document
@ -11,6 +11,9 @@ from langchain_core.documents import Document
 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores.azure_cosmos_db_no_sql import (
    AzureCosmosDBNoSqlVectorSearch,
    Condition,
    CosmosDBQueryType,
    PreFilter,
 )
 logging.basicConfig(level=logging.DEBUG)
@ -60,6 +63,7 @@ def get_vector_indexing_policy(embedding_type: str) -> dict:
        "includedPaths": [{"path": "/*"}],
        "excludedPaths": [{"path": '/"_etag"/?'}],
        "vectorIndexes": [{"path": "/embedding", "type": embedding_type}],
        "fullTextIndexes": [{"path": "/text"}],
    }
@ -78,6 +82,13 @@ def get_vector_embedding_policy(
    }
 def get_full_text_policy() -> dict:
    """Return the full-text policy dict passed to the store in these tests.

    The policy declares ``en-US`` as the default language and registers the
    ``/text`` path, also in ``en-US``, as the only full-text path.
    """
    language = "en-US"
    text_path = {"path": "/text", "language": language}
    return {"defaultLanguage": language, "fullTextPaths": [text_path]}
 class TestAzureCosmosDBNoSqlVectorSearch:
    def test_from_documents_cosine_distance(
        self,
@ -86,12 +97,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
        azure_openai_embeddings: OpenAIEmbeddings,
    ) -> None:
        """Test end to end construction and search."""
-        documents = [
+        documents = self._get_documents()
            Document(page_content="Dogs are tough.", metadata={"a": 1}),
            Document(page_content="Cats have fluff.", metadata={"b": 1}),
            Document(page_content="What is a sandwich?", metadata={"c": 1}),
            Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
        ]
        store = AzureCosmosDBNoSqlVectorSearch.from_documents(
            documents,
@ -105,13 +111,16 @@ class TestAzureCosmosDBNoSqlVectorSearch:
            indexing_policy=get_vector_indexing_policy("flat"),
            cosmos_container_properties={"partition_key": partition_key},
            cosmos_database_properties={},
            full_text_policy=get_full_text_policy(),
            full_text_search_enabled=True,
        )
        sleep(1)  # waits for Cosmos DB to save contents to the collection
-        output = store.similarity_search("Dogs", k=2)
+        output = store.similarity_search("intelligent herders", k=5)
        assert output
-        assert output[0].page_content == "Dogs are tough."
+        assert len(output) == 5
        assert "Border Collies" in output[0].page_content
        safe_delete_database(cosmos_client)
    def test_from_texts_cosine_distance_delete_one(
@ -120,13 +129,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
        partition_key: Any,
        azure_openai_embeddings: OpenAIEmbeddings,
    ) -> None:
-        texts = [
+        texts, metadatas = self._get_texts_and_metadata()
            "Dogs are tough.",
            "Cats have fluff.",
            "What is a sandwich?",
            "That fence is purple.",
        ]
        metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
        store = AzureCosmosDBNoSqlVectorSearch.from_texts(
            texts,
@ -141,20 +144,24 @@ class TestAzureCosmosDBNoSqlVectorSearch:
            indexing_policy=get_vector_indexing_policy("flat"),
            cosmos_container_properties={"partition_key": partition_key},
            cosmos_database_properties={},
            full_text_policy=get_full_text_policy(),
            full_text_search_enabled=True,
        )
        sleep(1)  # waits for Cosmos DB to save contents to the collection
-        output = store.similarity_search("Dogs", k=1)
+        output = store.similarity_search("intelligent herders", k=1)
        assert output
-        assert output[0].page_content == "Dogs are tough."
+        assert len(output) == 1
        assert "Border Collies" in output[0].page_content
        # delete one document
        store.delete_document_by_id(str(output[0].metadata["id"]))
        sleep(2)
-        output2 = store.similarity_search("Dogs", k=1)
+        output2 = store.similarity_search("intelligent herders", k=1)
        assert output2
-        assert output2[0].page_content != "Dogs are tough."
+        assert len(output2) == 1
        assert "Border Collies" not in output2[0].page_content
        safe_delete_database(cosmos_client)
    def test_from_documents_cosine_distance_with_filtering(
@ -164,12 +171,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
        azure_openai_embeddings: OpenAIEmbeddings,
    ) -> None:
        """Test end to end construction and search."""
-        documents = [
+        documents = self._get_documents()
            Document(page_content="Dogs are tough.", metadata={"a": 1}),
            Document(page_content="Cats have fluff.", metadata={"a": 1}),
            Document(page_content="What is a sandwich?", metadata={"c": 1}),
            Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
        ]
        store = AzureCosmosDBNoSqlVectorSearch.from_documents(
            documents,
@ -183,33 +185,321 @@ class TestAzureCosmosDBNoSqlVectorSearch:
            indexing_policy=get_vector_indexing_policy("flat"),
            cosmos_container_properties={"partition_key": partition_key},
            cosmos_database_properties={},
            full_text_policy=get_full_text_policy(),
            full_text_search_enabled=True,
        )
        sleep(1)  # waits for Cosmos DB to save contents to the collection
-        output = store.similarity_search("Dogs", k=4)
+        output = store.similarity_search("intelligent herders", k=4)
        assert len(output) == 4
-        assert output[0].page_content == "Dogs are tough."
+        assert "Border Collies" in output[0].page_content
        assert output[0].metadata["a"] == 1
-        pre_filter = {
+        # pre_filter = {
-            "where_clause": "WHERE c.metadata.a=1",
+        #     "conditions": [
-        }
+        #         {"property": "metadata.a", "operator": "$eq", "value": 1},
        #     ],
        # }
        pre_filter = PreFilter(
            conditions=[
                Condition(property="metadata.a", operator="$eq", value=1),
            ],
        )
        output = store.similarity_search(
-            "Dogs", k=4, pre_filter=pre_filter, with_embedding=True
+            "intelligent herders", k=4, pre_filter=pre_filter, with_embedding=True
        )
-        assert len(output) == 2
+        assert len(output) == 3
-        assert output[0].page_content == "Dogs are tough."
+        assert "Border Collies" in output[0].page_content
        assert output[0].metadata["a"] == 1
-        pre_filter = {
+        # pre_filter = {
-            "where_clause": "WHERE c.metadata.a=1",
+        #     "conditions": [
-            "limit_offset_clause": "OFFSET 0 LIMIT 1",
+        #         {"property": "metadata.a", "operator": "$eq", "value": 1},
-        }
+        #     ],
        # }
        pre_filter = PreFilter(
            conditions=[
                Condition(property="metadata.a", operator="$eq", value=1),
            ],
        )
        offset_limit = "OFFSET 0 LIMIT 1"
-        output = store.similarity_search("Dogs", k=4, pre_filter=pre_filter)
+        output = store.similarity_search(
            "intelligent herders", k=4, pre_filter=pre_filter, offset_limit=offset_limit
        )
        assert len(output) == 1
-        assert output[0].page_content == "Dogs are tough."
+        assert "Border Collies" in output[0].page_content
        assert output[0].metadata["a"] == 1
        safe_delete_database(cosmos_client)
    def test_from_documents_full_text_and_hybrid(
        self,
        cosmos_client: Any,
        partition_key: Any,
        azure_openai_embeddings: OpenAIEmbeddings,
    ) -> None:
        """Test full-text search, BM25 ranking and hybrid search end to end.

        The store is built from ``self._get_documents()`` with a diskANN
        vector index and full-text search enabled. In that corpus
        "intelligent" appears in the Border Collies and Standard Poodles
        documents, "herders" in the Border Collies and Australian Shepherds
        documents, and "energetic" in all three of those.

        Args:
            cosmos_client: Cosmos DB client used to create and drop the database.
            partition_key: Partition key forwarded as a container property.
            azure_openai_embeddings: Embedding model used to index and query.
        """
        query = "intelligent herders"
        documents = self._get_documents()

        store = AzureCosmosDBNoSqlVectorSearch.from_documents(
            documents,
            embedding=azure_openai_embeddings,
            cosmos_client=cosmos_client,
            database_name=database_name,
            container_name=container_name,
            vector_embedding_policy=get_vector_embedding_policy(
                "cosine", "float32", 1536
            ),
            full_text_policy=get_full_text_policy(),
            indexing_policy=get_vector_indexing_policy("diskANN"),
            cosmos_container_properties={"partition_key": partition_key},
            cosmos_database_properties={},
            full_text_search_enabled=True,
        )
        # NOTE(review): far longer than the 1s used by the "flat" index tests;
        # presumably needed for the diskANN / full-text indexes -- confirm.
        sleep(480)  # waits for Cosmos DB to save contents to the collection

        # Full text search, contains ANY of the words. This must use
        # "$full_text_contains_any": with "$full_text_contains_all" the query
        # is identical to the next scenario, which expects a single hit.
        pre_filter = PreFilter(
            conditions=[
                Condition(
                    property="text",
                    operator="$full_text_contains_any",
                    value=query,
                ),
            ],
        )
        output = store.similarity_search(
            query,
            k=5,
            pre_filter=pre_filter,
            query_type=CosmosDBQueryType.FULL_TEXT_SEARCH,
        )

        assert output
        assert len(output) == 3
        assert "Border Collies" in output[0].page_content

        # Full text search, contains ALL of the words: only the Border Collies
        # document mentions both "intelligent" and "herders".
        pre_filter = PreFilter(
            conditions=[
                Condition(
                    property="text",
                    operator="$full_text_contains_all",
                    value=query,
                ),
            ],
        )
        output = store.similarity_search(
            query,
            k=5,
            pre_filter=pre_filter,
            query_type=CosmosDBQueryType.FULL_TEXT_SEARCH,
        )

        assert output
        assert len(output) == 1
        assert "Border Collies" in output[0].page_content

        # Full text search BM25 ranking
        output = store.similarity_search(
            query, k=5, query_type=CosmosDBQueryType.FULL_TEXT_RANK
        )

        assert output
        assert len(output) == 5
        assert "Standard Poodles" in output[0].page_content

        # Full text search BM25 ranking with metadata filtering (a == 1)
        pre_filter = PreFilter(
            conditions=[
                Condition(property="metadata.a", operator="$eq", value=1),
            ],
        )
        output = store.similarity_search(
            query,
            k=5,
            pre_filter=pre_filter,
            query_type=CosmosDBQueryType.FULL_TEXT_RANK,
        )

        assert output
        assert len(output) == 3
        assert "Border Collies" in output[0].page_content

        # Hybrid search RRF ranking combination of full text search and vector search
        output = store.similarity_search(
            query, k=5, query_type=CosmosDBQueryType.HYBRID
        )

        assert output
        assert len(output) == 5
        assert "Border Collies" in output[0].page_content

        # Hybrid search RRF ranking with metadata filtering (a == 1)
        pre_filter = PreFilter(
            conditions=[
                Condition(property="metadata.a", operator="$eq", value=1),
            ],
        )
        output = store.similarity_search(
            query,
            k=5,
            pre_filter=pre_filter,
            query_type=CosmosDBQueryType.HYBRID,
        )

        assert output
        assert len(output) == 3
        assert "Border Collies" in output[0].page_content

        # Full text search BM25 ranking with full text filtering
        pre_filter = PreFilter(
            conditions=[
                Condition(
                    property="text", operator="$full_text_contains", value="energetic"
                ),
            ],
        )
        output = store.similarity_search(
            query,
            k=5,
            pre_filter=pre_filter,
            query_type=CosmosDBQueryType.FULL_TEXT_RANK,
        )

        assert output
        assert len(output) == 3
        assert "Border Collies" in output[0].page_content

        # Full text search BM25 ranking with full text AND metadata filtering:
        # of the three "energetic" documents only two have a == 2.
        pre_filter = PreFilter(
            conditions=[
                Condition(
                    property="text", operator="$full_text_contains", value="energetic"
                ),
                Condition(property="metadata.a", operator="$eq", value=2),
            ],
            logical_operator="$and",
        )
        output = store.similarity_search(
            query,
            k=5,
            pre_filter=pre_filter,
            query_type=CosmosDBQueryType.FULL_TEXT_RANK,
        )

        assert output
        assert len(output) == 2
        assert "Standard Poodles" in output[0].page_content

        # Drop the database like the other tests in this class do, so the
        # container does not leak into later runs.
        safe_delete_database(cosmos_client)
    def _get_documents(self) -> List[Document]:
        """Build the six dog-breed documents shared by the document-based tests.

        Returns:
            One ``Document`` per breed; each carries an ``a`` metadata value of
            1 or 2, and the last three also carry a ``b`` value.
        """
        corpus = (
            (
                "Border Collies are intelligent, energetic "
                "herders skilled in outdoor activities.",
                {"a": 1},
            ),
            (
                "Golden Retrievers are friendly, loyal companions "
                "with excellent retrieving skills.",
                {"a": 2},
            ),
            (
                "Labrador Retrievers are playful, eager "
                "learners and skilled retrievers.",
                {"a": 1},
            ),
            (
                "Australian Shepherds are agile, energetic "
                "herders excelling in outdoor tasks.",
                {"a": 2, "b": 1},
            ),
            (
                "German Shepherds are brave, loyal protectors "
                "excelling in versatile tasks.",
                {"a": 1, "b": 2},
            ),
            (
                "Standard Poodles are intelligent, energetic "
                "learners excelling in agility.",
                {"a": 2, "b": 3},
            ),
        )
        return [
            Document(page_content=content, metadata=meta)
            for content, meta in corpus
        ]
    def _get_texts_and_metadata(self) -> Tuple[List[str], List[Dict[str, Any]]]:
        texts = [
            "Border Collies are intelligent, "
            "energetic herders skilled in outdoor activities.",
            "Golden Retrievers are friendly, "
            "loyal companions with excellent retrieving skills.",
            "Labrador Retrievers are playful, "
            "eager learners and skilled retrievers.",
            "Australian Shepherds are agile, "
            "energetic herders excelling in outdoor tasks.",
            "German Shepherds are brave, "
            "loyal protectors excelling in versatile tasks.",
            "Standard Poodles are intelligent, "
            "energetic learners excelling in agility.",
        ]
        metadatas = [
            {"a": 1},
            {"a": 2},
            {"a": 1},
            {"a": 2, "b": 1},
            {"a": 1, "b": 2},
            {"a": 2, "b": 1},
        ]
        return texts, metadatas