mongodb[minor]: MongoDB Partner Package -- Porting MongoDBAtlasVectorSearch (#17652)

This PR migrates the existing MongoDBAtlasVectorSearch abstraction from the `langchain_community` section to the partners package section of the codebase. - [x] Run the partner package script as advised in the partner-packages documentation. - [x] Add Unit Tests - [x] Migrate Integration Tests - [x] Refactor `MongoDBAtlasVectorStore` (autogenerated) to `MongoDBAtlasVectorSearch` - [x] ~Remove~ deprecate the old `langchain_community` VectorStore references. ## Additional Callouts - Implemented the `delete` method - Included any missing async function implementations - `amax_marginal_relevance_search_by_vector` - `adelete` - Added new Unit Tests that test the functionality of `MongoDBAtlasVectorSearch` methods - Removed [`del res[self._embedding_key]`](e0c81e1cb0/libs/community/langchain_community/vectorstores/mongodb_atlas.py (L218)) in `_similarity_search_with_score` function as it would make the `maximal_marginal_relevance` function fail otherwise. The `Document` needs to keep the embedding (under the embedding key) in its metadata for MMR to work. Checklist: - [x] PR title: Please title your PR "package: description", where "package" is whichever of langchain, community, core, experimental, etc. is being modified. Use "docs: ..." for purely docs changes, "templates: ..." for template changes, "infra: ..." for CI changes. - Example: "community: add foobar LLM" - [x] PR message - [x] Pass lint and test: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified to check that you're passing lint and testing. See contribution guidelines for more information on how to write/run tests, lint, etc: https://python.langchain.com/docs/contributing/ - [x] Add tests and docs: If you're adding a new integration, please include 1. Existing tests supplied in docs/docs do not change. Updated docstrings for new functions like `delete` 2. an example notebook showing its use. It lives in the `docs/docs/integrations` directory. 
(This already exists) If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, hwchase17. --------- Co-authored-by: Steven Silvester <steven.silvester@ieee.org> Co-authored-by: Erick Friis <erick@langchain.dev>
2025-09-13 05:25:07 +00:00 · 2024-02-29 18:09:48 -05:00
parent 412148773c
commit 72bfc1d3db
23 changed files with 2321 additions and 3 deletions
--- a/libs/partners/mongodb/langchain_mongodb/init.py
+++ b/libs/partners/mongodb/langchain_mongodb/init.py
@@ -0,0 +1,7 @@
+from langchain_mongodb.vectorstores import (
+    MongoDBAtlasVectorSearch,
+)
+
+# Public API of the package: only the vector store class is exported.
+__all__ = [
+    "MongoDBAtlasVectorSearch",
+]
--- a/libs/partners/mongodb/langchain_mongodb/py.typed
+++ b/libs/partners/mongodb/langchain_mongodb/py.typed
--- a/libs/partners/mongodb/langchain_mongodb/utils.py
+++ b/libs/partners/mongodb/langchain_mongodb/utils.py
@@ -0,0 +1,87 @@
+"""
+Tools for the Maximal Marginal Relevance (MMR) reranking.
+Duplicated from langchain_community to avoid cross-dependencies.
+
+Functions "maximal_marginal_relevance" and "cosine_similarity"
+are duplicated in this utility respectively from modules:
+    - "libs/community/langchain_community/vectorstores/utils.py"
+    - "libs/community/langchain_community/utils/math.py"
+"""
+
+import logging
+from typing import List, Union
+
+import numpy as np
+
+# Module-level logger; used to report the NumPy fallback when simsimd is missing.
+logger = logging.getLogger(__name__)
+
+# Inputs accepted by cosine_similarity: nested lists of floats, a list of NumPy
+# arrays, or a single NumPy array (treated there as 2-D, one vector per row).
+Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
+
+
+def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
+    """Compute the cosine similarity of every row of X against every row of Y.
+
+    Args:
+        X: Matrix whose rows are the first set of vectors.
+        Y: Matrix whose rows are the second set of vectors; it must have the
+            same number of columns as X.
+
+    Returns:
+        An empty array when either input is empty. Otherwise the similarities,
+        computed with ``simsimd`` when it can be imported and with NumPy when
+        it cannot; in the NumPy path the result has shape ``(len(X), len(Y))``
+        and non-finite entries (e.g. from zero-norm rows) are replaced by 0.0.
+
+    Raises:
+        ValueError: If X and Y have a different number of columns.
+    """
+    if not (len(X) and len(Y)):
+        return np.array([])
+
+    X, Y = np.array(X), np.array(Y)
+    if X.shape[1] != Y.shape[1]:
+        raise ValueError(
+            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
+            f"and Y has shape {Y.shape}."
+        )
+
+    try:
+        import simsimd as simd  # type: ignore
+
+        # simsimd path works on float32 copies of the inputs.
+        X = np.array(X, dtype=np.float32)
+        Y = np.array(Y, dtype=np.float32)
+        # cdist yields cosine *distances*; flip them into similarities.
+        Z = 1 - simd.cdist(X, Y, metric="cosine")
+        return np.array([Z]) if isinstance(Z, float) else Z
+    except ImportError:
+        logger.info(
+            "Unable to import simsimd, defaulting to NumPy implementation. If you want "
+            "to use simsimd please install with `pip install simsimd`."
+        )
+        norm_products = np.outer(
+            np.linalg.norm(X, axis=1), np.linalg.norm(Y, axis=1)
+        )
+        # Division by a zero norm is expected here; the resulting NaN/inf
+        # entries are zeroed out right after, so silence the warnings.
+        with np.errstate(divide="ignore", invalid="ignore"):
+            similarity = np.dot(X, Y.T) / norm_products
+        similarity[~np.isfinite(similarity)] = 0.0
+        return similarity
+
+
+def maximal_marginal_relevance(
+    query_embedding: np.ndarray,
+    embedding_list: list,
+    lambda_mult: float = 0.5,
+    k: int = 4,
+) -> List[int]:
+    """Greedily pick up to ``k`` indexes by maximal marginal relevance.
+
+    The first pick is the embedding most similar to the query. Each later pick
+    maximizes ``lambda_mult * sim(query, candidate)
+    - (1 - lambda_mult) * max(sim(candidate, already picked))``.
+
+    Args:
+        query_embedding: Query vector (1-D) or a matrix whose first row is used.
+        embedding_list: Candidate embeddings, one per entry.
+        lambda_mult: Weight of query relevance versus redundancy.
+        k: Maximum number of indexes to return.
+
+    Returns:
+        Indexes into ``embedding_list`` in selection order; empty when ``k`` or
+        the number of candidates is not positive.
+    """
+    target = min(k, len(embedding_list))
+    if target <= 0:
+        return []
+
+    if query_embedding.ndim == 1:
+        query_embedding = np.expand_dims(query_embedding, axis=0)
+
+    relevance = cosine_similarity(query_embedding, embedding_list)[0]
+    first_pick = int(np.argmax(relevance))
+    picked = [first_pick]
+    picked_vectors = np.array([embedding_list[first_pick]])
+
+    while len(picked) < target:
+        top_score, top_index = -np.inf, -1
+        pairwise = cosine_similarity(embedding_list, picked_vectors)
+        for candidate, candidate_relevance in enumerate(relevance):
+            if candidate not in picked:
+                # Redundancy is the closest match among what is already chosen.
+                redundancy = max(pairwise[candidate])
+                mmr_score = (
+                    lambda_mult * candidate_relevance
+                    - (1 - lambda_mult) * redundancy
+                )
+                # Strict comparison: on ties the earliest candidate wins.
+                if mmr_score > top_score:
+                    top_score, top_index = mmr_score, candidate
+        picked.append(top_index)
+        picked_vectors = np.append(
+            picked_vectors, [embedding_list[top_index]], axis=0
+        )
+    return picked
--- a/libs/partners/mongodb/langchain_mongodb/vectorstores.py
+++ b/libs/partners/mongodb/langchain_mongodb/vectorstores.py
@@ -0,0 +1,463 @@
+from __future__ import annotations
+
+import logging
+from importlib.metadata import version
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+)
+
+import numpy as np
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from langchain_core.runnables.config import run_in_executor
+from langchain_core.vectorstores import VectorStore
+from pymongo import MongoClient
+from pymongo.collection import Collection
+from pymongo.driver_info import DriverInfo
+
+from langchain_mongodb.utils import maximal_marginal_relevance
+
+# Type of the documents held by the pymongo Collection (a str-keyed dict).
+MongoDBDocumentType = TypeVar("MongoDBDocumentType", bound=Dict[str, Any])
+# Generic VectorStore subtype. NOTE(review): appears unused in this module.
+VST = TypeVar("VST", bound=VectorStore)
+
+# Module-level logger. NOTE(review): appears unused in this module.
+logger = logging.getLogger(__name__)
+
+# Number of texts embedded and inserted per batch by add_texts unless the
+# caller passes a `batch_size` keyword argument.
+DEFAULT_INSERT_BATCH_SIZE = 100
+
+
+class MongoDBAtlasVectorSearch(VectorStore):
+    """`MongoDB Atlas Vector Search` vector store.
+
+    To use, you should have both:
+    - the ``pymongo`` python package installed
+    - a connection string associated with a MongoDB Atlas Cluster having deployed an
+        Atlas Search index
+
+    Example:
+        .. code-block:: python
+
+            from langchain_mongodb import MongoDBAtlasVectorSearch
+            from langchain_community.embeddings.openai import OpenAIEmbeddings
+            from pymongo import MongoClient
+
+            mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
+            collection = mongo_client["<db_name>"]["<collection_name>"]
+            embeddings = OpenAIEmbeddings()
+            vectorstore = MongoDBAtlasVectorSearch(collection, embeddings)
+    """
+
+    def __init__(
+        self,
+        collection: Collection[MongoDBDocumentType],
+        embedding: Embeddings,
+        *,
+        index_name: str = "default",
+        text_key: str = "text",
+        embedding_key: str = "embedding",
+        relevance_score_fn: str = "cosine",
+    ):
+        """
+        Args:
+            collection: MongoDB collection to add the texts to.
+            embedding: Text embedding model to use.
+            text_key: MongoDB field that will contain the text for each
+                document.
+                defaults to 'text'
+            embedding_key: MongoDB field that will contain the embedding for
+                each document.
+                defaults to 'embedding'
+            index_name: Name of the Atlas Search index.
+                defaults to 'default'
+            relevance_score_fn: The similarity score used for the index.
+                defaults to 'cosine'
+                Currently supported: 'euclidean', 'cosine', and 'dotProduct'.
+        """
+        self._collection = collection
+        self._embedding = embedding
+        self._index_name = index_name
+        self._text_key = text_key
+        self._embedding_key = embedding_key
+        self._relevance_score_fn = relevance_score_fn
+
+    @property
+    def embeddings(self) -> Embeddings:
+        """Embedding model this store was constructed with."""
+        return self._embedding
+
+    def _select_relevance_score_fn(self) -> Callable[[float], float]:
+        """Return the scoring callable matching the configured score name.
+
+        Raises:
+            NotImplementedError: If ``relevance_score_fn`` is not one of
+                'euclidean', 'dotProduct' or 'cosine'.
+        """
+        scoring: dict[str, Callable] = {
+            "euclidean": self._euclidean_relevance_score_fn,
+            "dotProduct": self._max_inner_product_relevance_score_fn,
+            "cosine": self._cosine_relevance_score_fn,
+        }
+        if self._relevance_score_fn in scoring:
+            return scoring[self._relevance_score_fn]
+        raise NotImplementedError(
+            f"No relevance score function for {self._relevance_score_fn}"
+        )
+
+    @classmethod
+    def from_connection_string(
+        cls,
+        connection_string: str,
+        namespace: str,
+        embedding: Embeddings,
+        **kwargs: Any,
+    ) -> MongoDBAtlasVectorSearch:
+        """Construct a `MongoDB Atlas Vector Search` vector store
+        from a MongoDB connection URI.
+
+        Args:
+            connection_string: A valid MongoDB connection URI.
+            namespace: A valid MongoDB namespace in the form
+                ``<database>.<collection>``. Only the first dot separates the
+                two parts, so the collection name may itself contain dots.
+            embedding: The text embedding model to use for the vector store.
+            **kwargs: Forwarded to the class constructor.
+
+        Returns:
+            A new MongoDBAtlasVectorSearch instance.
+
+        Raises:
+            ValueError: If ``namespace`` contains no ``.`` separator.
+        """
+        # Split on the first dot only: everything after it is the collection.
+        db_name, separator, collection_name = namespace.partition(".")
+        if not separator:
+            raise ValueError(
+                "namespace must have the form '<database>.<collection>', "
+                f"got {namespace!r}"
+            )
+        client: MongoClient = MongoClient(
+            connection_string,
+            driver=DriverInfo(name="Langchain", version=version("langchain")),
+        )
+        collection = client[db_name][collection_name]
+        return cls(collection, embedding, **kwargs)
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[Dict[str, Any]]] = None,
+        **kwargs: Any,
+    ) -> List:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Texts are embedded and inserted in batches; pass ``batch_size`` as a
+        keyword argument to override ``DEFAULT_INSERT_BATCH_SIZE``.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore. One-shot
+                iterators (e.g. generators) are supported.
+            metadatas: Optional list of metadatas associated with the texts.
+                When omitted or empty, every text gets an empty metadata dict.
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+        """
+        batch_size = kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE)
+        # Pair each text with its metadata while iterating `texts` exactly once;
+        # deriving the default metadata from a second pass over `texts` would
+        # consume (and silently drop) items when `texts` is a one-shot iterator.
+        pairs: Iterable[Tuple[str, Dict[str, Any]]]
+        if metadatas:
+            pairs = zip(texts, metadatas)
+        else:
+            pairs = ((text, {}) for text in texts)
+
+        texts_batch: List[str] = []
+        metadatas_batch: List[Dict[str, Any]] = []
+        result_ids: List = []
+        for i, (text, metadata) in enumerate(pairs):
+            texts_batch.append(text)
+            metadatas_batch.append(metadata)
+            if (i + 1) % batch_size == 0:
+                result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
+                texts_batch = []
+                metadatas_batch = []
+        # Flush the final, partially filled batch.
+        if texts_batch:
+            result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
+        return result_ids
+
+    def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
+        """Embed one batch of texts and insert it into the collection.
+
+        Returns:
+            The ``inserted_ids`` reported by ``insert_many``; an empty list
+            when ``texts`` is empty.
+        """
+        if not texts:
+            return []
+        # Embed and create the documents
+        embeddings = self._embedding.embed_documents(texts)
+        # Metadata is spread last, so a metadata key equal to the text or
+        # embedding key overrides the generated value.
+        to_insert = [
+            {self._text_key: t, self._embedding_key: embedding, **m}
+            for t, m, embedding in zip(texts, metadatas, embeddings)
+        ]
+        # insert the documents in MongoDB Atlas
+        insert_result = self._collection.insert_many(to_insert)  # type: ignore
+        return insert_result.inserted_ids
+
+    def _similarity_search_with_score(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        pre_filter: Optional[Dict] = None,
+        post_filter_pipeline: Optional[List[Dict]] = None,
+    ) -> List[Tuple[Document, float]]:
+        """Run a ``$vectorSearch`` aggregation for an already embedded query.
+
+        Args:
+            embedding: Query vector.
+            k: Number of results requested (``limit``); ``numCandidates`` is
+                set to ``k * 10``.
+            pre_filter: Optional ``filter`` passed to the ``$vectorSearch`` stage.
+            post_filter_pipeline: Optional aggregation stages appended after
+                the search and score stages.
+
+        Returns:
+            ``(Document, score)`` pairs. The text field becomes
+            ``page_content``; every other returned field stays in ``metadata``.
+            The stored embedding is deliberately kept there because the MMR
+            methods read it from ``metadata[embedding_key]``.
+        """
+        params: Dict[str, Any] = {
+            "queryVector": embedding,
+            "path": self._embedding_key,
+            "numCandidates": k * 10,
+            "limit": k,
+            "index": self._index_name,
+        }
+        if pre_filter:
+            params["filter"] = pre_filter
+        query = {"$vectorSearch": params}
+
+        pipeline = [
+            query,
+            {"$set": {"score": {"$meta": "vectorSearchScore"}}},
+        ]
+        if post_filter_pipeline is not None:
+            pipeline.extend(post_filter_pipeline)
+        cursor = self._collection.aggregate(pipeline)  # type: ignore[arg-type]
+        docs = []
+        for res in cursor:
+            text = res.pop(self._text_key)
+            score = res.pop("score")
+            docs.append((Document(page_content=text, metadata=res), score))
+        return docs
+
+    def similarity_search_with_score(
+        self,
+        query: str,
+        k: int = 4,
+        pre_filter: Optional[Dict] = None,
+        post_filter_pipeline: Optional[List[Dict]] = None,
+    ) -> List[Tuple[Document, float]]:
+        """Return MongoDB documents most similar to the given query and their scores.
+
+        Uses the vectorSearch operator available in MongoDB Atlas Search.
+        For more: https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/
+
+        Args:
+            query: Text to look up documents similar to.
+            k: (Optional) number of documents to return. Defaults to 4.
+            pre_filter: (Optional) dictionary of argument(s) to prefilter document
+                fields on.
+            post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
+                following the vectorSearch stage.
+
+        Returns:
+            List of documents most similar to the query and their scores.
+        """
+        embedding = self._embedding.embed_query(query)
+        return self._similarity_search_with_score(
+            embedding,
+            k=k,
+            pre_filter=pre_filter,
+            post_filter_pipeline=post_filter_pipeline,
+        )
+
+    def similarity_search(
+        self,
+        query: str,
+        k: int = 4,
+        pre_filter: Optional[Dict] = None,
+        post_filter_pipeline: Optional[List[Dict]] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return MongoDB documents most similar to the given query.
+
+        Uses the vectorSearch operator available in MongoDB Atlas Search.
+        For more: https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/
+
+        Args:
+            query: Text to look up documents similar to.
+            k: (Optional) number of documents to return. Defaults to 4.
+            pre_filter: (Optional) dictionary of argument(s) to prefilter document
+                fields on.
+            post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
+                following the vectorSearch stage.
+            **kwargs: ``additional`` may be a container holding
+                ``"similarity_score"``; in that case each document's score is
+                stored in its metadata under ``"score"``.
+
+        Returns:
+            List of documents most similar to the query.
+        """
+        additional = kwargs.get("additional")
+        docs_and_scores = self.similarity_search_with_score(
+            query,
+            k=k,
+            pre_filter=pre_filter,
+            post_filter_pipeline=post_filter_pipeline,
+        )
+
+        if additional and "similarity_score" in additional:
+            for doc, score in docs_and_scores:
+                doc.metadata["score"] = score
+        return [doc for doc, _ in docs_and_scores]
+
+    def max_marginal_relevance_search(
+        self,
+        query: str,
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        pre_filter: Optional[Dict] = None,
+        post_filter_pipeline: Optional[List[Dict]] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return documents selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: (Optional) number of documents to return. Defaults to 4.
+            fetch_k: (Optional) number of documents to fetch before passing to MMR
+                algorithm. Defaults to 20.
+            lambda_mult: Number between 0 and 1 that determines the degree
+                        of diversity among the results with 0 corresponding
+                        to maximum diversity and 1 to minimum diversity.
+                        Defaults to 0.5.
+            pre_filter: (Optional) dictionary of argument(s) to prefilter on document
+                fields.
+            post_filter_pipeline: (Optional) pipeline of MongoDB aggregation stages
+                following the vectorSearch stage.
+        Returns:
+            List of documents selected by maximal marginal relevance.
+        """
+        # Embed once, then share the fetch + rerank logic with the by-vector
+        # variant so the two code paths cannot drift apart.
+        query_embedding = self._embedding.embed_query(query)
+        return self.max_marginal_relevance_search_by_vector(
+            query_embedding,
+            k=k,
+            fetch_k=fetch_k,
+            lambda_mult=lambda_mult,
+            pre_filter=pre_filter,
+            post_filter_pipeline=post_filter_pipeline,
+        )
+
+    @classmethod
+    def from_texts(
+        cls,
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[Dict]] = None,
+        collection: Optional[Collection[MongoDBDocumentType]] = None,
+        **kwargs: Any,
+    ) -> MongoDBAtlasVectorSearch:
+        """Construct a `MongoDB Atlas Vector Search` vector store from raw documents.
+
+        This is a user-friendly interface that:
+            1. Embeds documents.
+            2. Adds the documents to a provided MongoDB Atlas Vector Search index
+                (Lucene)
+
+        This is intended to be a quick way to get started.
+
+        Raises:
+            ValueError: If ``collection`` is not provided.
+
+        Example:
+            .. code-block:: python
+
+                from pymongo import MongoClient
+
+                from langchain_mongodb import MongoDBAtlasVectorSearch
+                from langchain_community.embeddings import OpenAIEmbeddings
+
+                mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
+                collection = mongo_client["<db_name>"]["<collection_name>"]
+                embeddings = OpenAIEmbeddings()
+                vectorstore = MongoDBAtlasVectorSearch.from_texts(
+                    texts,
+                    embeddings,
+                    metadatas=metadatas,
+                    collection=collection
+                )
+        """
+        if collection is None:
+            raise ValueError("Must provide 'collection' named parameter.")
+        vectorstore = cls(collection, embedding, **kwargs)
+        vectorstore.add_texts(texts, metadatas=metadatas)
+        return vectorstore
+
+    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
+        """Delete documents by ``_id`` and/or other filter criteria.
+
+        Args:
+            ids: ``_id`` values of the documents to delete, compared as given
+                (the ids returned by ``add_texts`` can be passed back
+                unchanged). ``None`` adds no id restriction; an empty list
+                matches no document.
+            **kwargs: Extra top-level entries merged into the ``delete_many``
+                filter; an ``_id`` entry here overrides ``ids``.
+
+        Returns:
+            The ``acknowledged`` flag of the ``delete_many`` result.
+
+        Warning:
+            With ``ids=None`` and no extra criteria the filter is empty, which
+            matches every document in the collection.
+        """
+        search_params: dict[str, Any] = {}
+        # `is not None` (not truthiness): an empty id list must delete nothing
+        # rather than fall through to an unrestricted filter.
+        if ids is not None:
+            search_params["_id"] = {"$in": ids}
+
+        return self._collection.delete_many({**search_params, **kwargs}).acknowledged
+
+    async def adelete(
+        self, ids: Optional[List[str]] = None, **kwargs: Any
+    ) -> Optional[bool]:
+        """Asynchronous variant of ``delete``, run via ``run_in_executor``.
+
+        Args:
+            ids: List of ids to delete.
+            **kwargs: Extra filter criteria forwarded to ``delete``.
+
+        Returns:
+            Whatever ``delete`` returns for the same arguments.
+        """
+        return await run_in_executor(None, self.delete, ids=ids, **kwargs)
+
+    def max_marginal_relevance_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        pre_filter: Optional[Dict] = None,
+        post_filter_pipeline: Optional[List[Dict]] = None,
+        **kwargs: Any,
+    ) -> List[Document]:  # type: ignore
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+            lambda_mult: Number between 0 and 1 that determines the degree
+                        of diversity among the results with 0 corresponding
+                        to maximum diversity and 1 to minimum diversity.
+                        Defaults to 0.5.
+            pre_filter: (Optional) dictionary of argument(s) to prefilter on document
+                fields.
+            post_filter_pipeline: (Optional) pipeline of MongoDB aggregation stages
+                following the vectorSearch stage.
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        docs = self._similarity_search_with_score(
+            embedding,
+            k=fetch_k,
+            pre_filter=pre_filter,
+            post_filter_pipeline=post_filter_pipeline,
+        )
+        # MMR reranks on the stored vectors, read back from each document's
+        # metadata under the embedding key.
+        mmr_doc_indexes = maximal_marginal_relevance(
+            np.array(embedding),
+            [doc.metadata[self._embedding_key] for doc, _ in docs],
+            k=k,
+            lambda_mult=lambda_mult,
+        )
+        return [docs[i][0] for i in mmr_doc_indexes]
+
+    async def amax_marginal_relevance_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Asynchronous variant of ``max_marginal_relevance_search_by_vector``,
+        run via ``run_in_executor``.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+            lambda_mult: Relevance/diversity trade-off, see the sync method.
+            **kwargs: Forwarded to the sync method (e.g. ``pre_filter``,
+                ``post_filter_pipeline``).
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        return await run_in_executor(
+            None,
+            self.max_marginal_relevance_search_by_vector,
+            embedding,
+            k=k,
+            fetch_k=fetch_k,
+            lambda_mult=lambda_mult,
+            **kwargs,
+        )