mongodb: Add Hybrid and Full-Text Search Retrievers, release 0.2.0 (#25057)

## Description

This pull request extends the existing vector search strategies of MongoDBAtlasVectorSearch to include Hybrid (Reciprocal Rank Fusion) and Full-Text search via new Retrievers.

There is a small breaking change in the form of the `prefilter` kwarg to search. Because of this, and because we have added a great deal of features since 0.1.0, including programmatic index creation/deletion, we plan to bump the version to 0.2.0.

### Checklist

* Unit tests have been extended
* Formatting has been applied
* One mypy error remains, which will either go away in CI or be simplified.

---------

Signed-off-by: Casey Clements <casey.clements@mongodb.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
2025-09-11 16:01:33 +00:00 · 2024-08-07 16:10:29 -04:00
parent f337408b0f
commit 6e9a8b188f
22 changed files with 1749 additions and 508 deletions
--- a/libs/partners/mongodb/langchain_mongodb/utils.py
+++ b/libs/partners/mongodb/langchain_mongodb/utils.py
@@ -1,6 +1,13 @@
-"""
-Tools for the Maximal Marginal Relevance (MMR) reranking.
-Duplicated from langchain_community to avoid cross-dependencies.
+"""Various Utility Functions
+
+- Tools for handling bson.ObjectId
+
+These help because IDs live as ObjectId in MongoDB but as str in LangChain and JSON.
+
+
+- Tools for the Maximal Marginal Relevance (MMR) reranking
+
+These are duplicated from langchain_community to avoid cross-dependencies.

 Functions "maximal_marginal_relevance" and "cosine_similarity"
 are duplicated in this utility respectively from modules:
@@ -21,11 +28,6 @@ logger = logging.getLogger(__name__)
 Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]


-class FailCode:
-    INDEX_NOT_FOUND = 27
-    INDEX_ALREADY_EXISTS = 68
-
-
 def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Row-wise cosine similarity between two equal-width matrices."""
    if len(X) == 0 or len(Y) == 0:
@@ -65,7 +67,37 @@ def maximal_marginal_relevance(
    lambda_mult: float = 0.5,
    k: int = 4,
 ) -> List[int]:
-    """Calculate maximal marginal relevance."""
+    """Compute Maximal Marginal Relevance (MMR).
+
+    MMR is a technique used to select documents that are both relevant to the query
+    and diverse among themselves. This function returns the indices
+    of the top-k embeddings that maximize the marginal relevance.
+
+    Args:
+        query_embedding (np.ndarray): The embedding vector of the query.
+        embedding_list (list of np.ndarray): A list containing the embedding vectors
+            of the candidate documents.
+        lambda_mult (float, optional): The trade-off parameter between
+            relevance and diversity. Defaults to 0.5.
+        k (int, optional): The number of embeddings to select. Defaults to 4.
+
+    Returns:
+        list of int: The indices of the embeddings that maximize the marginal relevance.
+
+    Notes:
+        The Maximal Marginal Relevance (MMR) is computed using the following formula:
+
+    MMR = argmax_{D_i ∈ R ∖ S} [λ * Sim(D_i, Q) - (1 - λ) * max_{D_j ∈ S} Sim(D_i, D_j)]
+
+        where:
+        - R is the set of candidate documents,
+        - S is the set of selected documents,
+        - Q is the query embedding,
+        - Sim(D_i, Q) is the similarity between document D_i and the query,
+        - Sim(D_i, D_j) is the similarity between documents D_i and D_j,
+        - λ is the trade-off parameter.
+    """
+
    if min(k, len(embedding_list)) <= 0:
        return []
    if query_embedding.ndim == 1:
@@ -137,6 +169,7 @@ def make_serializable(
    obj: Dict[str, Any],
 ) -> None:
    """Recursively cast values in a dict to a form able to json.dump"""
+
    from bson import ObjectId

    for k, v in obj.items():