mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-13 05:25:07 +00:00
mongodb[minor]: MongoDB Partner Package -- Porting MongoDBAtlasVectorSearch (#17652)
This PR migrates the existing MongoDBAtlasVectorSearch abstraction from
the `langchain_community` section to the partners package section of the
codebase.
- [x] Run the partner package script as advised in the partner-packages
documentation.
- [x] Add Unit Tests
- [x] Migrate Integration Tests
- [x] Refactor `MongoDBAtlasVectorStore` (autogenerated) to
`MongoDBAtlasVectorSearch`
- [x] ~Remove~ deprecate the old `langchain_community` VectorStore
references.
## Additional Callouts
- Implemented the `delete` method
- Included any missing async function implementations
- `amax_marginal_relevance_search_by_vector`
- `adelete`
- Added new Unit Tests that test for functionality of
`MongoDBVectorSearch` methods
- Removed [`del
res[self._embedding_key]`](e0c81e1cb0/libs/community/langchain_community/vectorstores/mongodb_atlas.py (L218)
)
in `_similarity_search_with_score` function as it would make the
`maximal_marginal_relevance` function fail otherwise. The `Document`
needs to store the embedding key in metadata to work.
Checklist:
- [x] PR title: Please title your PR "package: description", where
"package" is whichever of langchain, community, core, experimental, etc.
is being modified. Use "docs: ..." for purely docs changes, "templates:
..." for template changes, "infra: ..." for CI changes.
- Example: "community: add foobar LLM"
- [x] PR message
- [x] Pass lint and test: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified to check that you're
passing lint and testing. See contribution guidelines for more
information on how to write/run tests, lint, etc:
https://python.langchain.com/docs/contributing/
- [x] Add tests and docs: If you're adding a new integration, please
include
1. Existing tests supplied in docs/docs do not change. Updated
docstrings for new functions like `delete`
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory. (This already exists)
If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, hwchase17.
---------
Co-authored-by: Steven Silvester <steven.silvester@ieee.org>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
7
libs/partners/mongodb/langchain_mongodb/__init__.py
Normal file
7
libs/partners/mongodb/langchain_mongodb/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from langchain_mongodb.vectorstores import (
|
||||
MongoDBAtlasVectorSearch,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"MongoDBAtlasVectorSearch",
|
||||
]
|
0
libs/partners/mongodb/langchain_mongodb/py.typed
Normal file
0
libs/partners/mongodb/langchain_mongodb/py.typed
Normal file
87
libs/partners/mongodb/langchain_mongodb/utils.py
Normal file
87
libs/partners/mongodb/langchain_mongodb/utils.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
Tools for the Maximal Marginal Relevance (MMR) reranking.
|
||||
Duplicated from langchain_community to avoid cross-dependencies.
|
||||
|
||||
Functions "maximal_marginal_relevance" and "cosine_similarity"
|
||||
are duplicated in this utility respectively from modules:
|
||||
- "libs/community/langchain_community/vectorstores/utils.py"
|
||||
- "libs/community/langchain_community/utils/math.py"
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
|
||||
|
||||
|
||||
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
|
||||
"""Row-wise cosine similarity between two equal-width matrices."""
|
||||
if len(X) == 0 or len(Y) == 0:
|
||||
return np.array([])
|
||||
|
||||
X = np.array(X)
|
||||
Y = np.array(Y)
|
||||
if X.shape[1] != Y.shape[1]:
|
||||
raise ValueError(
|
||||
f"Number of columns in X and Y must be the same. X has shape {X.shape} "
|
||||
f"and Y has shape {Y.shape}."
|
||||
)
|
||||
try:
|
||||
import simsimd as simd # type: ignore
|
||||
|
||||
X = np.array(X, dtype=np.float32)
|
||||
Y = np.array(Y, dtype=np.float32)
|
||||
Z = 1 - simd.cdist(X, Y, metric="cosine")
|
||||
if isinstance(Z, float):
|
||||
return np.array([Z])
|
||||
return Z
|
||||
except ImportError:
|
||||
logger.info(
|
||||
"Unable to import simsimd, defaulting to NumPy implementation. If you want "
|
||||
"to use simsimd please install with `pip install simsimd`."
|
||||
)
|
||||
X_norm = np.linalg.norm(X, axis=1)
|
||||
Y_norm = np.linalg.norm(Y, axis=1)
|
||||
# Ignore divide by zero errors run time warnings as those are handled below.
|
||||
with np.errstate(divide="ignore", invalid="ignore"):
|
||||
similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
|
||||
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
|
||||
return similarity
|
||||
|
||||
|
||||
def maximal_marginal_relevance(
|
||||
query_embedding: np.ndarray,
|
||||
embedding_list: list,
|
||||
lambda_mult: float = 0.5,
|
||||
k: int = 4,
|
||||
) -> List[int]:
|
||||
"""Calculate maximal marginal relevance."""
|
||||
if min(k, len(embedding_list)) <= 0:
|
||||
return []
|
||||
if query_embedding.ndim == 1:
|
||||
query_embedding = np.expand_dims(query_embedding, axis=0)
|
||||
similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0]
|
||||
most_similar = int(np.argmax(similarity_to_query))
|
||||
idxs = [most_similar]
|
||||
selected = np.array([embedding_list[most_similar]])
|
||||
while len(idxs) < min(k, len(embedding_list)):
|
||||
best_score = -np.inf
|
||||
idx_to_add = -1
|
||||
similarity_to_selected = cosine_similarity(embedding_list, selected)
|
||||
for i, query_score in enumerate(similarity_to_query):
|
||||
if i in idxs:
|
||||
continue
|
||||
redundant_score = max(similarity_to_selected[i])
|
||||
equation_score = (
|
||||
lambda_mult * query_score - (1 - lambda_mult) * redundant_score
|
||||
)
|
||||
if equation_score > best_score:
|
||||
best_score = equation_score
|
||||
idx_to_add = i
|
||||
idxs.append(idx_to_add)
|
||||
selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
|
||||
return idxs
|
463
libs/partners/mongodb/langchain_mongodb/vectorstores.py
Normal file
463
libs/partners/mongodb/langchain_mongodb/vectorstores.py
Normal file
@@ -0,0 +1,463 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from importlib.metadata import version
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Generator,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.runnables.config import run_in_executor
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
from pymongo import MongoClient
|
||||
from pymongo.collection import Collection
|
||||
from pymongo.driver_info import DriverInfo
|
||||
|
||||
from langchain_mongodb.utils import maximal_marginal_relevance
|
||||
|
||||
MongoDBDocumentType = TypeVar("MongoDBDocumentType", bound=Dict[str, Any])
|
||||
VST = TypeVar("VST", bound=VectorStore)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_INSERT_BATCH_SIZE = 100
|
||||
|
||||
|
||||
class MongoDBAtlasVectorSearch(VectorStore):
|
||||
"""`MongoDB Atlas Vector Search` vector store.
|
||||
|
||||
To use, you should have both:
|
||||
- the ``pymongo`` python package installed
|
||||
- a connection string associated with a MongoDB Atlas Cluster having deployed an
|
||||
Atlas Search index
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
|
||||
from langchain_community.embeddings.openai import OpenAIEmbeddings
|
||||
from pymongo import MongoClient
|
||||
|
||||
mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
|
||||
collection = mongo_client["<db_name>"]["<collection_name>"]
|
||||
embeddings = OpenAIEmbeddings()
|
||||
vectorstore = MongoDBAtlasVectorSearch(collection, embeddings)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
collection: Collection[MongoDBDocumentType],
|
||||
embedding: Embeddings,
|
||||
*,
|
||||
index_name: str = "default",
|
||||
text_key: str = "text",
|
||||
embedding_key: str = "embedding",
|
||||
relevance_score_fn: str = "cosine",
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
collection: MongoDB collection to add the texts to.
|
||||
embedding: Text embedding model to use.
|
||||
text_key: MongoDB field that will contain the text for each
|
||||
document.
|
||||
defaults to 'text'
|
||||
embedding_key: MongoDB field that will contain the embedding for
|
||||
each document.
|
||||
defaults to 'embedding'
|
||||
index_name: Name of the Atlas Search index.
|
||||
defaults to 'default'
|
||||
relevance_score_fn: The similarity score used for the index.
|
||||
defaults to 'cosine'
|
||||
Currently supported: 'euclidean', 'cosine', and 'dotProduct'.
|
||||
"""
|
||||
self._collection = collection
|
||||
self._embedding = embedding
|
||||
self._index_name = index_name
|
||||
self._text_key = text_key
|
||||
self._embedding_key = embedding_key
|
||||
self._relevance_score_fn = relevance_score_fn
|
||||
|
||||
@property
|
||||
def embeddings(self) -> Embeddings:
|
||||
return self._embedding
|
||||
|
||||
def _select_relevance_score_fn(self) -> Callable[[float], float]:
|
||||
scoring: dict[str, Callable] = {
|
||||
"euclidean": self._euclidean_relevance_score_fn,
|
||||
"dotProduct": self._max_inner_product_relevance_score_fn,
|
||||
"cosine": self._cosine_relevance_score_fn,
|
||||
}
|
||||
if self._relevance_score_fn in scoring:
|
||||
return scoring[self._relevance_score_fn]
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"No relevance score function for ${self._relevance_score_fn}"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_connection_string(
|
||||
cls,
|
||||
connection_string: str,
|
||||
namespace: str,
|
||||
embedding: Embeddings,
|
||||
**kwargs: Any,
|
||||
) -> MongoDBAtlasVectorSearch:
|
||||
"""Construct a `MongoDB Atlas Vector Search` vector store
|
||||
from a MongoDB connection URI.
|
||||
|
||||
Args:
|
||||
connection_string: A valid MongoDB connection URI.
|
||||
namespace: A valid MongoDB namespace (database and collection).
|
||||
embedding: The text embedding model to use for the vector store.
|
||||
|
||||
Returns:
|
||||
A new MongoDBAtlasVectorSearch instance.
|
||||
|
||||
"""
|
||||
client: MongoClient = MongoClient(
|
||||
connection_string,
|
||||
driver=DriverInfo(name="Langchain", version=version("langchain")),
|
||||
)
|
||||
db_name, collection_name = namespace.split(".")
|
||||
collection = client[db_name][collection_name]
|
||||
return cls(collection, embedding, **kwargs)
|
||||
|
||||
def add_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[Dict[str, Any]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List:
|
||||
"""Run more texts through the embeddings and add to the vectorstore.
|
||||
|
||||
Args:
|
||||
texts: Iterable of strings to add to the vectorstore.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
"""
|
||||
batch_size = kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE)
|
||||
_metadatas: Union[List, Generator] = metadatas or ({} for _ in texts)
|
||||
texts_batch = []
|
||||
metadatas_batch = []
|
||||
result_ids = []
|
||||
for i, (text, metadata) in enumerate(zip(texts, _metadatas)):
|
||||
texts_batch.append(text)
|
||||
metadatas_batch.append(metadata)
|
||||
if (i + 1) % batch_size == 0:
|
||||
result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
|
||||
texts_batch = []
|
||||
metadatas_batch = []
|
||||
if texts_batch:
|
||||
result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
|
||||
return result_ids
|
||||
|
||||
def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
|
||||
if not texts:
|
||||
return []
|
||||
# Embed and create the documents
|
||||
embeddings = self._embedding.embed_documents(texts)
|
||||
to_insert = [
|
||||
{self._text_key: t, self._embedding_key: embedding, **m}
|
||||
for t, m, embedding in zip(texts, metadatas, embeddings)
|
||||
]
|
||||
# insert the documents in MongoDB Atlas
|
||||
insert_result = self._collection.insert_many(to_insert) # type: ignore
|
||||
return insert_result.inserted_ids
|
||||
|
||||
def _similarity_search_with_score(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
params = {
|
||||
"queryVector": embedding,
|
||||
"path": self._embedding_key,
|
||||
"numCandidates": k * 10,
|
||||
"limit": k,
|
||||
"index": self._index_name,
|
||||
}
|
||||
if pre_filter:
|
||||
params["filter"] = pre_filter
|
||||
query = {"$vectorSearch": params}
|
||||
|
||||
pipeline = [
|
||||
query,
|
||||
{"$set": {"score": {"$meta": "vectorSearchScore"}}},
|
||||
]
|
||||
if post_filter_pipeline is not None:
|
||||
pipeline.extend(post_filter_pipeline)
|
||||
cursor = self._collection.aggregate(pipeline) # type: ignore[arg-type]
|
||||
docs = []
|
||||
for res in cursor:
|
||||
text = res.pop(self._text_key)
|
||||
score = res.pop("score")
|
||||
docs.append((Document(page_content=text, metadata=res), score))
|
||||
return docs
|
||||
|
||||
def similarity_search_with_score(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return MongoDB documents most similar to the given query and their scores.
|
||||
|
||||
Uses the vectorSearch operator available in MongoDB Atlas Search.
|
||||
For more: https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: (Optional) number of documents to return. Defaults to 4.
|
||||
pre_filter: (Optional) dictionary of argument(s) to prefilter document
|
||||
fields on.
|
||||
post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
|
||||
following the vectorSearch stage.
|
||||
|
||||
Returns:
|
||||
List of documents most similar to the query and their scores.
|
||||
"""
|
||||
embedding = self._embedding.embed_query(query)
|
||||
docs = self._similarity_search_with_score(
|
||||
embedding,
|
||||
k=k,
|
||||
pre_filter=pre_filter,
|
||||
post_filter_pipeline=post_filter_pipeline,
|
||||
)
|
||||
return docs
|
||||
|
||||
def similarity_search(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return MongoDB documents most similar to the given query.
|
||||
|
||||
Uses the vectorSearch operator available in MongoDB Atlas Search.
|
||||
For more: https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: (Optional) number of documents to return. Defaults to 4.
|
||||
pre_filter: (Optional) dictionary of argument(s) to prefilter document
|
||||
fields on.
|
||||
post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
|
||||
following the vectorSearch stage.
|
||||
|
||||
Returns:
|
||||
List of documents most similar to the query and their scores.
|
||||
"""
|
||||
additional = kwargs.get("additional")
|
||||
docs_and_scores = self.similarity_search_with_score(
|
||||
query,
|
||||
k=k,
|
||||
pre_filter=pre_filter,
|
||||
post_filter_pipeline=post_filter_pipeline,
|
||||
)
|
||||
|
||||
if additional and "similarity_score" in additional:
|
||||
for doc, score in docs_and_scores:
|
||||
doc.metadata["score"] = score
|
||||
return [doc for doc, _ in docs_and_scores]
|
||||
|
||||
def max_marginal_relevance_search(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return documents selected using the maximal marginal relevance.
|
||||
|
||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||
among selected documents.
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: (Optional) number of documents to return. Defaults to 4.
|
||||
fetch_k: (Optional) number of documents to fetch before passing to MMR
|
||||
algorithm. Defaults to 20.
|
||||
lambda_mult: Number between 0 and 1 that determines the degree
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
pre_filter: (Optional) dictionary of argument(s) to prefilter on document
|
||||
fields.
|
||||
post_filter_pipeline: (Optional) pipeline of MongoDB aggregation stages
|
||||
following the vectorSearch stage.
|
||||
Returns:
|
||||
List of documents selected by maximal marginal relevance.
|
||||
"""
|
||||
query_embedding = self._embedding.embed_query(query)
|
||||
docs = self._similarity_search_with_score(
|
||||
query_embedding,
|
||||
k=fetch_k,
|
||||
pre_filter=pre_filter,
|
||||
post_filter_pipeline=post_filter_pipeline,
|
||||
)
|
||||
mmr_doc_indexes = maximal_marginal_relevance(
|
||||
np.array(query_embedding),
|
||||
[doc.metadata[self._embedding_key] for doc, _ in docs],
|
||||
k=k,
|
||||
lambda_mult=lambda_mult,
|
||||
)
|
||||
mmr_docs = [docs[i][0] for i in mmr_doc_indexes]
|
||||
return mmr_docs
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
cls,
|
||||
texts: List[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[Dict]] = None,
|
||||
collection: Optional[Collection[MongoDBDocumentType]] = None,
|
||||
**kwargs: Any,
|
||||
) -> MongoDBAtlasVectorSearch:
|
||||
"""Construct a `MongoDB Atlas Vector Search` vector store from raw documents.
|
||||
|
||||
This is a user-friendly interface that:
|
||||
1. Embeds documents.
|
||||
2. Adds the documents to a provided MongoDB Atlas Vector Search index
|
||||
(Lucene)
|
||||
|
||||
This is intended to be a quick way to get started.
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
from pymongo import MongoClient
|
||||
|
||||
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
|
||||
from langchain_community.embeddings import OpenAIEmbeddings
|
||||
|
||||
mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
|
||||
collection = mongo_client["<db_name>"]["<collection_name>"]
|
||||
embeddings = OpenAIEmbeddings()
|
||||
vectorstore = MongoDBAtlasVectorSearch.from_texts(
|
||||
texts,
|
||||
embeddings,
|
||||
metadatas=metadatas,
|
||||
collection=collection
|
||||
)
|
||||
"""
|
||||
if collection is None:
|
||||
raise ValueError("Must provide 'collection' named parameter.")
|
||||
vectorstore = cls(collection, embedding, **kwargs)
|
||||
vectorstore.add_texts(texts, metadatas=metadatas)
|
||||
return vectorstore
|
||||
|
||||
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
|
||||
"""Delete by ObjectId or other criteria.
|
||||
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
**kwargs: Other keyword arguments that subclasses might use.
|
||||
|
||||
Returns:
|
||||
Optional[bool]: True if deletion is successful,
|
||||
False otherwise, None if not implemented.
|
||||
"""
|
||||
search_params: dict[str, Any] = {}
|
||||
if ids:
|
||||
search_params[self._text_key]["$in"] = ids
|
||||
|
||||
return self._collection.delete_many({**search_params, **kwargs}).acknowledged
|
||||
|
||||
async def adelete(
|
||||
self, ids: Optional[List[str]] = None, **kwargs: Any
|
||||
) -> Optional[bool]:
|
||||
"""Delete by vector ID or other criteria.
|
||||
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
**kwargs: Other keyword arguments that subclasses might use.
|
||||
|
||||
Returns:
|
||||
Optional[bool]: True if deletion is successful,
|
||||
False otherwise, None if not implemented.
|
||||
"""
|
||||
return await run_in_executor(None, self.delete, ids=ids, **kwargs)
|
||||
|
||||
def max_marginal_relevance_search_by_vector(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]: # type: ignore
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
|
||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||
among selected documents.
|
||||
|
||||
Args:
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
||||
lambda_mult: Number between 0 and 1 that determines the degree
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
pre_filter: (Optional) dictionary of argument(s) to prefilter on document
|
||||
fields.
|
||||
post_filter_pipeline: (Optional) pipeline of MongoDB aggregation stages
|
||||
following the vectorSearch stage.
|
||||
Returns:
|
||||
List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
docs = self._similarity_search_with_score(
|
||||
embedding,
|
||||
k=fetch_k,
|
||||
pre_filter=pre_filter,
|
||||
post_filter_pipeline=post_filter_pipeline,
|
||||
)
|
||||
mmr_doc_indexes = maximal_marginal_relevance(
|
||||
np.array(embedding),
|
||||
[doc.metadata[self._embedding_key] for doc, _ in docs],
|
||||
k=k,
|
||||
lambda_mult=lambda_mult,
|
||||
)
|
||||
mmr_docs = [docs[i][0] for i in mmr_doc_indexes]
|
||||
return mmr_docs
|
||||
|
||||
async def amax_marginal_relevance_search_by_vector(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance."""
|
||||
return await run_in_executor(
|
||||
None,
|
||||
self.max_marginal_relevance_search_by_vector,
|
||||
embedding,
|
||||
k=k,
|
||||
fetch_k=fetch_k,
|
||||
lambda_mult=lambda_mult,
|
||||
**kwargs,
|
||||
)
|
Reference in New Issue
Block a user