mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-24 16:37:46 +00:00
Add support for MongoDB Atlas $vectorSearch vector search (#11139)
Adds support for the `$vectorSearch` operator for MongoDBAtlasVectorSearch, which was announced at .Local London (September 26th, 2023). This change maintains breaks compatibility support for the existing `$search` operator used by the original integration (https://github.com/langchain-ai/langchain/pull/5338) due to incompatibilities in the Atlas search implementations. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
b599f91e33
commit
2c952de21a
@ -89,6 +89,18 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
||||
embedding: Embeddings,
|
||||
**kwargs: Any,
|
||||
) -> MongoDBAtlasVectorSearch:
|
||||
"""Construct a `MongoDB Atlas Vector Search` vector store
|
||||
from a MongoDB connection URI.
|
||||
|
||||
Args:
|
||||
connection_string: A valid MongoDB connection URI.
|
||||
namespace: A valid MongoDB namespace (database and collection).
|
||||
embedding: The text embedding model to use for the vector store.
|
||||
|
||||
Returns:
|
||||
A new MongoDBAtlasVectorSearch instance.
|
||||
|
||||
"""
|
||||
try:
|
||||
from pymongo import MongoClient
|
||||
except ImportError:
|
||||
@ -149,24 +161,23 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
pre_filter: Optional[dict] = None,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
knn_beta = {
|
||||
"vector": embedding,
|
||||
params = {
|
||||
"queryVector": embedding,
|
||||
"path": self._embedding_key,
|
||||
"k": k,
|
||||
"numCandidates": k * 10,
|
||||
"limit": k,
|
||||
"index": self._index_name,
|
||||
}
|
||||
if pre_filter:
|
||||
knn_beta["filter"] = pre_filter
|
||||
params["filter"] = pre_filter
|
||||
query = {"$vectorSearch": params}
|
||||
|
||||
pipeline = [
|
||||
{
|
||||
"$search": {
|
||||
"index": self._index_name,
|
||||
"knnBeta": knn_beta,
|
||||
}
|
||||
},
|
||||
{"$set": {"score": {"$meta": "searchScore"}}},
|
||||
query,
|
||||
{"$set": {"score": {"$meta": "vectorSearchScore"}}},
|
||||
]
|
||||
if post_filter_pipeline is not None:
|
||||
pipeline.extend(post_filter_pipeline)
|
||||
@ -183,12 +194,12 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
||||
query: str,
|
||||
*,
|
||||
k: int = 4,
|
||||
pre_filter: Optional[dict] = None,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return MongoDB documents most similar to query, along with scores.
|
||||
"""Return MongoDB documents most similar to the given query and their scores.
|
||||
|
||||
Use the knnBeta Operator available in MongoDB Atlas Search
|
||||
Uses the knnBeta Operator available in MongoDB Atlas Search.
|
||||
This feature is in early access and available only for evaluation purposes, to
|
||||
validate functionality, and to gather feedback from a small closed group of
|
||||
early access users. It is not recommended for production deployments as we
|
||||
@ -197,14 +208,14 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: Optional Number of Documents to return. Defaults to 4.
|
||||
pre_filter: Optional Dictionary of argument(s) to prefilter on document
|
||||
fields.
|
||||
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
|
||||
following the knnBeta search.
|
||||
k: (Optional) number of documents to return. Defaults to 4.
|
||||
pre_filter: (Optional) dictionary of argument(s) to prefilter document
|
||||
fields on.
|
||||
post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
|
||||
following the knnBeta vector search.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query and score for each
|
||||
List of documents most similar to the query and their scores.
|
||||
"""
|
||||
embedding = self._embedding.embed_query(query)
|
||||
docs = self._similarity_search_with_score(
|
||||
@ -219,29 +230,29 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
pre_filter: Optional[dict] = None,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return MongoDB documents most similar to query.
|
||||
"""Return MongoDB documents most similar to the given query.
|
||||
|
||||
Use the knnBeta Operator available in MongoDB Atlas Search
|
||||
Uses the knnBeta Operator available in MongoDB Atlas Search.
|
||||
This feature is in early access and available only for evaluation purposes, to
|
||||
validate functionality, and to gather feedback from a small closed group of
|
||||
early access users. It is not recommended for production deployments as we may
|
||||
introduce breaking changes.
|
||||
early access users. It is not recommended for production deployments as we
|
||||
may introduce breaking changes.
|
||||
For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: Optional Number of Documents to return. Defaults to 4.
|
||||
pre_filter: Optional Dictionary of argument(s) to prefilter on document
|
||||
fields.
|
||||
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
|
||||
following the knnBeta search.
|
||||
k: (Optional) number of documents to return. Defaults to 4.
|
||||
pre_filter: (Optional) dictionary of argument(s) to prefilter document
|
||||
fields on.
|
||||
post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
|
||||
following the knnBeta vector search.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query and score for each
|
||||
List of documents most similar to the query and their scores.
|
||||
"""
|
||||
docs_and_scores = self.similarity_search_with_score(
|
||||
query,
|
||||
@ -257,30 +268,30 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
pre_filter: Optional[dict] = None,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
"""Return documents selected using the maximal marginal relevance.
|
||||
|
||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||
among selected documents.
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: Optional Number of Documents to return. Defaults to 4.
|
||||
fetch_k: Optional Number of Documents to fetch before passing to MMR
|
||||
k: (Optional) number of documents to return. Defaults to 4.
|
||||
fetch_k: (Optional) number of documents to fetch before passing to MMR
|
||||
algorithm. Defaults to 20.
|
||||
lambda_mult: Number between 0 and 1 that determines the degree
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
pre_filter: Optional Dictionary of argument(s) to prefilter on document
|
||||
pre_filter: (Optional) dictionary of argument(s) to prefilter on document
|
||||
fields.
|
||||
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
|
||||
following the knnBeta search.
|
||||
post_filter_pipeline: (Optional) pipeline of MongoDB aggregation stages
|
||||
following the knnBeta vector search.
|
||||
Returns:
|
||||
List of Documents selected by maximal marginal relevance.
|
||||
List of documents selected by maximal marginal relevance.
|
||||
"""
|
||||
query_embedding = self._embedding.embed_query(query)
|
||||
docs = self._similarity_search_with_score(
|
||||
@ -303,11 +314,11 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
||||
cls,
|
||||
texts: List[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
metadatas: Optional[List[Dict]] = None,
|
||||
collection: Optional[Collection[MongoDBDocumentType]] = None,
|
||||
**kwargs: Any,
|
||||
) -> MongoDBAtlasVectorSearch:
|
||||
"""Construct MongoDBAtlasVectorSearch wrapper from raw documents.
|
||||
"""Construct a `MongoDB Atlas Vector Search` vector store from raw documents.
|
||||
|
||||
This is a user-friendly interface that:
|
||||
1. Embeds documents.
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
|
||||
import os
|
||||
from time import sleep
|
||||
from typing import TYPE_CHECKING, Any
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
@ -11,41 +11,46 @@ from langchain.docstore.document import Document
|
||||
from langchain.schema.embeddings import Embeddings
|
||||
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pymongo import MongoClient
|
||||
|
||||
INDEX_NAME = "langchain-test-index"
|
||||
NAMESPACE = "langchain_test_db.langchain_test_collection"
|
||||
CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI")
|
||||
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
|
||||
|
||||
# Instantiate as constant instead of pytest fixture to prevent needing to make multiple
|
||||
# connections.
|
||||
|
||||
def get_collection() -> Any:
|
||||
from pymongo import MongoClient
|
||||
|
||||
@pytest.fixture
|
||||
def collection() -> Any:
|
||||
test_client = MongoClient(CONNECTION_STRING)
|
||||
test_client: MongoClient = MongoClient(CONNECTION_STRING)
|
||||
return test_client[DB_NAME][COLLECTION_NAME]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def collection() -> Any:
|
||||
return get_collection()
|
||||
|
||||
|
||||
class TestMongoDBAtlasVectorSearch:
|
||||
@classmethod
|
||||
def setup_class(cls, collection: Any) -> None:
|
||||
def setup_class(cls) -> None:
|
||||
# insure the test collection is empty
|
||||
collection = get_collection()
|
||||
assert collection.count_documents({}) == 0 # type: ignore[index] # noqa: E501
|
||||
|
||||
@classmethod
|
||||
def teardown_class(cls, collection: Any) -> None:
|
||||
def teardown_class(cls) -> None:
|
||||
collection = get_collection()
|
||||
# delete all the documents in the collection
|
||||
collection.delete_many({}) # type: ignore[index]
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup(self, collection: Any) -> None:
|
||||
def setup(self) -> None:
|
||||
collection = get_collection()
|
||||
# delete all the documents in the collection
|
||||
collection.delete_many({}) # type: ignore[index]
|
||||
|
||||
def test_from_documents(self, embedding_openai: Embeddings) -> None:
|
||||
def test_from_documents(
|
||||
self, embedding_openai: Embeddings, collection: Any
|
||||
) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
documents = [
|
||||
Document(page_content="Dogs are tough.", metadata={"a": 1}),
|
||||
@ -64,7 +69,7 @@ class TestMongoDBAtlasVectorSearch:
|
||||
assert output[0].page_content == "What is a sandwich?"
|
||||
assert output[0].metadata["c"] == 1
|
||||
|
||||
def test_from_texts(self, embedding_openai: Embeddings) -> None:
|
||||
def test_from_texts(self, embedding_openai: Embeddings, collection: Any) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
"Cats have fluff.",
|
||||
@ -81,7 +86,9 @@ class TestMongoDBAtlasVectorSearch:
|
||||
output = vectorstore.similarity_search("Sandwich", k=1)
|
||||
assert output[0].page_content == "What is a sandwich?"
|
||||
|
||||
def test_from_texts_with_metadatas(self, embedding_openai: Embeddings) -> None:
|
||||
def test_from_texts_with_metadatas(
|
||||
self, embedding_openai: Embeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
"Cats have fluff.",
|
||||
@ -102,7 +109,7 @@ class TestMongoDBAtlasVectorSearch:
|
||||
assert output[0].metadata["c"] == 1
|
||||
|
||||
def test_from_texts_with_metadatas_and_pre_filter(
|
||||
self, embedding_openai: Embeddings
|
||||
self, embedding_openai: Embeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@ -124,7 +131,7 @@ class TestMongoDBAtlasVectorSearch:
|
||||
)
|
||||
assert output == []
|
||||
|
||||
def test_mmr(self, embedding_openai: Embeddings) -> None:
|
||||
def test_mmr(self, embedding_openai: Embeddings, collection: Any) -> None:
|
||||
texts = ["foo", "foo", "fou", "foy"]
|
||||
vectorstore = MongoDBAtlasVectorSearch.from_texts(
|
||||
texts,
|
||||
|
Loading…
Reference in New Issue
Block a user