diff --git a/docs/docs/integrations/vectorstores/mongodb_atlas.ipynb b/docs/docs/integrations/vectorstores/mongodb_atlas.ipynb index af6cc334049..c0415fed23c 100644 --- a/docs/docs/integrations/vectorstores/mongodb_atlas.ipynb +++ b/docs/docs/integrations/vectorstores/mongodb_atlas.ipynb @@ -143,6 +143,28 @@ " }\n", " ]\n", "}\n", + "```\n", + "\n", + "Additionally, if you are running a MongoDB M10 cluster with server version 6.0+, you can leverage the `MongoDBAtlasVectorSearch.create_vector_search_index` method. To create the above index, its usage would look like this:\n", + "\n", + "```python\n", + "from langchain_community.embeddings.openai import OpenAIEmbeddings\n", + "from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch\n", + "from pymongo import MongoClient\n", + "\n", + "mongo_client = MongoClient(\"\")\n", + "collection = mongo_client[\"\"][\"\"]\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "vectorstore = MongoDBAtlasVectorSearch(\n", + " collection=collection,\n", + " embedding=embeddings,\n", + " index_name=\"\",\n", + " relevance_score_fn=\"cosine\",\n", + ")\n", + "\n", + "# Creates an index using the index_name provided and relevance_score_fn type\n", + "vectorstore.create_vector_search_index(dimensions=1536)\n", "```" ] }, @@ -296,6 +318,16 @@ " }\n", " ]\n", "}\n", + "```\n", + "\n", + "You can also update the index programmatically using the `MongoDBAtlasVectorSearch.create_vector_search_index` method by passing `update=True`.\n", + "\n", + "```python\n", + "vectorstore.create_vector_search_index(\n", + " dimensions=1536,\n", + " filters=[{\"type\":\"filter\", \"path\":\"page\"}],\n", + " update=True\n", + ")\n", "```" ] }, diff --git a/libs/partners/mongodb/README.md b/libs/partners/mongodb/README.md index ca420328a28..a74fe65947a 100644 --- a/libs/partners/mongodb/README.md +++ b/libs/partners/mongodb/README.md @@ -6,7 +6,7 @@ pip install -U langchain-mongodb ``` # Usage -- See [integrations doc](../../../docs/docs/integrations/vectorstores/mongodb.ipynb) for more in-depth usage instructions. +- See [integrations doc](../../../docs/docs/integrations/providers/mongodb_atlas.ipynb) for more in-depth usage instructions. - See [Getting Started with the LangChain Integration](https://www.mongodb.com/docs/atlas/atlas-vector-search/ai-integrations/langchain/#get-started-with-the-langchain-integration) for a walkthrough on using your first LangChain implementation with MongoDB Atlas.
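The helpers in `langchain_mongodb.index` introduced further down in this diff can also be called directly, which is handy for tearing an index back down after experiments. A minimal sketch, assuming placeholder connection details and that vectors live in an `embedding` field (the names in angle brackets are not part of this PR):

```python
from pymongo import MongoClient

from langchain_mongodb.index import (
    create_vector_search_index,
    drop_vector_search_index,
)

# Placeholder connection string, database, collection, and index names.
client = MongoClient("<connection-string>")
collection = client["<db>"]["<collection>"]

# Create the vectorSearch index directly on the collection, without going
# through the MongoDBAtlasVectorSearch wrapper.
create_vector_search_index(
    collection=collection,
    index_name="<index-name>",
    dimensions=1536,
    path="embedding",  # assumed field holding the vectors
    similarity="cosine",
    filters=[],
)

# Remove the index again when it is no longer needed.
drop_vector_search_index(collection, "<index-name>")
```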
## Using MongoDBAtlasVectorSearch diff --git a/libs/partners/mongodb/langchain_mongodb/index.py b/libs/partners/mongodb/langchain_mongodb/index.py new file mode 100644 index 00000000000..a2c0e10ed66 --- /dev/null +++ b/libs/partners/mongodb/langchain_mongodb/index.py @@ -0,0 +1,105 @@ +import logging +from typing import Any, Dict, List, Optional + +from pymongo.collection import Collection +from pymongo.operations import SearchIndexModel + +logger = logging.getLogger(__file__) + + +def _vector_search_index_definition( + dimensions: int, + path: str, + similarity: str, + filters: Optional[List[Dict[str, str]]], +) -> Dict[str, Any]: + return { + "fields": [ + { + "numDimensions": dimensions, + "path": path, + "similarity": similarity, + "type": "vector", + }, + *(filters or []), + ] + } + + +def create_vector_search_index( + collection: Collection, + index_name: str, + dimensions: int, + path: str, + similarity: str, + filters: List[Dict[str, str]], +) -> None: + """Experimental Utility function to create a vector search index + + Args: + collection (Collection): MongoDB Collection + index_name (str): Name of Index + dimensions (int): Number of dimensions in embedding + path (str): field with vector embedding + similarity (str): The similarity score used for the index + filters (List[Dict[str, str]]): additional filters for index definition. + """ + logger.info("Creating Search Index %s on %s", index_name, collection.name) + result = collection.create_search_index( + SearchIndexModel( + definition=_vector_search_index_definition( + dimensions=dimensions, path=path, similarity=similarity, filters=filters + ), + name=index_name, + type="vectorSearch", + ) + ) + logger.info(result) + + +def drop_vector_search_index(collection: Collection, index_name: str) -> None: + """Drop a created vector search index + + Args: + collection (Collection): MongoDB Collection with index to be dropped + index_name (str): Name of the MongoDB index + """ + logger.info( + "Dropping Search Index %s from Collection: %s", index_name, collection.name + ) + collection.drop_search_index(index_name) + logger.info("Vector Search index %s.%s dropped", collection.name, index_name) + + +def update_vector_search_index( + collection: Collection, + index_name: str, + dimensions: int, + path: str, + similarity: str, + filters: List[Dict[str, str]], +) -> None: + """Leverages the updateSearchIndex call + + Args: + collection (Collection): MongoDB Collection + index_name (str): Name of Index + dimensions (int): Number of dimensions in embedding. + path (str): field with vector embedding. + similarity (str): The similarity score used for the index. + filters (List[Dict[str, str]]): additional filters for index definition. 
+ """ + + logger.info( + "Updating Search Index %s from Collection: %s", index_name, collection.name + ) + collection.update_search_index( + name=index_name, + definition=_vector_search_index_definition( + dimensions=dimensions, + path=path, + similarity=similarity, + filters=filters, + ), + ) + logger.info("Update succeeded") diff --git a/libs/partners/mongodb/langchain_mongodb/utils.py b/libs/partners/mongodb/langchain_mongodb/utils.py index 9c345e1520b..cea4b8c0446 100644 --- a/libs/partners/mongodb/langchain_mongodb/utils.py +++ b/libs/partners/mongodb/langchain_mongodb/utils.py @@ -18,6 +18,11 @@ logger = logging.getLogger(__name__) Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray] +class FailCode: + INDEX_NOT_FOUND = 27 + INDEX_ALREADY_EXISTS = 68 + + def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: """Row-wise cosine similarity between two equal-width matrices.""" if len(X) == 0 or len(Y) == 0: diff --git a/libs/partners/mongodb/langchain_mongodb/vectorstores.py b/libs/partners/mongodb/langchain_mongodb/vectorstores.py index a703d59c510..50848146193 100644 --- a/libs/partners/mongodb/langchain_mongodb/vectorstores.py +++ b/libs/partners/mongodb/langchain_mongodb/vectorstores.py @@ -24,7 +24,12 @@ from langchain_core.vectorstores import VectorStore from pymongo import MongoClient from pymongo.collection import Collection from pymongo.driver_info import DriverInfo +from pymongo.errors import CollectionInvalid +from langchain_mongodb.index import ( + create_vector_search_index, + update_vector_search_index, +) from langchain_mongodb.utils import maximal_marginal_relevance MongoDBDocumentType = TypeVar("MongoDBDocumentType", bound=Dict[str, Any]) @@ -489,3 +494,42 @@ class MongoDBAtlasVectorSearch(VectorStore): lambda_mult=lambda_mult, **kwargs, ) + + def create_vector_search_index( + self, + dimensions: int, + filters: Optional[List[Dict[str, str]]] = None, + update: bool = False, + ) -> None: + """Creates a MongoDB Atlas vectorSearch index for the VectorStore + + Note**: This method may fail as it requires a MongoDB Atlas with + these pre-requisites: + - M10 cluster or higher + - https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#prerequisites + + Args: + dimensions (int): Number of dimensions in embedding + filters (Optional[List[Dict[str, str]]], optional): additional filters + for index definition. + Defaults to None. + update (bool, optional): Updates existing vectorSearch index. + Defaults to False. 
+ """ + try: + self._collection.database.create_collection(self._collection.name) + except CollectionInvalid: + pass + + index_operation = ( + update_vector_search_index if update else create_vector_search_index + ) + + index_operation( + collection=self._collection, + index_name=self._index_name, + dimensions=dimensions, + path=self._embedding_key, + similarity=self._relevance_score_fn, + filters=filters or [], + ) diff --git a/libs/partners/mongodb/tests/integration_tests/test_vectorstores.py b/libs/partners/mongodb/tests/integration_tests/test_vectorstores.py index 451ff291134..6767563e245 100644 --- a/libs/partners/mongodb/tests/integration_tests/test_vectorstores.py +++ b/libs/partners/mongodb/tests/integration_tests/test_vectorstores.py @@ -3,22 +3,27 @@ from __future__ import annotations import os -from time import sleep -from typing import Any, Dict, List +from time import monotonic, sleep +from typing import Any, Dict, List, Optional import pytest from langchain_core.documents import Document from langchain_core.embeddings import Embeddings from pymongo import MongoClient from pymongo.collection import Collection +from pymongo.errors import OperationFailure from langchain_mongodb import MongoDBAtlasVectorSearch +from langchain_mongodb.index import drop_vector_search_index from tests.utils import ConsistentFakeEmbeddings INDEX_NAME = "langchain-test-index-vectorstores" +INDEX_CREATION_NAME = "langchain-test-index-vectorstores-create-test" NAMESPACE = "langchain_test_db.langchain_test_vectorstores" CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI") DB_NAME, COLLECTION_NAME = NAMESPACE.split(".") +INDEX_COLLECTION_NAME = "langchain_test_vectorstores_index" +INDEX_DB_NAME = "langchain_test_index_db" DIMENSIONS = 1536 TIMEOUT = 10.0 INTERVAL = 0.5 @@ -28,16 +33,53 @@ class PatchedMongoDBAtlasVectorSearch(MongoDBAtlasVectorSearch): def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List: """Patched insert_texts that waits for data to be indexed before returning""" ids = super()._insert_texts(texts, metadatas) - timeout = TIMEOUT - while len(ids) != self.similarity_search("sandwich") and timeout >= 0: + start = monotonic() + while len(ids) != self.similarity_search("sandwich") and ( + monotonic() - start <= TIMEOUT + ): sleep(INTERVAL) - timeout -= INTERVAL return ids + def create_vector_search_index( + self, + dimensions: int, + filters: Optional[List[Dict[str, str]]] = None, + update: bool = False, + ) -> None: + result = super().create_vector_search_index( + dimensions=dimensions, filters=filters, update=update + ) + start = monotonic() + while monotonic() - start <= TIMEOUT: + if indexes := list( + self._collection.list_search_indexes(name=self._index_name) + ): + if indexes[0].get("status") == "READY": + return result + sleep(INTERVAL) -def get_collection() -> Collection: + raise TimeoutError(f"{self._index_name} never reached 'status: READY'") + + +def _await_index_deletion(coll: Collection, index_name: str) -> None: + start = monotonic() + try: + drop_vector_search_index(coll, index_name) + except OperationFailure: + # This most likely means an ongoing drop request was made so skip + pass + + while list(coll.list_search_indexes(name=index_name)): + if monotonic() - start > TIMEOUT: + raise TimeoutError(f"Index Name: {index_name} never dropped") + sleep(INTERVAL) + + +def get_collection( + database_name: str = DB_NAME, collection_name: str = COLLECTION_NAME +) -> Collection: test_client: MongoClient = MongoClient(CONNECTION_STRING) - return 
test_client[DB_NAME][COLLECTION_NAME] + return test_client[database_name][collection_name] @pytest.fixture() @@ -45,6 +87,11 @@ def collection() -> Collection: return get_collection() +@pytest.fixture() +def index_collection() -> Collection: + return get_collection(INDEX_DB_NAME, INDEX_COLLECTION_NAME) + + class TestMongoDBAtlasVectorSearch: @classmethod def setup_class(cls) -> None: @@ -65,6 +112,11 @@ class TestMongoDBAtlasVectorSearch: # delete all the documents in the collection collection.delete_many({}) # type: ignore[index] + # delete all indexes on index collection name + _await_index_deletion( + get_collection(INDEX_DB_NAME, INDEX_COLLECTION_NAME), INDEX_CREATION_NAME + ) + @pytest.fixture def embedding_openai(self) -> Embeddings: return ConsistentFakeEmbeddings(DIMENSIONS) @@ -85,7 +137,6 @@ class TestMongoDBAtlasVectorSearch: collection=collection, index_name=INDEX_NAME, ) - # sleep(5) # waits for mongot to update Lucene's index output = vectorstore.similarity_search("Sandwich", k=1) assert len(output) == 1 # Check for the presence of the metadata key @@ -150,7 +201,6 @@ class TestMongoDBAtlasVectorSearch: collection=collection, index_name=INDEX_NAME, ) - # sleep(5) # waits for mongot to update Lucene's index output = vectorstore.similarity_search("Sandwich", k=1) assert len(output) == 1 @@ -172,7 +222,6 @@ class TestMongoDBAtlasVectorSearch: collection=collection, index_name=INDEX_NAME, ) - # sleep(5) # waits for mongot to update Lucene's index output = vectorstore.similarity_search("Sandwich", k=1) assert len(output) == 1 # Check for the presence of the metadata key @@ -195,7 +244,6 @@ class TestMongoDBAtlasVectorSearch: collection=collection, index_name=INDEX_NAME, ) - # sleep(5) # waits for mongot to update Lucene's index output = vectorstore.similarity_search( "Sandwich", k=1, pre_filter={"c": {"$lte": 0}} ) @@ -209,9 +257,25 @@ class TestMongoDBAtlasVectorSearch: collection=collection, index_name=INDEX_NAME, ) - # sleep(5) # waits for mongot to update Lucene's index query = "foo" output = vectorstore.max_marginal_relevance_search(query, k=10, lambda_mult=0.1) assert len(output) == len(texts) assert output[0].page_content == "foo" assert output[1].page_content != "foo" + + def test_index_creation( + self, embedding_openai: Embeddings, index_collection: Any + ) -> None: + vectorstore = PatchedMongoDBAtlasVectorSearch( + index_collection, embedding_openai, index_name=INDEX_CREATION_NAME + ) + vectorstore.create_vector_search_index(dimensions=1536) + + def test_index_update( + self, embedding_openai: Embeddings, index_collection: Any + ) -> None: + vectorstore = PatchedMongoDBAtlasVectorSearch( + index_collection, embedding_openai, index_name=INDEX_CREATION_NAME + ) + vectorstore.create_vector_search_index(dimensions=1536) + vectorstore.create_vector_search_index(dimensions=1536, update=True)
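The patched test helpers above poll `list_search_indexes` until the new index reports `status: READY`. Application code that creates indexes at startup may want a similar guard before issuing queries; a minimal sketch of such a wait loop, mirroring the polling in these tests (the function name and the 60-second timeout are assumptions, not part of this PR):

```python
from time import monotonic, sleep

from pymongo.collection import Collection


def wait_for_vector_search_index(
    collection: Collection,
    index_name: str,
    timeout: float = 60.0,
    interval: float = 0.5,
) -> None:
    """Block until the named Atlas Search index reports status READY."""
    start = monotonic()
    while monotonic() - start <= timeout:
        # list_search_indexes returns the index metadata, including its status.
        indexes = list(collection.list_search_indexes(name=index_name))
        if indexes and indexes[0].get("status") == "READY":
            return
        sleep(interval)
    raise TimeoutError(f"{index_name} never reached 'status: READY'")
```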