From c0fcf76e93c9111039c4c8f9939e1a0345aa7a72 Mon Sep 17 00:00:00 2001
From: Jib
Date: Wed, 26 Jun 2024 15:07:28 -0400
Subject: [PATCH] LangChain-MongoDB: [Experimental] Driver-side index creation helper (#19359)

## Description
Created a helper method to create vector search indexes via client-side PyMongo.

**Recent update** -- Removed the error-suppressing/overwriting layer in favor of letting the original exception provide the information.

## To-Dos
- [x] Add `_wait_until` helpers for the integration tests' index-deletion functionality.
- [x] Add documentation for its use. Highlight that it is experimental.
- [x] Post integration test results in a screenshot.
- [x] Get review from the MongoDB internal team (@shaneharvey, @blink1073, @NoahStapp, @caseyclements).
- [x] **Add tests and docs**: If you're adding a new integration, please include
  1. Added new integration tests. (Not eligible for unit testing, since the operation is Atlas Cloud specific.)
  2. An example notebook showing its use. It lives in the `docs/docs/integrations` directory.

![image](https://github.com/langchain-ai/langchain/assets/2887713/a3fc8ee1-e04c-4976-accc-fea0eeae028a)

- [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/
---
 .../vectorstores/mongodb_atlas.ipynb          |  32 ++++++
 libs/partners/mongodb/README.md               |   2 +-
 .../mongodb/langchain_mongodb/index.py        | 105 ++++++++++++++++++
 .../mongodb/langchain_mongodb/utils.py        |   5 +
 .../mongodb/langchain_mongodb/vectorstores.py |  44 ++++++++
 .../integration_tests/test_vectorstores.py    |  88 +++++++++++++--
 6 files changed, 263 insertions(+), 13 deletions(-)
 create mode 100644 libs/partners/mongodb/langchain_mongodb/index.py

diff --git a/docs/docs/integrations/vectorstores/mongodb_atlas.ipynb b/docs/docs/integrations/vectorstores/mongodb_atlas.ipynb
index af6cc334049..c0415fed23c 100644
--- a/docs/docs/integrations/vectorstores/mongodb_atlas.ipynb
+++ b/docs/docs/integrations/vectorstores/mongodb_atlas.ipynb
@@ -143,6 +143,28 @@
 " }\n",
 " ]\n",
 "}\n",
+ "```\n",
+ "\n",
+ "Additionally, if you are running a MongoDB Atlas M10+ cluster with server version 6.0+, you can use the `MongoDBAtlasVectorSearch.create_vector_search_index` helper to create the above index programmatically. Its usage looks like this.\n",
+ "\n",
+ "```python\n",
+ "from langchain_community.embeddings.openai import OpenAIEmbeddings\n",
+ "from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch\n",
+ "from pymongo import MongoClient\n",
+ "\n",
+ "mongo_client = MongoClient(\"\")\n",
+ "collection = mongo_client[\"\"][\"\"]\n",
+ "embeddings = OpenAIEmbeddings()\n",
+ "\n",
+ "vectorstore = MongoDBAtlasVectorSearch(\n",
+ "    collection=collection,\n",
+ "    embedding=embeddings,\n",
+ "    index_name=\"\",\n",
+ "    relevance_score_fn=\"cosine\",\n",
+ ")\n",
+ "\n",
+ "# Creates an index using the index_name provided and relevance_score_fn type\n",
+ "vectorstore.create_vector_search_index(dimensions=1536)\n",
 "```"
   ]
  },
@@ -296,6 +318,16 @@
 " }\n",
 " ]\n",
 "}\n",
+ "```\n",
+ "\n",
+ "You can also update the index programmatically using the `MongoDBAtlasVectorSearch.create_vector_search_index` method with `update=True`.\n",
+ "\n",
+ "```python\n",
+ "vectorstore.create_vector_search_index(\n",
+ "    dimensions=1536,\n",
+ "    filters=[{\"type\": \"filter\", \"path\": \"page\"}],\n",
+ "    update=True,\n",
+ ")\n",
 "```"
   ]
  },
diff --git a/libs/partners/mongodb/README.md b/libs/partners/mongodb/README.md
index ca420328a28..a74fe65947a 100644
--- a/libs/partners/mongodb/README.md
+++ b/libs/partners/mongodb/README.md
@@ -6,7 +6,7 @@
 pip install -U langchain-mongodb
 ```
 # Usage
-- See [integrations doc](../../../docs/docs/integrations/vectorstores/mongodb.ipynb) for more in-depth usage instructions.
+- See [integrations doc](../../../docs/docs/integrations/providers/mongodb_atlas.ipynb) for more in-depth usage instructions.
 - See [Getting Started with the LangChain Integration](https://www.mongodb.com/docs/atlas/atlas-vector-search/ai-integrations/langchain/#get-started-with-the-langchain-integration) for a walkthrough on using your first LangChain implementation with MongoDB Atlas.

 ## Using MongoDBAtlasVectorSearch
diff --git a/libs/partners/mongodb/langchain_mongodb/index.py b/libs/partners/mongodb/langchain_mongodb/index.py
new file mode 100644
index 00000000000..a2c0e10ed66
--- /dev/null
+++ b/libs/partners/mongodb/langchain_mongodb/index.py
@@ -0,0 +1,105 @@
+import logging
+from typing import Any, Dict, List, Optional
+
+from pymongo.collection import Collection
+from pymongo.operations import SearchIndexModel
+
+logger = logging.getLogger(__file__)
+
+
+def _vector_search_index_definition(
+    dimensions: int,
+    path: str,
+    similarity: str,
+    filters: Optional[List[Dict[str, str]]],
+) -> Dict[str, Any]:
+    return {
+        "fields": [
+            {
+                "numDimensions": dimensions,
+                "path": path,
+                "similarity": similarity,
+                "type": "vector",
+            },
+            *(filters or []),
+        ]
+    }
+
+
+def create_vector_search_index(
+    collection: Collection,
+    index_name: str,
+    dimensions: int,
+    path: str,
+    similarity: str,
+    filters: List[Dict[str, str]],
+) -> None:
+    """Experimental utility function to create a vector search index.
+
+    Args:
+        collection (Collection): MongoDB Collection
+        index_name (str): Name of the index
+        dimensions (int): Number of dimensions in the embedding
+        path (str): Field that stores the vector embedding
+        similarity (str): The similarity function used for the index
+        filters (List[Dict[str, str]]): Additional filters for the index definition.
+ """ + logger.info("Creating Search Index %s on %s", index_name, collection.name) + result = collection.create_search_index( + SearchIndexModel( + definition=_vector_search_index_definition( + dimensions=dimensions, path=path, similarity=similarity, filters=filters + ), + name=index_name, + type="vectorSearch", + ) + ) + logger.info(result) + + +def drop_vector_search_index(collection: Collection, index_name: str) -> None: + """Drop a created vector search index + + Args: + collection (Collection): MongoDB Collection with index to be dropped + index_name (str): Name of the MongoDB index + """ + logger.info( + "Dropping Search Index %s from Collection: %s", index_name, collection.name + ) + collection.drop_search_index(index_name) + logger.info("Vector Search index %s.%s dropped", collection.name, index_name) + + +def update_vector_search_index( + collection: Collection, + index_name: str, + dimensions: int, + path: str, + similarity: str, + filters: List[Dict[str, str]], +) -> None: + """Leverages the updateSearchIndex call + + Args: + collection (Collection): MongoDB Collection + index_name (str): Name of Index + dimensions (int): Number of dimensions in embedding. + path (str): field with vector embedding. + similarity (str): The similarity score used for the index. + filters (List[Dict[str, str]]): additional filters for index definition. + """ + + logger.info( + "Updating Search Index %s from Collection: %s", index_name, collection.name + ) + collection.update_search_index( + name=index_name, + definition=_vector_search_index_definition( + dimensions=dimensions, + path=path, + similarity=similarity, + filters=filters, + ), + ) + logger.info("Update succeeded") diff --git a/libs/partners/mongodb/langchain_mongodb/utils.py b/libs/partners/mongodb/langchain_mongodb/utils.py index 9c345e1520b..cea4b8c0446 100644 --- a/libs/partners/mongodb/langchain_mongodb/utils.py +++ b/libs/partners/mongodb/langchain_mongodb/utils.py @@ -18,6 +18,11 @@ logger = logging.getLogger(__name__) Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray] +class FailCode: + INDEX_NOT_FOUND = 27 + INDEX_ALREADY_EXISTS = 68 + + def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: """Row-wise cosine similarity between two equal-width matrices.""" if len(X) == 0 or len(Y) == 0: diff --git a/libs/partners/mongodb/langchain_mongodb/vectorstores.py b/libs/partners/mongodb/langchain_mongodb/vectorstores.py index a703d59c510..50848146193 100644 --- a/libs/partners/mongodb/langchain_mongodb/vectorstores.py +++ b/libs/partners/mongodb/langchain_mongodb/vectorstores.py @@ -24,7 +24,12 @@ from langchain_core.vectorstores import VectorStore from pymongo import MongoClient from pymongo.collection import Collection from pymongo.driver_info import DriverInfo +from pymongo.errors import CollectionInvalid +from langchain_mongodb.index import ( + create_vector_search_index, + update_vector_search_index, +) from langchain_mongodb.utils import maximal_marginal_relevance MongoDBDocumentType = TypeVar("MongoDBDocumentType", bound=Dict[str, Any]) @@ -489,3 +494,42 @@ class MongoDBAtlasVectorSearch(VectorStore): lambda_mult=lambda_mult, **kwargs, ) + + def create_vector_search_index( + self, + dimensions: int, + filters: Optional[List[Dict[str, str]]] = None, + update: bool = False, + ) -> None: + """Creates a MongoDB Atlas vectorSearch index for the VectorStore + + Note**: This method may fail as it requires a MongoDB Atlas with + these pre-requisites: + - M10 cluster or higher + - 
+
+        Args:
+            dimensions (int): Number of dimensions in embedding
+            filters (Optional[List[Dict[str, str]]], optional): additional filters
+                for index definition. Defaults to None.
+            update (bool, optional): Updates existing vectorSearch index.
+                Defaults to False.
+        """
+        try:
+            self._collection.database.create_collection(self._collection.name)
+        except CollectionInvalid:
+            pass
+
+        index_operation = (
+            update_vector_search_index if update else create_vector_search_index
+        )
+
+        index_operation(
+            collection=self._collection,
+            index_name=self._index_name,
+            dimensions=dimensions,
+            path=self._embedding_key,
+            similarity=self._relevance_score_fn,
+            filters=filters or [],
+        )
diff --git a/libs/partners/mongodb/tests/integration_tests/test_vectorstores.py b/libs/partners/mongodb/tests/integration_tests/test_vectorstores.py
index 451ff291134..6767563e245 100644
--- a/libs/partners/mongodb/tests/integration_tests/test_vectorstores.py
+++ b/libs/partners/mongodb/tests/integration_tests/test_vectorstores.py
@@ -3,22 +3,27 @@
 from __future__ import annotations
 
 import os
-from time import sleep
-from typing import Any, Dict, List
+from time import monotonic, sleep
+from typing import Any, Dict, List, Optional
 
 import pytest
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from pymongo import MongoClient
 from pymongo.collection import Collection
+from pymongo.errors import OperationFailure
 
 from langchain_mongodb import MongoDBAtlasVectorSearch
+from langchain_mongodb.index import drop_vector_search_index
 from tests.utils import ConsistentFakeEmbeddings
 
 INDEX_NAME = "langchain-test-index-vectorstores"
+INDEX_CREATION_NAME = "langchain-test-index-vectorstores-create-test"
 NAMESPACE = "langchain_test_db.langchain_test_vectorstores"
 CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI")
 DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
+INDEX_COLLECTION_NAME = "langchain_test_vectorstores_index"
+INDEX_DB_NAME = "langchain_test_index_db"
 DIMENSIONS = 1536
 TIMEOUT = 10.0
 INTERVAL = 0.5
@@ -28,16 +33,53 @@
 class PatchedMongoDBAtlasVectorSearch(MongoDBAtlasVectorSearch):
     def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
         """Patched insert_texts that waits for data to be indexed before returning"""
         ids = super()._insert_texts(texts, metadatas)
-        timeout = TIMEOUT
-        while len(ids) != self.similarity_search("sandwich") and timeout >= 0:
+        start = monotonic()
+        while len(ids) != len(self.similarity_search("sandwich")) and (
+            monotonic() - start <= TIMEOUT
+        ):
             sleep(INTERVAL)
-            timeout -= INTERVAL
         return ids
 
+    def create_vector_search_index(
+        self,
+        dimensions: int,
+        filters: Optional[List[Dict[str, str]]] = None,
+        update: bool = False,
+    ) -> None:
+        result = super().create_vector_search_index(
+            dimensions=dimensions, filters=filters, update=update
+        )
+        start = monotonic()
+        while monotonic() - start <= TIMEOUT:
+            if indexes := list(
+                self._collection.list_search_indexes(name=self._index_name)
+            ):
+                if indexes[0].get("status") == "READY":
+                    return result
+            sleep(INTERVAL)
 
-def get_collection() -> Collection:
+        raise TimeoutError(f"{self._index_name} never reached 'status: READY'")
+
+
+def _await_index_deletion(coll: Collection, index_name: str) -> None:
+    start = monotonic()
+    try:
+        drop_vector_search_index(coll, index_name)
+    except OperationFailure:
+        # This most likely means an ongoing drop request was made so skip
+        pass
+
+    while list(coll.list_search_indexes(name=index_name)):
+        if monotonic() - start > TIMEOUT:
+            raise TimeoutError(f"Index Name: {index_name} never dropped")
+        sleep(INTERVAL)
+
+
+def get_collection(
+    database_name: str = DB_NAME, collection_name: str = COLLECTION_NAME
+) -> Collection:
     test_client: MongoClient = MongoClient(CONNECTION_STRING)
-    return test_client[DB_NAME][COLLECTION_NAME]
+    return test_client[database_name][collection_name]
 
 
 @pytest.fixture()
@@ -45,6 +87,11 @@
 def collection() -> Collection:
     return get_collection()
 
 
+@pytest.fixture()
+def index_collection() -> Collection:
+    return get_collection(INDEX_DB_NAME, INDEX_COLLECTION_NAME)
+
+
 class TestMongoDBAtlasVectorSearch:
     @classmethod
     def setup_class(cls) -> None:
@@ -65,6 +112,11 @@ class TestMongoDBAtlasVectorSearch:
         # delete all the documents in the collection
         collection.delete_many({})  # type: ignore[index]
 
+        # delete all indexes on index collection name
+        _await_index_deletion(
+            get_collection(INDEX_DB_NAME, INDEX_COLLECTION_NAME), INDEX_CREATION_NAME
+        )
+
     @pytest.fixture
     def embedding_openai(self) -> Embeddings:
         return ConsistentFakeEmbeddings(DIMENSIONS)
@@ -85,7 +137,6 @@
             collection=collection,
             index_name=INDEX_NAME,
         )
-        # sleep(5)  # waits for mongot to update Lucene's index
         output = vectorstore.similarity_search("Sandwich", k=1)
         assert len(output) == 1
         # Check for the presence of the metadata key
@@ -150,7 +201,6 @@
             collection=collection,
             index_name=INDEX_NAME,
         )
-        # sleep(5)  # waits for mongot to update Lucene's index
         output = vectorstore.similarity_search("Sandwich", k=1)
         assert len(output) == 1
@@ -172,7 +222,6 @@
             collection=collection,
             index_name=INDEX_NAME,
         )
-        # sleep(5)  # waits for mongot to update Lucene's index
         output = vectorstore.similarity_search("Sandwich", k=1)
         assert len(output) == 1
         # Check for the presence of the metadata key
@@ -195,7 +244,6 @@
             collection=collection,
             index_name=INDEX_NAME,
         )
-        # sleep(5)  # waits for mongot to update Lucene's index
         output = vectorstore.similarity_search(
             "Sandwich", k=1, pre_filter={"c": {"$lte": 0}}
         )
@@ -209,9 +257,25 @@
             collection=collection,
             index_name=INDEX_NAME,
         )
-        # sleep(5)  # waits for mongot to update Lucene's index
         query = "foo"
         output = vectorstore.max_marginal_relevance_search(query, k=10, lambda_mult=0.1)
         assert len(output) == len(texts)
         assert output[0].page_content == "foo"
         assert output[1].page_content != "foo"
+
+    def test_index_creation(
+        self, embedding_openai: Embeddings, index_collection: Any
+    ) -> None:
+        vectorstore = PatchedMongoDBAtlasVectorSearch(
+            index_collection, embedding_openai, index_name=INDEX_CREATION_NAME
+        )
+        vectorstore.create_vector_search_index(dimensions=1536)
+
+    def test_index_update(
+        self, embedding_openai: Embeddings, index_collection: Any
+    ) -> None:
+        vectorstore = PatchedMongoDBAtlasVectorSearch(
+            index_collection, embedding_openai, index_name=INDEX_CREATION_NAME
+        )
+        vectorstore.create_vector_search_index(dimensions=1536)
+        vectorstore.create_vector_search_index(dimensions=1536, update=True)
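For reference, here is a minimal sketch of how the standalone helper added in `langchain_mongodb/index.py` could be called directly with PyMongo, outside of the vector store. The connection string, database/collection names, index name, and the field and filter choices below are placeholders, and an Atlas M10+ cluster that supports driver-side search-index management is assumed.

```python
from pymongo import MongoClient

from langchain_mongodb.index import create_vector_search_index

# Placeholder connection details -- substitute your own Atlas URI and namespace.
client = MongoClient("<ATLAS_CONNECTION_STRING>")
collection = client["<DB_NAME>"]["<COLLECTION_NAME>"]

# Create a vectorSearch index on the collection, mirroring what
# MongoDBAtlasVectorSearch.create_vector_search_index does internally.
create_vector_search_index(
    collection=collection,
    index_name="<INDEX_NAME>",
    dimensions=1536,  # must match the embedding model's output size
    path="embedding",  # field that stores the embedding vector (assumed name)
    similarity="cosine",  # similarity function used by the index
    filters=[{"type": "filter", "path": "page"}],  # optional metadata filter fields
)
```

The same arguments can later be passed to `update_vector_search_index` to modify the definition in place, which is what the vector store's `update=True` path does.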