LangChain-MongoDB: [Experimental] Driver-side index creation helper (#19359)

## Description
Created a helper method to make vector search indexes via client-side
pymongo.

**Recent Update** -- Removed the error-suppressing/overwriting layer in
favor of letting the original exception surface with its own information.

## ToDo's
- [x] Add `_wait_until` helpers for the delete-index functionality in the
integration tests.
- [x] Add documentation for its use. Highlight that it is experimental.
- [x] Post Integration Test Results in a screenshot
- [x] Get review from MongoDB internal team (@shaneharvey, @blink1073 ,
@NoahStapp , @caseyclements)



- [x] **Add tests and docs**: If you're adding a new integration, please
include
1. New integration tests were added. The change is not eligible for unit
testing since the operation is Atlas Cloud specific.
2. An example notebook showing its use. It lives in the
`docs/docs/integrations` directory.

![image](https://github.com/langchain-ai/langchain/assets/2887713/a3fc8ee1-e04c-4976-accc-fea0eeae028a)


- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/
This commit is contained in:
Jib
2024-06-26 15:07:28 -04:00
committed by GitHub
parent b1dfb8ea1e
commit c0fcf76e93
6 changed files with 263 additions and 13 deletions

View File

@@ -0,0 +1,105 @@
import logging
from typing import Any, Dict, List, Optional
from pymongo.collection import Collection
from pymongo.operations import SearchIndexModel
# Name the logger after the module's import path rather than its file path so
# it takes part in the standard dotted logger hierarchy and can be configured
# through its parent package logger.
logger = logging.getLogger(__name__)
def _vector_search_index_definition(
    dimensions: int,
    path: str,
    similarity: str,
    filters: Optional[List[Dict[str, str]]],
) -> Dict[str, Any]:
    """Build the definition document for a vector search index.

    Args:
        dimensions (int): Value stored under ``numDimensions``.
        path (str): Value stored under ``path``.
        similarity (str): Value stored under ``similarity``.
        filters (Optional[List[Dict[str, str]]]): Extra field entries placed
            after the vector field. ``None`` or an empty list adds nothing.

    Returns:
        Dict[str, Any]: A dict with a single ``"fields"`` key whose list
        starts with the vector field entry, followed by any filter entries.
    """
    vector_field: Dict[str, Any] = {
        "numDimensions": dimensions,
        "path": path,
        "similarity": similarity,
        "type": "vector",
    }
    fields: List[Dict[str, Any]] = [vector_field]
    if filters:
        # Filter entries always come after the vector field entry.
        fields.extend(filters)
    return {"fields": fields}
def create_vector_search_index(
    collection: Collection,
    index_name: str,
    dimensions: int,
    path: str,
    similarity: str,
    filters: List[Dict[str, str]],
) -> None:
    """Experimental utility that creates a vector search index.

    Args:
        collection (Collection): MongoDB collection to create the index on.
        index_name (str): Name given to the new index.
        dimensions (int): Number of dimensions in the embedding.
        path (str): Field that holds the vector embedding.
        similarity (str): Similarity score used for the index.
        filters (List[Dict[str, str]]): Additional filter entries for the
            index definition.
    """
    logger.info("Creating Search Index %s on %s", index_name, collection.name)

    index_definition = _vector_search_index_definition(
        dimensions=dimensions,
        path=path,
        similarity=similarity,
        filters=filters,
    )
    search_index = SearchIndexModel(
        definition=index_definition,
        name=index_name,
        type="vectorSearch",
    )

    # Log whatever the driver hands back from the create call.
    created = collection.create_search_index(search_index)
    logger.info(created)
def drop_vector_search_index(collection: Collection, index_name: str) -> None:
    """Drop a previously created vector search index.

    Args:
        collection (Collection): MongoDB collection that owns the index.
        index_name (str): Name of the search index to drop.
    """
    start_message = "Dropping Search Index %s from Collection: %s"
    logger.info(start_message, index_name, collection.name)

    collection.drop_search_index(index_name)

    done_message = "Vector Search index %s.%s dropped"
    logger.info(done_message, collection.name, index_name)
def update_vector_search_index(
    collection: Collection,
    index_name: str,
    dimensions: int,
    path: str,
    similarity: str,
    filters: List[Dict[str, str]],
) -> None:
    """Replace the definition of an existing search index.

    Delegates to the collection's ``update_search_index`` call.

    Args:
        collection (Collection): MongoDB collection that owns the index.
        index_name (str): Name of the index to update.
        dimensions (int): Number of dimensions in the embedding.
        path (str): Field that holds the vector embedding.
        similarity (str): Similarity score used for the index.
        filters (List[Dict[str, str]]): Additional filter entries for the
            index definition.
    """
    logger.info(
        "Updating Search Index %s from Collection: %s", index_name, collection.name
    )

    new_definition = _vector_search_index_definition(
        dimensions=dimensions,
        path=path,
        similarity=similarity,
        filters=filters,
    )
    collection.update_search_index(name=index_name, definition=new_definition)

    logger.info("Update succeeded")

View File

@@ -18,6 +18,11 @@ logger = logging.getLogger(__name__)
# Accepted matrix inputs: nested lists of floats, a list of NumPy arrays, or a
# single NumPy array.
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
class FailCode:
    """Named integer constants for index-related failure codes."""

    # Code for "index not found".
    INDEX_NOT_FOUND = 27

    # Code for "index already exists".
    INDEX_ALREADY_EXISTS = 68
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
"""Row-wise cosine similarity between two equal-width matrices."""
if len(X) == 0 or len(Y) == 0:

View File

@@ -24,7 +24,12 @@ from langchain_core.vectorstores import VectorStore
from pymongo import MongoClient
from pymongo.collection import Collection
from pymongo.driver_info import DriverInfo
from pymongo.errors import CollectionInvalid
from langchain_mongodb.index import (
create_vector_search_index,
update_vector_search_index,
)
from langchain_mongodb.utils import maximal_marginal_relevance
MongoDBDocumentType = TypeVar("MongoDBDocumentType", bound=Dict[str, Any])
@@ -489,3 +494,42 @@ class MongoDBAtlasVectorSearch(VectorStore):
lambda_mult=lambda_mult,
**kwargs,
)
def create_vector_search_index(
    self,
    dimensions: int,
    filters: Optional[List[Dict[str, str]]] = None,
    update: bool = False,
) -> None:
    """Create, or update, the Atlas vectorSearch index for this VectorStore.

    **Note**: This method may fail, as it requires a MongoDB Atlas
    deployment with these prerequisites:
        - M10 cluster or higher
        - https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#prerequisites

    Args:
        dimensions (int): Number of dimensions in the embedding.
        filters (Optional[List[Dict[str, str]]], optional): Additional
            filter entries for the index definition. Defaults to None,
            which is passed on as an empty list.
        update (bool, optional): When True, update the existing
            vectorSearch index instead of creating a new one.
            Defaults to False.
    """
    target = self._collection

    try:
        target.database.create_collection(target.name)
    except CollectionInvalid:
        # Deliberate best-effort: presumably the collection already exists,
        # in which case there is nothing to do before touching the index.
        pass

    # The bare name resolves to the module-level helper, not to this method.
    if update:
        index_operation = update_vector_search_index
    else:
        index_operation = create_vector_search_index

    index_operation(
        collection=target,
        index_name=self._index_name,
        dimensions=dimensions,
        path=self._embedding_key,
        similarity=self._relevance_score_fn,
        filters=filters or [],
    )