mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-05 13:06:03 +00:00
community[minor]: Add DocumentDBVectorSearch VectorStore (#17757)
**Description:** - Added Amazon DocumentDB Vector Search integration (HNSW index) - Added integration tests - Updated AWS documentation with DocumentDB Vector Search instructions - Added notebook for DocumentDB integration with example usage --------- Co-authored-by: EC2 Default User <ec2-user@ip-172-31-95-226.ec2.internal>
This commit is contained in:
361
libs/community/langchain_community/vectorstores/documentdb.py
Normal file
361
libs/community/langchain_community/vectorstores/documentdb.py
Normal file
@@ -0,0 +1,361 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from enum import Enum
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Generator,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from pymongo.collection import Collection
|
||||
|
||||
|
||||
# Before Python 3.11 native StrEnum is not available
|
||||
class DocumentDBSimilarityType(str, Enum):
|
||||
"""DocumentDB Similarity Type as enumerator."""
|
||||
|
||||
COS = "cosine"
|
||||
"""Cosine similarity"""
|
||||
DOT = "dotProduct"
|
||||
"""Dot product"""
|
||||
EUC = "euclidean"
|
||||
"""Euclidean distance"""
|
||||
|
||||
|
||||
DocumentDBDocumentType = TypeVar("DocumentDBDocumentType", bound=Dict[str, Any])
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_INSERT_BATCH_SIZE = 128
|
||||
|
||||
|
||||
class DocumentDBVectorSearch(VectorStore):
|
||||
"""`Amazon DocumentDB (with MongoDB compatibility)` vector store.
|
||||
Please refer to the official Vector Search documentation for more details:
|
||||
https://docs.aws.amazon.com/documentdb/latest/developerguide/vector-search.html
|
||||
|
||||
To use, you should have both:
|
||||
- the ``pymongo`` python package installed
|
||||
- a connection string and credentials associated with a DocumentDB cluster
|
||||
|
||||
Example:
|
||||
. code-block:: python
|
||||
|
||||
from langchain_community.vectorstores import DocumentDBVectorSearch
|
||||
from langchain_community.embeddings.openai import OpenAIEmbeddings
|
||||
from pymongo import MongoClient
|
||||
|
||||
mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
|
||||
collection = mongo_client["<db_name>"]["<collection_name>"]
|
||||
embeddings = OpenAIEmbeddings()
|
||||
vectorstore = DocumentDBVectorSearch(collection, embeddings)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
collection: Collection[DocumentDBDocumentType],
|
||||
embedding: Embeddings,
|
||||
*,
|
||||
index_name: str = "vectorSearchIndex",
|
||||
text_key: str = "textContent",
|
||||
embedding_key: str = "vectorContent",
|
||||
):
|
||||
"""Constructor for DocumentDBVectorSearch
|
||||
|
||||
Args:
|
||||
collection: MongoDB collection to add the texts to.
|
||||
embedding: Text embedding model to use.
|
||||
index_name: Name of the Vector Search index.
|
||||
text_key: MongoDB field that will contain the text
|
||||
for each document.
|
||||
embedding_key: MongoDB field that will contain the embedding
|
||||
for each document.
|
||||
"""
|
||||
self._collection = collection
|
||||
self._embedding = embedding
|
||||
self._index_name = index_name
|
||||
self._text_key = text_key
|
||||
self._embedding_key = embedding_key
|
||||
self._similarity_type = DocumentDBSimilarityType.COS
|
||||
|
||||
@property
|
||||
def embeddings(self) -> Embeddings:
|
||||
return self._embedding
|
||||
|
||||
def get_index_name(self) -> str:
|
||||
"""Returns the index name
|
||||
|
||||
Returns:
|
||||
Returns the index name
|
||||
|
||||
"""
|
||||
return self._index_name
|
||||
|
||||
@classmethod
|
||||
def from_connection_string(
|
||||
cls,
|
||||
connection_string: str,
|
||||
namespace: str,
|
||||
embedding: Embeddings,
|
||||
**kwargs: Any,
|
||||
) -> DocumentDBVectorSearch:
|
||||
"""Creates an Instance of DocumentDBVectorSearch from a Connection String
|
||||
|
||||
Args:
|
||||
connection_string: The DocumentDB cluster endpoint connection string
|
||||
namespace: The namespace (database.collection)
|
||||
embedding: The embedding utility
|
||||
**kwargs: Dynamic keyword arguments
|
||||
|
||||
Returns:
|
||||
an instance of the vector store
|
||||
|
||||
"""
|
||||
try:
|
||||
from pymongo import MongoClient
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Could not import pymongo, please install it with "
|
||||
"`pip install pymongo`."
|
||||
)
|
||||
client: MongoClient = MongoClient(connection_string)
|
||||
db_name, collection_name = namespace.split(".")
|
||||
collection = client[db_name][collection_name]
|
||||
return cls(collection, embedding, **kwargs)
|
||||
|
||||
def index_exists(self) -> bool:
|
||||
"""Verifies if the specified index name during instance
|
||||
construction exists on the collection
|
||||
|
||||
Returns:
|
||||
Returns True on success and False if no such index exists
|
||||
on the collection
|
||||
"""
|
||||
cursor = self._collection.list_indexes()
|
||||
index_name = self._index_name
|
||||
|
||||
for res in cursor:
|
||||
current_index_name = res.pop("name")
|
||||
if current_index_name == index_name:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def delete_index(self) -> None:
|
||||
"""Deletes the index specified during instance construction if it exists"""
|
||||
if self.index_exists():
|
||||
self._collection.drop_index(self._index_name)
|
||||
# Raises OperationFailure on an error (e.g. trying to drop
|
||||
# an index that does not exist)
|
||||
|
||||
def create_index(
|
||||
self,
|
||||
dimensions: int = 1536,
|
||||
similarity: DocumentDBSimilarityType = DocumentDBSimilarityType.COS,
|
||||
m: int = 16,
|
||||
ef_construction: int = 64,
|
||||
) -> dict[str, Any]:
|
||||
"""Creates an index using the index name specified at
|
||||
instance construction
|
||||
|
||||
Args:
|
||||
dimensions: Number of dimensions for vector similarity.
|
||||
The maximum number of supported dimensions is 2000
|
||||
|
||||
similarity: Similarity algorithm to use with the HNSW index.
|
||||
|
||||
m: Specifies the max number of connections for an HNSW index.
|
||||
Large impact on memory consumption.
|
||||
|
||||
ef_construction: Specifies the size of the dynamic candidate list
|
||||
for constructing the graph for HNSW index. Higher values lead
|
||||
to more accurate results but slower indexing speed.
|
||||
|
||||
Possible options are:
|
||||
- DocumentDBSimilarityType.COS (cosine distance),
|
||||
- DocumentDBSimilarityType.EUC (Euclidean distance), and
|
||||
- DocumentDBSimilarityType.DOT (dot product).
|
||||
|
||||
Returns:
|
||||
An object describing the created index
|
||||
|
||||
"""
|
||||
self._similarity_type = similarity
|
||||
|
||||
# prepare the command
|
||||
create_index_commands = {
|
||||
"createIndexes": self._collection.name,
|
||||
"indexes": [
|
||||
{
|
||||
"name": self._index_name,
|
||||
"key": {self._embedding_key: "vector"},
|
||||
"vectorOptions": {
|
||||
"type": "hnsw",
|
||||
"similarity": similarity,
|
||||
"dimensions": dimensions,
|
||||
"m": m,
|
||||
"efConstruction": ef_construction,
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
# retrieve the database object
|
||||
current_database = self._collection.database
|
||||
|
||||
# invoke the command from the database object
|
||||
create_index_responses: dict[str, Any] = current_database.command(
|
||||
create_index_commands
|
||||
)
|
||||
|
||||
return create_index_responses
|
||||
|
||||
def add_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[Dict[str, Any]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List:
|
||||
batch_size = kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE)
|
||||
_metadatas: Union[List, Generator] = metadatas or ({} for _ in texts)
|
||||
texts_batch = []
|
||||
metadatas_batch = []
|
||||
result_ids = []
|
||||
for i, (text, metadata) in enumerate(zip(texts, _metadatas)):
|
||||
texts_batch.append(text)
|
||||
metadatas_batch.append(metadata)
|
||||
if (i + 1) % batch_size == 0:
|
||||
result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
|
||||
texts_batch = []
|
||||
metadatas_batch = []
|
||||
if texts_batch:
|
||||
result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
|
||||
return result_ids
|
||||
|
||||
def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
|
||||
"""Used to Load Documents into the collection
|
||||
|
||||
Args:
|
||||
texts: The list of documents strings to load
|
||||
metadatas: The list of metadata objects associated with each document
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# If the text is empty, then exit early
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
# Embed and create the documents
|
||||
embeddings = self._embedding.embed_documents(texts)
|
||||
to_insert = [
|
||||
{self._text_key: t, self._embedding_key: embedding, **m}
|
||||
for t, m, embedding in zip(texts, metadatas, embeddings)
|
||||
]
|
||||
# insert the documents in DocumentDB
|
||||
insert_result = self._collection.insert_many(to_insert) # type: ignore
|
||||
return insert_result.inserted_ids
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
cls,
|
||||
texts: List[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
collection: Optional[Collection[DocumentDBDocumentType]] = None,
|
||||
**kwargs: Any,
|
||||
) -> DocumentDBVectorSearch:
|
||||
if collection is None:
|
||||
raise ValueError("Must provide 'collection' named parameter.")
|
||||
vectorstore = cls(collection, embedding, **kwargs)
|
||||
vectorstore.add_texts(texts, metadatas=metadatas)
|
||||
return vectorstore
|
||||
|
||||
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
|
||||
if ids is None:
|
||||
raise ValueError("No document ids provided to delete.")
|
||||
|
||||
for document_id in ids:
|
||||
self.delete_document_by_id(document_id)
|
||||
return True
|
||||
|
||||
def delete_document_by_id(self, document_id: Optional[str] = None) -> None:
|
||||
"""Removes a Specific Document by Id
|
||||
|
||||
Args:
|
||||
document_id: The document identifier
|
||||
"""
|
||||
try:
|
||||
from bson.objectid import ObjectId
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"Unable to import bson, please install with `pip install bson`."
|
||||
) from e
|
||||
if document_id is None:
|
||||
raise ValueError("No document id provided to delete.")
|
||||
|
||||
self._collection.delete_one({"_id": ObjectId(document_id)})
|
||||
|
||||
def _similarity_search_without_score(
|
||||
self, embeddings: List[float], k: int = 4, ef_search: int = 40
|
||||
) -> List[Document]:
|
||||
"""Returns a list of documents.
|
||||
|
||||
Args:
|
||||
embeddings: The query vector
|
||||
k: the number of documents to return
|
||||
ef_search: Specifies the size of the dynamic candidate list
|
||||
that HNSW index uses during search. A higher value of
|
||||
efSearch provides better recall at cost of speed.
|
||||
|
||||
Returns:
|
||||
A list of documents closest to the query vector
|
||||
"""
|
||||
pipeline: List[dict[str, Any]] = [
|
||||
{
|
||||
"$search": {
|
||||
"vectorSearch": {
|
||||
"vector": embeddings,
|
||||
"path": self._embedding_key,
|
||||
"similarity": self._similarity_type,
|
||||
"k": k,
|
||||
"efSearch": ef_search,
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
cursor = self._collection.aggregate(pipeline)
|
||||
|
||||
docs = []
|
||||
|
||||
for res in cursor:
|
||||
text = res.pop(self._text_key)
|
||||
docs.append(Document(page_content=text, metadata=res))
|
||||
|
||||
return docs
|
||||
|
||||
def similarity_search(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
ef_search: int = 40,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
embeddings = self._embedding.embed_query(query)
|
||||
docs = self._similarity_search_without_score(
|
||||
embeddings=embeddings, k=k, ef_search=ef_search
|
||||
)
|
||||
return [doc for doc in docs]
|
Reference in New Issue
Block a user