community[minor]: Add DocumentDBVectorSearch VectorStore (#17757)

**Description:**
- Added Amazon DocumentDB Vector Search integration (HNSW index)
- Added integration tests
- Updated AWS documentation with DocumentDB Vector Search instructions
- Added notebook for DocumentDB integration with example usage

---------

Co-authored-by: EC2 Default User <ec2-user@ip-172-31-95-226.ec2.internal>
This commit is contained in:
Sam Khano
2024-03-06 15:11:34 -08:00
committed by GitHub
parent 51f3902bc4
commit 1b4dcf22f3
7 changed files with 1270 additions and 0 deletions

View File

@@ -192,6 +192,14 @@ def _import_docarray_inmemory() -> Any:
return DocArrayInMemorySearch
def _import_documentdb() -> Any:
from langchain_community.vectorstores.documentdb import (
DocumentDBVectorSearch,
)
return DocumentDBVectorSearch
def _import_elasticsearch() -> Any:
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
@@ -563,6 +571,8 @@ def __getattr__(name: str) -> Any:
return _import_dingo()
elif name == "DocArrayInMemorySearch":
return _import_docarray_inmemory()
elif name == "DocumentDBVectorSearch":
return _import_documentdb()
elif name == "DocArrayHnswSearch":
return _import_docarray_hnsw()
elif name == "ElasticsearchStore":
@@ -754,4 +764,5 @@ __all__ = [
"VectorStore",
"NeuralDBVectorStore",
"Lantern",
"DocumentDBVectorSearch",
]

View File

@@ -0,0 +1,361 @@
from __future__ import annotations
import logging
from enum import Enum
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
Iterable,
List,
Optional,
TypeVar,
Union,
)
from langchain_core.documents import Document
from langchain_core.vectorstores import VectorStore
if TYPE_CHECKING:
from langchain_core.embeddings import Embeddings
from pymongo.collection import Collection
# Before Python 3.11 native StrEnum is not available
class DocumentDBSimilarityType(str, Enum):
"""DocumentDB Similarity Type as enumerator."""
COS = "cosine"
"""Cosine similarity"""
DOT = "dotProduct"
"""Dot product"""
EUC = "euclidean"
"""Euclidean distance"""
DocumentDBDocumentType = TypeVar("DocumentDBDocumentType", bound=Dict[str, Any])
logger = logging.getLogger(__name__)
DEFAULT_INSERT_BATCH_SIZE = 128
class DocumentDBVectorSearch(VectorStore):
"""`Amazon DocumentDB (with MongoDB compatibility)` vector store.
Please refer to the official Vector Search documentation for more details:
https://docs.aws.amazon.com/documentdb/latest/developerguide/vector-search.html
To use, you should have both:
- the ``pymongo`` python package installed
- a connection string and credentials associated with a DocumentDB cluster
Example:
. code-block:: python
from langchain_community.vectorstores import DocumentDBVectorSearch
from langchain_community.embeddings.openai import OpenAIEmbeddings
from pymongo import MongoClient
mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
collection = mongo_client["<db_name>"]["<collection_name>"]
embeddings = OpenAIEmbeddings()
vectorstore = DocumentDBVectorSearch(collection, embeddings)
"""
def __init__(
self,
collection: Collection[DocumentDBDocumentType],
embedding: Embeddings,
*,
index_name: str = "vectorSearchIndex",
text_key: str = "textContent",
embedding_key: str = "vectorContent",
):
"""Constructor for DocumentDBVectorSearch
Args:
collection: MongoDB collection to add the texts to.
embedding: Text embedding model to use.
index_name: Name of the Vector Search index.
text_key: MongoDB field that will contain the text
for each document.
embedding_key: MongoDB field that will contain the embedding
for each document.
"""
self._collection = collection
self._embedding = embedding
self._index_name = index_name
self._text_key = text_key
self._embedding_key = embedding_key
self._similarity_type = DocumentDBSimilarityType.COS
@property
def embeddings(self) -> Embeddings:
return self._embedding
def get_index_name(self) -> str:
"""Returns the index name
Returns:
Returns the index name
"""
return self._index_name
@classmethod
def from_connection_string(
cls,
connection_string: str,
namespace: str,
embedding: Embeddings,
**kwargs: Any,
) -> DocumentDBVectorSearch:
"""Creates an Instance of DocumentDBVectorSearch from a Connection String
Args:
connection_string: The DocumentDB cluster endpoint connection string
namespace: The namespace (database.collection)
embedding: The embedding utility
**kwargs: Dynamic keyword arguments
Returns:
an instance of the vector store
"""
try:
from pymongo import MongoClient
except ImportError:
raise ImportError(
"Could not import pymongo, please install it with "
"`pip install pymongo`."
)
client: MongoClient = MongoClient(connection_string)
db_name, collection_name = namespace.split(".")
collection = client[db_name][collection_name]
return cls(collection, embedding, **kwargs)
def index_exists(self) -> bool:
"""Verifies if the specified index name during instance
construction exists on the collection
Returns:
Returns True on success and False if no such index exists
on the collection
"""
cursor = self._collection.list_indexes()
index_name = self._index_name
for res in cursor:
current_index_name = res.pop("name")
if current_index_name == index_name:
return True
return False
def delete_index(self) -> None:
"""Deletes the index specified during instance construction if it exists"""
if self.index_exists():
self._collection.drop_index(self._index_name)
# Raises OperationFailure on an error (e.g. trying to drop
# an index that does not exist)
def create_index(
self,
dimensions: int = 1536,
similarity: DocumentDBSimilarityType = DocumentDBSimilarityType.COS,
m: int = 16,
ef_construction: int = 64,
) -> dict[str, Any]:
"""Creates an index using the index name specified at
instance construction
Args:
dimensions: Number of dimensions for vector similarity.
The maximum number of supported dimensions is 2000
similarity: Similarity algorithm to use with the HNSW index.
m: Specifies the max number of connections for an HNSW index.
Large impact on memory consumption.
ef_construction: Specifies the size of the dynamic candidate list
for constructing the graph for HNSW index. Higher values lead
to more accurate results but slower indexing speed.
Possible options are:
- DocumentDBSimilarityType.COS (cosine distance),
- DocumentDBSimilarityType.EUC (Euclidean distance), and
- DocumentDBSimilarityType.DOT (dot product).
Returns:
An object describing the created index
"""
self._similarity_type = similarity
# prepare the command
create_index_commands = {
"createIndexes": self._collection.name,
"indexes": [
{
"name": self._index_name,
"key": {self._embedding_key: "vector"},
"vectorOptions": {
"type": "hnsw",
"similarity": similarity,
"dimensions": dimensions,
"m": m,
"efConstruction": ef_construction,
},
}
],
}
# retrieve the database object
current_database = self._collection.database
# invoke the command from the database object
create_index_responses: dict[str, Any] = current_database.command(
create_index_commands
)
return create_index_responses
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[Dict[str, Any]]] = None,
**kwargs: Any,
) -> List:
batch_size = kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE)
_metadatas: Union[List, Generator] = metadatas or ({} for _ in texts)
texts_batch = []
metadatas_batch = []
result_ids = []
for i, (text, metadata) in enumerate(zip(texts, _metadatas)):
texts_batch.append(text)
metadatas_batch.append(metadata)
if (i + 1) % batch_size == 0:
result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
texts_batch = []
metadatas_batch = []
if texts_batch:
result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
return result_ids
def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
"""Used to Load Documents into the collection
Args:
texts: The list of documents strings to load
metadatas: The list of metadata objects associated with each document
Returns:
"""
# If the text is empty, then exit early
if not texts:
return []
# Embed and create the documents
embeddings = self._embedding.embed_documents(texts)
to_insert = [
{self._text_key: t, self._embedding_key: embedding, **m}
for t, m, embedding in zip(texts, metadatas, embeddings)
]
# insert the documents in DocumentDB
insert_result = self._collection.insert_many(to_insert) # type: ignore
return insert_result.inserted_ids
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
collection: Optional[Collection[DocumentDBDocumentType]] = None,
**kwargs: Any,
) -> DocumentDBVectorSearch:
if collection is None:
raise ValueError("Must provide 'collection' named parameter.")
vectorstore = cls(collection, embedding, **kwargs)
vectorstore.add_texts(texts, metadatas=metadatas)
return vectorstore
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
if ids is None:
raise ValueError("No document ids provided to delete.")
for document_id in ids:
self.delete_document_by_id(document_id)
return True
def delete_document_by_id(self, document_id: Optional[str] = None) -> None:
"""Removes a Specific Document by Id
Args:
document_id: The document identifier
"""
try:
from bson.objectid import ObjectId
except ImportError as e:
raise ImportError(
"Unable to import bson, please install with `pip install bson`."
) from e
if document_id is None:
raise ValueError("No document id provided to delete.")
self._collection.delete_one({"_id": ObjectId(document_id)})
def _similarity_search_without_score(
self, embeddings: List[float], k: int = 4, ef_search: int = 40
) -> List[Document]:
"""Returns a list of documents.
Args:
embeddings: The query vector
k: the number of documents to return
ef_search: Specifies the size of the dynamic candidate list
that HNSW index uses during search. A higher value of
efSearch provides better recall at cost of speed.
Returns:
A list of documents closest to the query vector
"""
pipeline: List[dict[str, Any]] = [
{
"$search": {
"vectorSearch": {
"vector": embeddings,
"path": self._embedding_key,
"similarity": self._similarity_type,
"k": k,
"efSearch": ef_search,
}
}
}
]
cursor = self._collection.aggregate(pipeline)
docs = []
for res in cursor:
text = res.pop(self._text_key)
docs.append(Document(page_content=text, metadata=res))
return docs
def similarity_search(
self,
query: str,
k: int = 4,
ef_search: int = 40,
**kwargs: Any,
) -> List[Document]:
embeddings = self._embedding.embed_query(query)
docs = self._similarity_search_without_score(
embeddings=embeddings, k=k, ef_search=ef_search
)
return [doc for doc in docs]

View File

@@ -0,0 +1,390 @@
"""Test DocumentDBVectorSearch functionality."""
import logging
import os
from time import sleep
from typing import Any, Optional
import pytest
from langchain_core.documents import Document
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores.documentdb import (
DocumentDBSimilarityType,
DocumentDBVectorSearch,
)
logging.basicConfig(level=logging.DEBUG)
model_deployment = os.getenv(
"OPENAI_EMBEDDINGS_DEPLOYMENT", "smart-agent-embedding-ada"
)
model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
INDEX_NAME = "langchain-test-index"
NAMESPACE = "langchain_test_db.langchain_test_collection"
CONNECTION_STRING = os.getenv("DOCUMENTDB_URI", "")
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
dimensions = 1536
similarity_algorithm = DocumentDBSimilarityType.COS
def prepare_collection() -> Any:
from pymongo import MongoClient
test_client: MongoClient = MongoClient(CONNECTION_STRING)
return test_client[DB_NAME][COLLECTION_NAME]
@pytest.fixture()
def collection() -> Any:
return prepare_collection()
@pytest.fixture()
def embedding_openai() -> Any:
openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
deployment=model_deployment, model=model_name, chunk_size=1
)
return openai_embeddings
"""
This is how to run the integration tests:
cd libs/community
make test TEST_FILE=tests/integration_tests/vectorstores/test_documentdb.py
NOTE: You will first need to follow the contributor setup steps:
https://python.langchain.com/docs/contributing/code. You will also need to install
`pymongo` via `poetry`. You can also run the test directly using `pytest`, but please
make sure to install all dependencies.
"""
class TestDocumentDBVectorSearch:
@classmethod
def setup_class(cls) -> None:
if not os.getenv("OPENAI_API_KEY"):
raise ValueError("OPENAI_API_KEY environment variable is not set")
# insure the test collection is empty
collection = prepare_collection()
assert collection.count_documents({}) == 0 # type: ignore[index] # noqa: E501
@classmethod
def teardown_class(cls) -> None:
collection = prepare_collection()
# delete all the documents in the collection
collection.delete_many({}) # type: ignore[index]
collection.drop_indexes()
@pytest.fixture(autouse=True)
def setup(self) -> None:
collection = prepare_collection()
# delete all the documents in the collection
collection.delete_many({}) # type: ignore[index]
collection.drop_indexes()
def test_from_documents_cosine_distance(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
"""Test end to end construction and search."""
documents = [
Document(page_content="Dogs are tough.", metadata={"a": 1}),
Document(page_content="Cats have fluff.", metadata={"b": 1}),
Document(page_content="What is a sandwich?", metadata={"c": 1}),
Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
]
vectorstore = DocumentDBVectorSearch.from_documents(
documents,
embedding_openai,
collection=collection,
index_name=INDEX_NAME,
)
sleep(1) # waits for DocumentDB to save contents to the collection
# Create the HNSW index that will be leveraged later for vector search
vectorstore.create_index(dimensions, similarity_algorithm)
sleep(2) # waits for the index to be set up
output = vectorstore.similarity_search("Sandwich", k=1)
assert output
assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1
vectorstore.delete_index()
def test_from_documents_inner_product(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
"""Test end to end construction and search."""
documents = [
Document(page_content="Dogs are tough.", metadata={"a": 1}),
Document(page_content="Cats have fluff.", metadata={"b": 1}),
Document(page_content="What is a sandwich?", metadata={"c": 1}),
Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
]
vectorstore = DocumentDBVectorSearch.from_documents(
documents,
embedding_openai,
collection=collection,
index_name=INDEX_NAME,
)
sleep(1) # waits for DocumentDB to save contents to the collection
# Create the HNSW index that will be leveraged later for vector search
vectorstore.create_index(dimensions, DocumentDBSimilarityType.DOT)
sleep(2) # waits for the index to be set up
output = vectorstore.similarity_search("Sandwich", k=1, ef_search=100)
assert output
assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1
vectorstore.delete_index()
def test_from_texts_cosine_distance(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
"What is a sandwich?",
"That fence is purple.",
]
vectorstore = DocumentDBVectorSearch.from_texts(
texts,
embedding_openai,
collection=collection,
index_name=INDEX_NAME,
)
# Create the HNSW index that will be leveraged later for vector search
vectorstore.create_index(dimensions, similarity_algorithm)
sleep(2) # waits for the index to be set up
output = vectorstore.similarity_search("Sandwich", k=1)
assert output[0].page_content == "What is a sandwich?"
vectorstore.delete_index()
def test_from_texts_with_metadatas_cosine_distance(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
"What is a sandwich?",
"The fence is purple.",
]
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
vectorstore = DocumentDBVectorSearch.from_texts(
texts,
embedding_openai,
metadatas=metadatas,
collection=collection,
index_name=INDEX_NAME,
)
# Create the HNSW index that will be leveraged later for vector search
vectorstore.create_index(dimensions, similarity_algorithm)
sleep(2) # waits for the index to be set up
output = vectorstore.similarity_search("Sandwich", k=1)
assert output
assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1
vectorstore.delete_index()
def test_from_texts_with_metadatas_delete_one(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
"What is a sandwich?",
"The fence is purple.",
]
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
vectorstore = DocumentDBVectorSearch.from_texts(
texts,
embedding_openai,
metadatas=metadatas,
collection=collection,
index_name=INDEX_NAME,
)
# Create the HNSW index that will be leveraged later for vector search
vectorstore.create_index(dimensions, similarity_algorithm)
sleep(2) # waits for the index to be set up
output = vectorstore.similarity_search("Sandwich", k=1)
assert output
assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1
first_document_id_object = output[0].metadata["_id"]
first_document_id = str(first_document_id_object)
vectorstore.delete_document_by_id(first_document_id)
sleep(2) # waits for the index to be updated
output2 = vectorstore.similarity_search("Sandwich", k=1, ef_search=10)
assert output2
assert output2[0].page_content != "What is a sandwich?"
vectorstore.delete_index()
def test_from_texts_with_metadatas_delete_multiple(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
"What is a sandwich?",
"The fence is purple.",
]
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
vectorstore = DocumentDBVectorSearch.from_texts(
texts,
embedding_openai,
metadatas=metadatas,
collection=collection,
index_name=INDEX_NAME,
)
# Create the HNSW index that will be leveraged later for vector search
vectorstore.create_index(dimensions, similarity_algorithm)
sleep(2) # waits for the index to be set up
output = vectorstore.similarity_search("Sandwich", k=5)
first_document_id_object = output[0].metadata["_id"]
first_document_id = str(first_document_id_object)
output[1].metadata["_id"]
second_document_id = output[1].metadata["_id"]
output[2].metadata["_id"]
third_document_id = output[2].metadata["_id"]
document_ids = [first_document_id, second_document_id, third_document_id]
vectorstore.delete(document_ids)
sleep(2) # waits for the index to be updated
output_2 = vectorstore.similarity_search("Sandwich", k=5)
assert output
assert output_2
assert len(output) == 4 # we should see all the four documents
assert (
len(output_2) == 1
) # we should see only one document left after three have been deleted
vectorstore.delete_index()
def test_from_texts_with_metadatas_inner_product(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
"What is a sandwich?",
"The fence is purple.",
]
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
vectorstore = DocumentDBVectorSearch.from_texts(
texts,
embedding_openai,
metadatas=metadatas,
collection=collection,
index_name=INDEX_NAME,
)
# Create the HNSW index that will be leveraged later for vector search
vectorstore.create_index(dimensions, DocumentDBSimilarityType.DOT)
sleep(2) # waits for the index to be set up
output = vectorstore.similarity_search("Sandwich", k=1)
assert output
assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1
vectorstore.delete_index()
def test_from_texts_with_metadatas_euclidean_distance(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
"What is a sandwich?",
"The fence is purple.",
]
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
vectorstore = DocumentDBVectorSearch.from_texts(
texts,
embedding_openai,
metadatas=metadatas,
collection=collection,
index_name=INDEX_NAME,
)
# Create the HNSW index that will be leveraged later for vector search
vectorstore.create_index(dimensions, DocumentDBSimilarityType.EUC)
sleep(2) # waits for the index to be set up
output = vectorstore.similarity_search("Sandwich", k=1)
assert output
assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1
vectorstore.delete_index()
def invoke_delete_with_no_args(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> Optional[bool]:
vectorstore: DocumentDBVectorSearch = (
DocumentDBVectorSearch.from_connection_string(
CONNECTION_STRING,
NAMESPACE,
embedding_openai,
index_name=INDEX_NAME,
)
)
return vectorstore.delete()
def invoke_delete_by_id_with_no_args(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
vectorstore: DocumentDBVectorSearch = (
DocumentDBVectorSearch.from_connection_string(
CONNECTION_STRING,
NAMESPACE,
embedding_openai,
index_name=INDEX_NAME,
)
)
vectorstore.delete_document_by_id()
def test_invalid_arguments_to_delete(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
with pytest.raises(ValueError) as exception_info:
self.invoke_delete_with_no_args(embedding_openai, collection)
assert str(exception_info.value) == "No document ids provided to delete."
def test_no_arguments_to_delete_by_id(
self, embedding_openai: OpenAIEmbeddings, collection: Any
) -> None:
with pytest.raises(Exception) as exception_info:
self.invoke_delete_by_id_with_no_args(embedding_openai, collection)
assert str(exception_info.value) == "No document id provided to delete."

View File

@@ -57,6 +57,7 @@ def test_compatible_vectorstore_documentation() -> None:
"DatabricksVectorSearch",
"DeepLake",
"Dingo",
"DocumentDBVectorSearch",
"ElasticVectorSearch",
"ElasticsearchStore",
"FAISS",

View File

@@ -24,6 +24,7 @@ _EXPECTED = [
"DistanceStrategy",
"DocArrayHnswSearch",
"DocArrayInMemorySearch",
"DocumentDBVectorSearch",
"ElasticKnnSearch",
"ElasticVectorSearch",
"ElasticsearchStore",