Add Support for Azure Cosmos DB MongoDB vCore Vector Store #11627 (#11632)

This PR adds support for the Azure Cosmos DB MongoDB vCore Vector Store

https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/

https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search

Summary:
- **Description:** added vector store integration for Azure Cosmos DB
MongoDB vCore Vector Store,
  - **Issue:** fixes #11627,
  - **Dependencies:** pymongo,
  - **Tag maintainer:** @hwchase17,
  - **Twitter handle:** @izzyacademy

---------

Co-authored-by: Israel Ekpo <israel.ekpo@gmail.com>
Co-authored-by: Israel Ekpo <44282278+izzyacademy@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Israel Ekpo
2023-10-11 16:56:46 -04:00
committed by GitHub
parent 28ee6a7c12
commit d0603c86b6
6 changed files with 1265 additions and 4 deletions

View File

@@ -37,6 +37,12 @@ def _import_alibaba_cloud_open_search_settings() -> Any:
return AlibabaCloudOpenSearchSettings
def _import_azure_cosmos_db() -> Any:
    """Import ``AzureCosmosDBVectorSearch`` on demand and return the class."""
    # The import runs inside the function, i.e. only when it is called.
    from langchain.vectorstores.azure_cosmos_db import AzureCosmosDBVectorSearch

    return AzureCosmosDBVectorSearch
def _import_elastic_knn_search() -> Any:
from langchain.vectorstores.elastic_vector_search import ElasticKnnSearch
@@ -398,6 +404,8 @@ def __getattr__(name: str) -> Any:
return _import_alibaba_cloud_open_search()
elif name == "AlibabaCloudOpenSearchSettings":
return _import_alibaba_cloud_open_search_settings()
elif name == "AzureCosmosDBVectorSearch":
return _import_azure_cosmos_db()
elif name == "ElasticKnnSearch":
return _import_elastic_knn_search()
elif name == "ElasticVectorSearch":
@@ -588,4 +596,5 @@ __all__ = [
"Zilliz",
"Zilliz",
"TencentVectorDB",
"AzureCosmosDBVectorSearch",
]

View File

@@ -0,0 +1,421 @@
from __future__ import annotations
import logging
from enum import Enum
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
Iterable,
List,
Optional,
Tuple,
TypeVar,
Union,
)
import numpy as np
from langchain.docstore.document import Document
from langchain.vectorstores.base import VectorStore
from langchain.vectorstores.utils import maximal_marginal_relevance
if TYPE_CHECKING:
from pymongo.collection import Collection
from langchain.schema.embeddings import Embeddings
# Before Python 3.11 native StrEnum is not available
class CosmosDBSimilarityType(str, Enum):
    """Similarity metrics that can be passed to ``create_index``.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    COS = "COS"  # cosine similarity
    IP = "IP"  # inner product
    L2 = "L2"  # Euclidean distance
# Type variable for the documents held by the MongoDB collection.
CosmosDBDocumentType = TypeVar("CosmosDBDocumentType", bound=Dict[str, Any])

logger = logging.getLogger(__name__)

# Number of texts embedded and inserted per batch when the caller of
# ``add_texts`` does not pass ``batch_size``.
DEFAULT_INSERT_BATCH_SIZE = 128
class AzureCosmosDBVectorSearch(VectorStore):
"""`Azure Cosmos DB for MongoDB vCore` vector store.
To use, you should have both:
- the ``pymongo`` python package installed
- a connection string associated with a MongoDB VCore Cluster
Example:
.. code-block:: python
from langchain.vectorstores import AzureCosmosDBVectorSearch
from langchain.embeddings.openai import OpenAIEmbeddings
from pymongo import MongoClient
mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
collection = mongo_client["<db_name>"]["<collection_name>"]
embeddings = OpenAIEmbeddings()
vectorstore = AzureCosmosDBVectorSearch(collection, embeddings)
"""
def __init__(
    self,
    collection: Collection[CosmosDBDocumentType],
    embedding: Embeddings,
    *,
    index_name: str = "vectorSearchIndex",
    text_key: str = "textContent",
    embedding_key: str = "vectorContent",
) -> None:
    """Constructor for AzureCosmosDBVectorSearch

    Args:
        collection: MongoDB collection to add the texts to.
        embedding: Text embedding model to use.
        index_name: Name of the vector search index on the collection.
        text_key: MongoDB field that will contain the text
            for each document.
        embedding_key: MongoDB field that will contain the embedding
            for each document.
    """
    self._collection = collection
    self._embedding = embedding
    self._index_name = index_name
    self._text_key = text_key
    self._embedding_key = embedding_key
@property
def embeddings(self) -> Embeddings:
    """Return the embedding model supplied at construction."""
    return self._embedding
def get_index_name(self) -> str:
    """Return the index name given at instance construction.

    Returns:
        The name stored in ``self._index_name``.
    """
    return self._index_name
@classmethod
def from_connection_string(
    cls,
    connection_string: str,
    namespace: str,
    embedding: Embeddings,
    **kwargs: Any,
) -> AzureCosmosDBVectorSearch:
    """Creates an Instance of AzureCosmosDBVectorSearch from a Connection String

    Args:
        connection_string: The MongoDB vCore instance connection string
        namespace: The namespace (database.collection). Only the first
            dot separates the database from the collection, so the
            collection name may itself contain dots.
        embedding: The embedding utility
        **kwargs: Dynamic keyword arguments forwarded to the constructor

    Returns:
        an instance of the vector store

    Raises:
        ValueError: If ``namespace`` is not of the form
            ``database.collection``.
        ImportError: If ``pymongo`` is not installed.
    """
    # Validate the cheap argument first so that a malformed namespace is
    # reported clearly instead of as a tuple-unpacking error.
    db_name, separator, collection_name = namespace.partition(".")
    if not (separator and db_name and collection_name):
        raise ValueError(
            "Expected namespace of the form 'database.collection', "
            f"got {namespace!r}."
        )

    try:
        from pymongo import MongoClient
    except ImportError:
        raise ImportError(
            "Could not import pymongo, please install it with "
            "`pip install pymongo`."
        )

    client: MongoClient = MongoClient(connection_string)
    collection = client[db_name][collection_name]
    return cls(collection, embedding, **kwargs)
def index_exists(self) -> bool:
    """Report whether the collection has an index with the configured name.

    Returns:
        True if one of the collection's indexes is named like the index
        name given at construction, otherwise False.
    """
    wanted = self._index_name
    # ``any`` stops at the first match, like an early return from a loop.
    return any(
        index_spec.pop("name") == wanted
        for index_spec in self._collection.list_indexes()
    )
def delete_index(self) -> None:
    """Drop the index named at construction, doing nothing if it is absent."""
    if not self.index_exists():
        return
    # drop_index raises OperationFailure on an error (e.g. trying to drop
    # an index that does not exist)
    self._collection.drop_index(self._index_name)
def create_index(
    self,
    num_lists: int = 100,
    dimensions: int = 1536,
    similarity: CosmosDBSimilarityType = CosmosDBSimilarityType.COS,
) -> dict[str, Any]:
    """Creates an index using the index name specified at
    instance construction

    The index is created on the field named by the ``embedding_key``
    given at construction, i.e. the same field that inserts write to and
    searches read from.

    Setting the numLists parameter correctly is important for achieving
    good accuracy and performance.
    Since the vector store uses IVF as the indexing strategy,
    you should create the index only after you
    have loaded a large enough sample documents to ensure that the
    centroids for the respective buckets are
    fairly distributed.

    We recommend that numLists is set to documentCount/1000 for up
    to 1 million documents
    and to sqrt(documentCount) for more than 1 million documents.
    As the number of items in your database grows, you should
    tune numLists to be larger
    in order to achieve good latency performance for vector search.

    If you're experimenting with a new scenario or creating a
    small demo, you can start with numLists
    set to 1 to perform a brute-force search across all vectors.
    This should provide you with the most
    accurate results from the vector search, however be aware that
    the search speed and latency will be slow.
    After your initial setup, you should go ahead and tune
    the numLists parameter using the above guidance.

    Args:
        num_lists: This integer is the number of clusters that the
            inverted file (IVF) index uses to group the vector data.
            We recommend that numLists is set to documentCount/1000
            for up to 1 million documents and to sqrt(documentCount)
            for more than 1 million documents.
            Using a numLists value of 1 is akin to performing
            brute-force search, which has limited performance
        dimensions: Number of dimensions for vector similarity.
            The maximum number of supported dimensions is 2000
        similarity: Similarity metric to use with the IVF index.
            Possible options are:
                - CosmosDBSimilarityType.COS (cosine distance),
                - CosmosDBSimilarityType.L2 (Euclidean distance), and
                - CosmosDBSimilarityType.IP (inner product).

    Returns:
        An object describing the created index
    """
    # prepare the command
    create_index_commands = {
        "createIndexes": self._collection.name,
        "indexes": [
            {
                "name": self._index_name,
                # Index the configured embedding field rather than a
                # hard-coded name, so a custom ``embedding_key`` is
                # searchable.
                "key": {self._embedding_key: "cosmosSearch"},
                "cosmosSearchOptions": {
                    "kind": "vector-ivf",
                    "numLists": num_lists,
                    "similarity": similarity,
                    "dimensions": dimensions,
                },
            }
        ],
    }

    # retrieve the database object
    current_database = self._collection.database

    # invoke the command from the database object
    create_index_responses: dict[str, Any] = current_database.command(
        create_index_commands
    )

    return create_index_responses
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[Dict[str, Any]]] = None,
    **kwargs: Any,
) -> List:
    """Embed the texts and insert them into the collection in batches.

    Args:
        texts: The texts to add. Any iterable is accepted, including a
            one-shot generator; it is iterated exactly once.
        metadatas: Optional metadata dicts paired with the texts in
            order. When omitted or empty, each text gets empty metadata.
        **kwargs: ``batch_size`` sets how many texts are inserted per
            batch (``DEFAULT_INSERT_BATCH_SIZE`` when absent or None).

    Returns:
        The ids reported by the inserts, in insertion order.
    """
    batch_size = kwargs.get("batch_size")
    if batch_size is None:
        batch_size = DEFAULT_INSERT_BATCH_SIZE

    # Build the (text, metadata) pairs from a single pass over ``texts``.
    # Deriving the placeholder metadata from a second pass over ``texts``
    # would steal items from a one-shot iterator and drop texts.
    if metadatas:
        pairs: Iterable[Tuple[str, Dict[str, Any]]] = zip(texts, metadatas)
    else:
        pairs = ((text, {}) for text in texts)

    texts_batch: List[str] = []
    metadatas_batch: List[Dict[str, Any]] = []
    result_ids: List = []
    for i, (text, metadata) in enumerate(pairs):
        texts_batch.append(text)
        metadatas_batch.append(metadata)
        if (i + 1) % batch_size == 0:
            result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
            texts_batch = []
            metadatas_batch = []
    # Flush the final, partially filled batch.
    if texts_batch:
        result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
    return result_ids
def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
    """Embed one batch of texts and insert it into the collection.

    Args:
        texts: The list of documents strings to load
        metadatas: The list of metadata objects associated with each document

    Returns:
        The ``inserted_ids`` of the insert result, or an empty list when
        ``texts`` is empty.
    """
    # Nothing to embed or insert.
    if not texts:
        return []

    vectors = self._embedding.embed_documents(texts)

    # One record per text: text and vector first, then the metadata keys
    # on top (a metadata key equal to a reserved key overrides it).
    records = []
    for text, metadata, vector in zip(texts, metadatas, vectors):
        record = {self._text_key: text, self._embedding_key: vector}
        record.update(metadata)
        records.append(record)

    # insert the documents in Cosmos DB
    return self._collection.insert_many(records).inserted_ids  # type: ignore
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    collection: Optional[Collection[CosmosDBDocumentType]] = None,
    **kwargs: Any,
) -> AzureCosmosDBVectorSearch:
    """Build a vector store on ``collection`` and add the given texts to it.

    Args:
        texts: The texts to embed and insert.
        embedding: The embedding model passed to the constructor.
        metadatas: Optional metadata dicts, one per text.
        collection: The collection to use; required despite its default.
        **kwargs: Extra keyword arguments passed to the constructor.

    Returns:
        The newly constructed vector store.

    Raises:
        ValueError: If ``collection`` is None.
    """
    if collection is not None:
        store = cls(collection, embedding, **kwargs)
        store.add_texts(texts, metadatas=metadatas)
        return store
    raise ValueError("Must provide 'collection' named parameter.")
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
    """Delete the documents with the given ids, one at a time.

    Args:
        ids: The document ids to delete.
        **kwargs: Accepted but not used by this method.

    Returns:
        True once every id has been passed to ``delete_document_by_id``.

    Raises:
        ValueError: If ``ids`` is None.
    """
    if ids is not None:
        for doc_id in ids:
            self.delete_document_by_id(doc_id)
        return True
    raise ValueError("No document ids provided to delete.")
def delete_document_by_id(self, document_id: Optional[str] = None) -> None:
    """Removes a Specific Document by Id

    Args:
        document_id: The document identifier

    Raises:
        ValueError: If ``document_id`` is None.
        ImportError: If the ``bson`` module cannot be imported.
    """
    # Check the argument before anything else so that a missing id is
    # always reported as such, whatever is installed.
    if document_id is None:
        raise ValueError("No document id provided to delete.")

    try:
        from bson.objectid import ObjectId
    except ImportError as e:
        # ``bson`` ships as part of pymongo; the separate ``bson``
        # distribution on PyPI is a different, incompatible package.
        raise ImportError(
            "Unable to import bson, please install with `pip install pymongo`."
        ) from e

    self._collection.delete_one({"_id": ObjectId(document_id)})
def _similarity_search_with_score(
    self, embeddings: List[float], k: int = 4
) -> List[Tuple[Document, float]]:
    """Run a vector search and return the matching documents with scores.

    Args:
        embeddings: The query vector
        k: the number of documents to return

    Returns:
        A list of ``(document, score)`` tuples in the order the
        aggregation returns them. Each document's metadata holds every
        stored field except the text field.
    """
    # Stage 1: vector search against the embedding field.
    search_stage: dict[str, Any] = {
        "$search": {
            "cosmosSearch": {
                "vector": embeddings,
                "path": self._embedding_key,
                "k": k,
            },
            "returnStoredSource": True,
        }
    }
    # Stage 2: expose the search score next to the whole stored document.
    project_stage: dict[str, Any] = {
        "$project": {
            "similarityScore": {"$meta": "searchScore"},
            "document": "$$ROOT",
        }
    }
    pipeline: List[dict[str, Any]] = [search_stage, project_stage]

    scored_documents = []
    for row in self._collection.aggregate(pipeline):
        score = row.pop("similarityScore")
        stored = row.pop("document")
        # The text becomes the page content; what remains is the metadata.
        page_content = stored.pop(self._text_key)
        scored_documents.append(
            (Document(page_content=page_content, metadata=stored), score)
        )
    return scored_documents
def similarity_search_with_score(
    self, query: str, k: int = 4
) -> List[Tuple[Document, float]]:
    """Embed ``query`` and return up to ``k`` matching documents with scores.

    Args:
        query: The text to search for.
        k: The number of documents to return.

    Returns:
        A list of ``(document, score)`` tuples.
    """
    query_vector = self._embedding.embed_query(query)
    return self._similarity_search_with_score(embeddings=query_vector, k=k)
def similarity_search(
    self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
    """Return up to ``k`` documents matching ``query``, without scores.

    Args:
        query: The text to search for.
        k: The number of documents to return.
        **kwargs: Accepted but not used by this method.

    Returns:
        The matching documents, in the order the scored search returns them.
    """
    documents = []
    for document, _score in self.similarity_search_with_score(query, k=k):
        documents.append(document)
    return documents
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    **kwargs: Any,
) -> List[Document]:
    """Search by vector and re-rank the candidates with MMR.

    Args:
        embedding: The query vector.
        k: The number of documents to select.
        fetch_k: The number of candidates fetched before re-ranking.
        lambda_mult: Passed through to ``maximal_marginal_relevance``.
        **kwargs: Accepted but not used by this method.

    Returns:
        The selected documents, in the order chosen by the re-ranking.
    """
    # Fetch the candidates together with their similarity scores.
    candidates = self._similarity_search_with_score(embedding, k=fetch_k)

    # The stored vector of each candidate is kept in its metadata.
    candidate_vectors = [
        document.metadata[self._embedding_key] for document, _ in candidates
    ]

    # Re-ranks the docs using MMR
    selected_indexes = maximal_marginal_relevance(
        np.array(embedding),
        candidate_vectors,
        k=k,
        lambda_mult=lambda_mult,
    )
    return [candidates[index][0] for index in selected_indexes]
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    **kwargs: Any,
) -> List[Document]:
    """Embed ``query`` and run the MMR search on the resulting vector.

    Args:
        query: The text to search for.
        k: The number of documents to select.
        fetch_k: The number of candidates fetched before re-ranking.
        lambda_mult: Passed through to the by-vector MMR search.
        **kwargs: Accepted but not used by this method.

    Returns:
        The documents returned by
        ``max_marginal_relevance_search_by_vector``.
    """
    query_vector = self._embedding.embed_query(query)
    return self.max_marginal_relevance_search_by_vector(
        query_vector, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult
    )

View File

@@ -0,0 +1,435 @@
"""Test AzureCosmosDBVectorSearch functionality."""
import logging
import os
from time import sleep
from typing import Any, Generator, Optional, Union
import pytest
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.azure_cosmos_db import (
AzureCosmosDBVectorSearch,
CosmosDBSimilarityType,
)
# Configures root logging at DEBUG level when this module is imported.
logging.basicConfig(level=logging.DEBUG)

# Embedding deployment and model names; each falls back to the default
# shown when its environment variable is not set.
model_deployment = os.getenv(
    "OPENAI_EMBEDDINGS_DEPLOYMENT", "smart-agent-embedding-ada"
)
model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")

# Index name and "database.collection" namespace used by every test.
INDEX_NAME = "langchain-test-index"
NAMESPACE = "langchain_test_db.langchain_test_collection"
# Connection string read from MONGODB_VCORE_URI (empty string when unset).
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")

# Arguments passed to ``create_index`` by the tests.
num_lists = 3
dimensions = 1536
similarity_algorithm = CosmosDBSimilarityType.COS
def prepare_collection() -> Any:
    """Connect with ``CONNECTION_STRING`` and return the test collection."""
    from pymongo import MongoClient

    return MongoClient(CONNECTION_STRING)[DB_NAME][COLLECTION_NAME]
@pytest.fixture()
def collection() -> Any:
    """Fixture returning the test collection from ``prepare_collection``."""
    return prepare_collection()
@pytest.fixture()
def azure_openai_embeddings() -> Any:
    """Fixture returning the embedding model configured for these tests."""
    return OpenAIEmbeddings(
        deployment=model_deployment, model=model_name, chunk_size=1
    )
"""
This is how to run the integration tests:
cd libs/langchain
pytest tests/integration_tests/vectorstores/test_azure_cosmos_db.py
"""
class TestAzureCosmosDBVectorSearch:
@classmethod
def setup_class(cls) -> None:
    """Check the prerequisites once before the tests of this class.

    Raises:
        ValueError: If the OPENAI_API_KEY environment variable is not set.
    """
    if not os.getenv("OPENAI_API_KEY"):
        raise ValueError("OPENAI_API_KEY environment variable is not set")

    # ensure the test collection is empty
    collection = prepare_collection()
    assert collection.count_documents({}) == 0  # type: ignore[index]  # noqa: E501
@classmethod
def teardown_class(cls) -> None:
    """Empty the test collection."""
    collection = prepare_collection()
    # delete all the documents in the collection
    collection.delete_many({})  # type: ignore[index]
@pytest.fixture(autouse=True)
def setup(self) -> None:
    """Autouse fixture that empties the test collection."""
    collection = prepare_collection()
    # delete all the documents in the collection
    collection.delete_many({})  # type: ignore[index]
@pytest.fixture(scope="class", autouse=True)
def cosmos_db_url(self) -> Union[str, Generator[str, None, None]]:
    """Return a fixed placeholder string.

    NOTE(review): the value is not a URL, and no test visible in this file
    requests this fixture by name — confirm whether it is still needed.
    """
    return "805.555.1212"
def test_from_documents_cosine_distance(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Test end to end construction and search.

    Loads four documents, creates the index with the module-level
    similarity setting, and expects the top hit for "Sandwich" to be the
    sandwich document with its metadata.
    """
    documents = [
        Document(page_content="Dogs are tough.", metadata={"a": 1}),
        Document(page_content="Cats have fluff.", metadata={"b": 1}),
        Document(page_content="What is a sandwich?", metadata={"c": 1}),
        Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
    ]

    vectorstore = AzureCosmosDBVectorSearch.from_documents(
        documents,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME,
    )
    sleep(1)  # waits for Cosmos DB to save contents to the collection

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1
    vectorstore.delete_index()
def test_from_documents_inner_product(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Test end to end construction and search.

    Same as the cosine-distance document test, but the index is created
    with ``CosmosDBSimilarityType.IP``.
    """
    documents = [
        Document(page_content="Dogs are tough.", metadata={"a": 1}),
        Document(page_content="Cats have fluff.", metadata={"b": 1}),
        Document(page_content="What is a sandwich?", metadata={"c": 1}),
        Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
    ]

    vectorstore = AzureCosmosDBVectorSearch.from_documents(
        documents,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME,
    )
    sleep(1)  # waits for Cosmos DB to save contents to the collection

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.IP)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1
    vectorstore.delete_index()
def test_from_texts_cosine_distance(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Build the store from plain texts and search it.

    Expects the top hit for "Sandwich" to be the sandwich text.
    """
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "That fence is purple.",
    ]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output[0].page_content == "What is a sandwich?"
    vectorstore.delete_index()
def test_from_texts_with_metadatas_cosine_distance(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Build the store from texts plus metadata and search it.

    Expects the top hit for "Sandwich" to be the sandwich text carrying
    its own metadata.
    """
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1

    vectorstore.delete_index()
def test_from_texts_with_metadatas_delete_one(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Delete the top hit by id and check it no longer comes back.

    After deleting the sandwich document, the same query must still return
    a result, but a different one.
    """
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]

    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1

    # The stored ``_id`` is returned in the metadata; pass it on as a string.
    first_document_id_object = output[0].metadata["_id"]
    first_document_id = str(first_document_id_object)

    vectorstore.delete_document_by_id(first_document_id)
    sleep(2)  # waits for the index to be updated

    output2 = vectorstore.similarity_search("Sandwich", k=1)
    assert output2
    assert output2[0].page_content != "What is a sandwich?"

    vectorstore.delete_index()
def test_from_texts_with_metadatas_delete_multiple(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Delete three of four documents at once and check that one remains."""
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]

    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=5)

    # Check the count before indexing into the result, so a short result
    # fails on this assertion rather than with an IndexError below.
    assert len(output) == 4  # we should see all the four documents

    # ``delete`` is typed to take string ids, so convert every stored id.
    document_ids = [str(document.metadata["_id"]) for document in output[:3]]
    vectorstore.delete(document_ids)
    sleep(2)  # waits for the index to be updated

    output_2 = vectorstore.similarity_search("Sandwich", k=5)
    assert (
        len(output_2) == 1
    )  # we should see only one document left after three have been deleted

    vectorstore.delete_index()
def test_from_texts_with_metadatas_inner_product(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Search texts with metadata using an inner-product index.

    Expects the top hit for "Sandwich" to be the sandwich text carrying
    its own metadata.
    """
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.IP)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1
    vectorstore.delete_index()
def test_from_texts_with_metadatas_euclidean_distance(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Search texts with metadata using a Euclidean-distance (L2) index.

    Expects the top hit for "Sandwich" to be the sandwich text carrying
    its own metadata.
    """
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.L2)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1
    vectorstore.delete_index()
def test_max_marginal_relevance_cosine_distance(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Run an MMR search over near-duplicate texts with a cosine index.

    Expects all four texts back, with "foo" first and a different text
    second.
    """
    texts = ["foo", "foo", "fou", "foy"]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.COS)
    sleep(2)  # waits for the index to be set up

    query = "foo"
    output = vectorstore.max_marginal_relevance_search(query, k=10, lambda_mult=0.1)

    assert len(output) == len(texts)
    assert output[0].page_content == "foo"
    assert output[1].page_content != "foo"
    vectorstore.delete_index()
def test_max_marginal_relevance_inner_product(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Run an MMR search over near-duplicate texts with an inner-product index.

    Expects all four texts back, with "foo" first and a different text
    second.
    """
    texts = ["foo", "foo", "fou", "foy"]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.IP)
    sleep(2)  # waits for the index to be set up

    query = "foo"
    output = vectorstore.max_marginal_relevance_search(query, k=10, lambda_mult=0.1)

    assert len(output) == len(texts)
    assert output[0].page_content == "foo"
    assert output[1].page_content != "foo"
    vectorstore.delete_index()
def invoke_delete_with_no_args(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> Optional[bool]:
    """Helper: build a store from the connection string and call ``delete()``.

    The ``collection`` argument is accepted but not used here.
    """
    vectorstore: AzureCosmosDBVectorSearch = (
        AzureCosmosDBVectorSearch.from_connection_string(
            CONNECTION_STRING,
            NAMESPACE,
            azure_openai_embeddings,
            index_name=INDEX_NAME,
        )
    )

    return vectorstore.delete()
def invoke_delete_by_id_with_no_args(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Helper: build a store and call ``delete_document_by_id()`` with no id.

    The ``collection`` argument is accepted but not used here.
    """
    vectorstore: AzureCosmosDBVectorSearch = (
        AzureCosmosDBVectorSearch.from_connection_string(
            CONNECTION_STRING,
            NAMESPACE,
            azure_openai_embeddings,
            index_name=INDEX_NAME,
        )
    )

    vectorstore.delete_document_by_id()
def test_invalid_arguments_to_delete(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Calling ``delete()`` without ids must raise ValueError with its message."""
    with pytest.raises(ValueError) as exception_info:
        self.invoke_delete_with_no_args(azure_openai_embeddings, collection)
    assert str(exception_info.value) == "No document ids provided to delete."
def test_no_arguments_to_delete_by_id(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Calling ``delete_document_by_id()`` without an id must raise ValueError."""
    # Expect the specific exception type: a bare ``Exception`` would also
    # accept unrelated failures before the message is compared.
    with pytest.raises(ValueError) as exception_info:
        self.invoke_delete_by_id_with_no_args(azure_openai_embeddings, collection)
    assert str(exception_info.value) == "No document id provided to delete."