community: VectorStores: Azure Cosmos DB Mongo vCore with DiskANN (#27329)

# Description
Add a new vector index type, `diskann`, to the Azure Cosmos DB Mongo vCore
vector store. The DiskANN paper is available here: [DiskANN: Fast Accurate
Billion-point Nearest Neighbor Search on a Single
Node](https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf).

## Sample Usage
```python
from pymongo import MongoClient

# INDEX_NAME = "izzy-test-index-2"
# NAMESPACE = "izzy_test_db.izzy_test_collection"
# DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")

client: MongoClient = MongoClient(CONNECTION_STRING)
collection = client[DB_NAME][COLLECTION_NAME]

model_deployment = os.getenv(
    "OPENAI_EMBEDDINGS_DEPLOYMENT", "smart-agent-embedding-ada"
)
model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")

vectorstore = AzureCosmosDBVectorSearch.from_documents(
    docs,
    openai_embeddings,
    collection=collection,
    index_name=INDEX_NAME,
)

# Read more about these variables in detail here: https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search
# DiskANN index settings.
max_degree = 40  # max number of neighbors
dimensions = 1536
similarity_algorithm = CosmosDBSimilarityType.COS
index_kind = CosmosDBVectorSearchType.VECTOR_DISKANN
l_build = 20  # l value used while building the index

# Create the DiskANN vector index on the collection backing the store.
vectorstore.create_index(
    dimensions=dimensions,
    similarity=similarity_algorithm,
    kind=index_kind,
    max_degree=max_degree,
    l_build=l_build,
)
```

## Dependencies
No additional dependencies were added.

---------

Co-authored-by: Yang Qiao (from Dev Box) <yangqiao@microsoft.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
fatmelon 2024-12-12 09:54:04 +08:00 committed by GitHub
parent ba9b95cd23
commit d1e0ec7b55
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 703 additions and 59 deletions

View File

@ -38,9 +38,6 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n",
"Note: you may need to restart the kernel to use updated packages.\n" "Note: you may need to restart the kernel to use updated packages.\n"
] ]
} }
@ -74,7 +71,7 @@
"id": "f2e66b097c6ce2e3", "id": "f2e66b097c6ce2e3",
"metadata": {}, "metadata": {},
"source": [ "source": [
"We want to use `OpenAIEmbeddings` so we need to set up our Azure OpenAI API Key alongside other environment variables. " "We want to use `AzureOpenAIEmbeddings` so we need to set up our Azure OpenAI API Key alongside other environment variables. "
] ]
}, },
{ {
@ -90,15 +87,10 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Set up the OpenAI Environment Variables\n", "# Set up the OpenAI Environment Variables\n",
"os.environ[\"OPENAI_API_TYPE\"] = \"azure\"\n", "\n",
"os.environ[\"OPENAI_API_VERSION\"] = \"2023-05-15\"\n", "os.environ[\"AZURE_OPENAI_API_KEY\"] = \"YOUR_AZURE_OPENAI_API_KEY\"\n",
"os.environ[\"OPENAI_API_BASE\"] = (\n", "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = \"YOUR_AZURE_OPENAI_ENDPOINT\"\n",
" \"YOUR_OPEN_AI_ENDPOINT\" # https://example.openai.azure.com/\n", "os.environ[\"AZURE_OPENAI_API_VERSION\"] = \"2023-05-15\"\n",
")\n",
"os.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY\"\n",
"os.environ[\"OPENAI_EMBEDDINGS_DEPLOYMENT\"] = (\n",
" \"smart-agent-embedding-ada\" # the deployment name for the embedding model\n",
")\n",
"os.environ[\"OPENAI_EMBEDDINGS_MODEL_NAME\"] = \"text-embedding-ada-002\" # the model name" "os.environ[\"OPENAI_EMBEDDINGS_MODEL_NAME\"] = \"text-embedding-ada-002\" # the model name"
] ]
}, },
@ -130,7 +122,7 @@
" CosmosDBSimilarityType,\n", " CosmosDBSimilarityType,\n",
" CosmosDBVectorSearchType,\n", " CosmosDBVectorSearchType,\n",
")\n", ")\n",
"from langchain_openai import OpenAIEmbeddings\n", "from langchain_openai import AzureOpenAIEmbeddings\n",
"from langchain_text_splitters import CharacterTextSplitter\n", "from langchain_text_splitters import CharacterTextSplitter\n",
"\n", "\n",
"SOURCE_FILE_NAME = \"../../how_to/state_of_the_union.txt\"\n", "SOURCE_FILE_NAME = \"../../how_to/state_of_the_union.txt\"\n",
@ -147,14 +139,35 @@
"model_name = os.getenv(\"OPENAI_EMBEDDINGS_MODEL_NAME\", \"text-embedding-ada-002\")\n", "model_name = os.getenv(\"OPENAI_EMBEDDINGS_MODEL_NAME\", \"text-embedding-ada-002\")\n",
"\n", "\n",
"\n", "\n",
"openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings(\n", "openai_embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(\n",
" deployment=model_deployment, model=model_name, chunk_size=1\n", " model=model_name, chunk_size=1\n",
")" ")"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 5,
"id": "f6c6ed80-7b91-4833-bab5-c9b2b5edcdec",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': '../../how_to/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\nWith a duty to one another to the American people to the Constitution. \\n\\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \\n\\nSix days ago, Russias Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \\n\\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \\n\\nHe met the Ukrainian people. \\n\\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "39ae6058c2f7fdf1", "id": "39ae6058c2f7fdf1",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@ -166,14 +179,10 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"{'raw': {'defaultShard': {'numIndexesBefore': 1,\n", "'\\n# DiskANN vectorstore\\nmaxDegree = 40\\ndimensions = 1536\\nsimilarity_algorithm = CosmosDBSimilarityType.COS\\nkind = CosmosDBVectorSearchType.VECTOR_DISKANN\\nlBuild = 20\\n\\nvectorstore.create_index(\\n dimensions=dimensions,\\n similarity=similarity_algorithm,\\n kind=kind ,\\n max_degree=maxDegree,\\n l_build=lBuild,\\n )\\n\\n# -----------------------------------------------------------\\n\\n# HNSW vectorstore\\ndimensions = 1536\\nsimilarity_algorithm = CosmosDBSimilarityType.COS\\nkind = CosmosDBVectorSearchType.VECTOR_HNSW\\nm = 16\\nef_construction = 64\\n\\nvectorstore.create_index(\\n dimensions=dimensions,\\n similarity=similarity_algorithm,\\n kind=kind ,\\n m=m,\\n ef_construction=ef_construction,\\n )\\n'"
" 'numIndexesAfter': 2,\n",
" 'createdCollectionAutomatically': False,\n",
" 'ok': 1}},\n",
" 'ok': 1}"
] ]
}, },
"execution_count": 5, "execution_count": 6,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -212,12 +221,46 @@
"\n", "\n",
"vectorstore.create_index(\n", "vectorstore.create_index(\n",
" num_lists, dimensions, similarity_algorithm, kind, m, ef_construction\n", " num_lists, dimensions, similarity_algorithm, kind, m, ef_construction\n",
")" ")\n",
"\n",
"\"\"\"\n",
"# DiskANN vectorstore\n",
"maxDegree = 40\n",
"dimensions = 1536\n",
"similarity_algorithm = CosmosDBSimilarityType.COS\n",
"kind = CosmosDBVectorSearchType.VECTOR_DISKANN\n",
"lBuild = 20\n",
"\n",
"vectorstore.create_index(\n",
" dimensions=dimensions,\n",
" similarity=similarity_algorithm,\n",
" kind=kind ,\n",
" max_degree=maxDegree,\n",
" l_build=lBuild,\n",
" )\n",
"\n",
"# -----------------------------------------------------------\n",
"\n",
"# HNSW vectorstore\n",
"dimensions = 1536\n",
"similarity_algorithm = CosmosDBSimilarityType.COS\n",
"kind = CosmosDBVectorSearchType.VECTOR_HNSW\n",
"m = 16\n",
"ef_construction = 64\n",
"\n",
"vectorstore.create_index(\n",
" dimensions=dimensions,\n",
" similarity=similarity_algorithm,\n",
" kind=kind ,\n",
" m=m,\n",
" ef_construction=ef_construction,\n",
" )\n",
"\"\"\""
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 7,
"id": "32c68d3246adc21f", "id": "32c68d3246adc21f",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@ -234,7 +277,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 8,
"id": "8feeeb4364efb204", "id": "8feeeb4364efb204",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@ -271,7 +314,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 9,
"id": "3c218ab6f59301f7", "id": "3c218ab6f59301f7",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@ -308,7 +351,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 10,
"id": "fd67e4d92c9ab32f", "id": "fd67e4d92c9ab32f",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@ -352,10 +395,106 @@
"Azure Cosmos DB for MongoDB supports pre-filtering with $lt, $lte, $eq, $neq, $gte, $gt, $in, $nin, and $regex. To use this feature, enable \"filtering vector search\" in the \"Preview Features\" tab of your Azure Subscription. Learn more about preview features [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search#filtered-vector-search-preview)." "Azure Cosmos DB for MongoDB supports pre-filtering with $lt, $lte, $eq, $neq, $gte, $gt, $in, $nin, and $regex. To use this feature, enable \"filtering vector search\" in the \"Preview Features\" tab of your Azure Subscription. Learn more about preview features [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search#filtered-vector-search-preview)."
] ]
}, },
{
"cell_type": "code",
"execution_count": 29,
"id": "19c43de6-47f9-45f0-a422-8d852a5d191f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw': {'defaultShard': {'numIndexesBefore': 3,\n",
" 'numIndexesAfter': 4,\n",
" 'createdCollectionAutomatically': False,\n",
" 'ok': 1}},\n",
" 'ok': 1}"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create a filter index\n",
"vectorstore.create_filter_index(\n",
" property_to_filter=\"metadata.source\", index_name=\"filter_index\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "c7031279-dfb8-43f2-a7a8-d10a3786023b",
"metadata": {},
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = vectorstore.similarity_search(\n",
" query, pre_filter={\"metadata.source\": {\"$ne\": \"filter content\"}}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "3860be72-d293-43b9-a727-425f166ff6c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "b7fb9800-b1cf-4315-af9d-e8c572d3e05f",
"metadata": {},
"outputs": [],
"source": [
"docs = vectorstore.similarity_search(\n",
" query,\n",
" pre_filter={\"metadata.source\": {\"$ne\": \"../../how_to/state_of_the_union.txt\"}},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "dba9d39e-6220-4fad-84fa-e123aa7ca6e4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "50bb4346", "id": "25ea7250-6e8f-48e6-aac9-196effbdc8d8",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []

View File

@ -44,6 +44,8 @@ class CosmosDBVectorSearchType(str, Enum):
"""IVF vector index""" """IVF vector index"""
VECTOR_HNSW = "vector-hnsw" VECTOR_HNSW = "vector-hnsw"
"""HNSW vector index""" """HNSW vector index"""
VECTOR_DISKANN = "vector-diskann"
"""DISKANN vector index"""
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -181,6 +183,8 @@ class AzureCosmosDBVectorSearch(VectorStore):
kind: str = "vector-ivf", kind: str = "vector-ivf",
m: int = 16, m: int = 16,
ef_construction: int = 64, ef_construction: int = 64,
max_degree: int = 32,
l_build: int = 50,
) -> dict[str, Any]: ) -> dict[str, Any]:
"""Creates an index using the index name specified at """Creates an index using the index name specified at
instance construction instance construction
@ -215,6 +219,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
- vector-ivf - vector-ivf
- vector-hnsw: available as a preview feature only, - vector-hnsw: available as a preview feature only,
to enable visit https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/preview-features to enable visit https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/preview-features
- vector-diskann: available as a preview feature only
num_lists: This integer is the number of clusters that the num_lists: This integer is the number of clusters that the
inverted file (IVF) index uses to group the vector data. inverted file (IVF) index uses to group the vector data.
We recommend that numLists is set to documentCount/1000 We recommend that numLists is set to documentCount/1000
@ -239,6 +244,12 @@ class AzureCosmosDBVectorSearch(VectorStore):
better index quality and higher accuracy, but it will better index quality and higher accuracy, but it will
also increase the time required to build the index. also increase the time required to build the index.
ef_construction has to be at least 2 * m ef_construction has to be at least 2 * m
max_degree: Max number of neighbors.
Default value is 32, range from 20 to 2048.
Only vector-diskann search supports this for now.
l_build: l value for index building.
Default value is 50, range from 10 to 500.
Only vector-diskann search supports this for now.
Returns: Returns:
An object describing the created index An object describing the created index
@ -254,6 +265,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
create_index_commands = self._get_vector_index_hnsw( create_index_commands = self._get_vector_index_hnsw(
kind, m, ef_construction, similarity, dimensions kind, m, ef_construction, similarity, dimensions
) )
elif kind == CosmosDBVectorSearchType.VECTOR_DISKANN:
create_index_commands = self._get_vector_index_diskann(
kind, max_degree, l_build, similarity, dimensions
)
# retrieve the database object # retrieve the database object
current_database = self._collection.database current_database = self._collection.database
@ -306,6 +321,27 @@ class AzureCosmosDBVectorSearch(VectorStore):
} }
return command return command
def _get_vector_index_diskann(
    self, kind: str, max_degree: int, l_build: int, similarity: str, dimensions: int
) -> Dict[str, Any]:
    """Build the ``createIndexes`` command document for a DiskANN index.

    Args:
        kind: Value stored under ``cosmosSearchOptions.kind``.
        max_degree: Max number of neighbors; stored as ``maxDegree``.
        l_build: l value for index building; stored as ``lBuild``.
        similarity: Value stored under ``cosmosSearchOptions.similarity``.
        dimensions: Value stored under ``cosmosSearchOptions.dimensions``.

    Returns:
        A dict naming the target collection and holding a single index
        definition keyed on the embedding field.
    """
    search_options = {
        "kind": kind,
        "maxDegree": max_degree,
        "lBuild": l_build,
        "similarity": similarity,
        "dimensions": dimensions,
    }
    index_definition = {
        "name": self._index_name,
        # The embedding field is the one indexed with "cosmosSearch".
        "key": {self._embedding_key: "cosmosSearch"},
        "cosmosSearchOptions": search_options,
    }
    return {
        "createIndexes": self._collection.name,
        "indexes": [index_definition],
    }
def create_filter_index( def create_filter_index(
self, self,
property_to_filter: str, property_to_filter: str,
@ -421,6 +457,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
pre_filter: Optional[Dict] = None, pre_filter: Optional[Dict] = None,
ef_search: int = 40, ef_search: int = 40,
score_threshold: float = 0.0, score_threshold: float = 0.0,
l_search: int = 40,
with_embedding: bool = False, with_embedding: bool = False,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Returns a list of documents with their scores """Returns a list of documents with their scores
@ -433,12 +470,16 @@ class AzureCosmosDBVectorSearch(VectorStore):
- vector-ivf - vector-ivf
- vector-hnsw: available as a preview feature only, - vector-hnsw: available as a preview feature only,
to enable visit https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/preview-features to enable visit https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/preview-features
- vector-diskann: available as a preview feature only
ef_search: The size of the dynamic candidate list for search ef_search: The size of the dynamic candidate list for search
(40 by default). A higher value provides better (40 by default). A higher value provides better
recall at the cost of speed. recall at the cost of speed.
score_threshold: (Optional[float], optional): Maximum vector distance score_threshold: (Optional[float], optional): Maximum vector distance
between selected documents and the query vector. Defaults to None. between selected documents and the query vector. Defaults to None.
Only vector-ivf search supports this for now. Only vector-ivf search supports this for now.
l_search: l value for index searching.
Default value is 40, range from 10 to 10000.
Only vector-diskann search supports this.
Returns: Returns:
A list of documents closest to the query vector A list of documents closest to the query vector
@ -450,6 +491,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
pipeline = self._get_pipeline_vector_hnsw( pipeline = self._get_pipeline_vector_hnsw(
embeddings, k, ef_search, pre_filter embeddings, k, ef_search, pre_filter
) )
elif kind == CosmosDBVectorSearchType.VECTOR_DISKANN:
pipeline = self._get_pipeline_vector_diskann(
embeddings, k, l_search, pre_filter
)
cursor = self._collection.aggregate(pipeline) cursor = self._collection.aggregate(pipeline)
@ -461,6 +506,9 @@ class AzureCosmosDBVectorSearch(VectorStore):
document_object_field = res.pop("document") document_object_field = res.pop("document")
text = document_object_field.pop(self._text_key) text = document_object_field.pop(self._text_key)
metadata = document_object_field.pop("metadata", {}) metadata = document_object_field.pop("metadata", {})
metadata["_id"] = document_object_field.pop(
"_id"
) # '_id' is in new position
if with_embedding: if with_embedding:
metadata[self._embedding_key] = document_object_field.pop( metadata[self._embedding_key] = document_object_field.pop(
self._embedding_key self._embedding_key
@ -527,6 +575,37 @@ class AzureCosmosDBVectorSearch(VectorStore):
] ]
return pipeline return pipeline
def _get_pipeline_vector_diskann(
    self,
    embeddings: List[float],
    k: int = 4,
    l_search: int = 40,
    pre_filter: Optional[Dict] = None,
) -> List[dict[str, Any]]:
    """Build the aggregation pipeline for a DiskANN vector search.

    Args:
        embeddings: Query vector placed under ``vector``.
        k: Value placed under ``k``.
        l_search: l value for index searching; placed under ``lSearch``.
        pre_filter: Optional filter document; added under ``filter`` only
            when it is truthy, so ``None`` and ``{}`` are both omitted.

    Returns:
        A two-stage pipeline: a ``$search`` stage carrying the
        ``cosmosSearch`` parameters, then a ``$project`` stage exposing the
        search score as ``similarityScore`` and the full document as
        ``document``.
    """
    search_params = {
        "vector": embeddings,
        "path": self._embedding_key,
        "k": k,
        "lSearch": l_search,
    }
    if pre_filter:
        search_params["filter"] = pre_filter

    search_stage: dict[str, Any] = {"$search": {"cosmosSearch": search_params}}
    project_stage: dict[str, Any] = {
        "$project": {
            "similarityScore": {"$meta": "searchScore"},
            "document": "$$ROOT",
        }
    }
    return [search_stage, project_stage]
def similarity_search_with_score( def similarity_search_with_score(
self, self,
query: str, query: str,
@ -535,6 +614,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
pre_filter: Optional[Dict] = None, pre_filter: Optional[Dict] = None,
ef_search: int = 40, ef_search: int = 40,
score_threshold: float = 0.0, score_threshold: float = 0.0,
l_search: int = 40,
with_embedding: bool = False, with_embedding: bool = False,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
embeddings = self._embedding.embed_query(query) embeddings = self._embedding.embed_query(query)
@ -545,6 +625,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
pre_filter=pre_filter, pre_filter=pre_filter,
ef_search=ef_search, ef_search=ef_search,
score_threshold=score_threshold, score_threshold=score_threshold,
l_search=l_search,
with_embedding=with_embedding, with_embedding=with_embedding,
) )
return docs return docs
@ -557,6 +638,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
pre_filter: Optional[Dict] = None, pre_filter: Optional[Dict] = None,
ef_search: int = 40, ef_search: int = 40,
score_threshold: float = 0.0, score_threshold: float = 0.0,
l_search: int = 40,
with_embedding: bool = False, with_embedding: bool = False,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
@ -567,6 +649,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
pre_filter=pre_filter, pre_filter=pre_filter,
ef_search=ef_search, ef_search=ef_search,
score_threshold=score_threshold, score_threshold=score_threshold,
l_search=l_search,
with_embedding=with_embedding, with_embedding=with_embedding,
) )
return [doc for doc, _ in docs_and_scores] return [doc for doc, _ in docs_and_scores]
@ -581,6 +664,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
pre_filter: Optional[Dict] = None, pre_filter: Optional[Dict] = None,
ef_search: int = 40, ef_search: int = 40,
score_threshold: float = 0.0, score_threshold: float = 0.0,
l_search: int = 40,
with_embedding: bool = False, with_embedding: bool = False,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
@ -593,6 +677,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
pre_filter=pre_filter, pre_filter=pre_filter,
ef_search=ef_search, ef_search=ef_search,
score_threshold=score_threshold, score_threshold=score_threshold,
l_search=l_search,
with_embedding=with_embedding, with_embedding=with_embedding,
) )
@ -616,6 +701,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
pre_filter: Optional[Dict] = None, pre_filter: Optional[Dict] = None,
ef_search: int = 40, ef_search: int = 40,
score_threshold: float = 0.0, score_threshold: float = 0.0,
l_search: int = 40,
with_embedding: bool = False, with_embedding: bool = False,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
@ -631,6 +717,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
pre_filter=pre_filter, pre_filter=pre_filter,
ef_search=ef_search, ef_search=ef_search,
score_threshold=score_threshold, score_threshold=score_threshold,
l_search=l_search,
with_embedding=with_embedding, with_embedding=with_embedding,
) )
return docs return docs

View File

@ -8,7 +8,7 @@ from typing import Any, Generator, Optional, Union
import pytest import pytest
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_community.embeddings import OpenAIEmbeddings from langchain_community.embeddings import AzureOpenAIEmbeddings
from langchain_community.vectorstores.azure_cosmos_db import ( from langchain_community.vectorstores.azure_cosmos_db import (
AzureCosmosDBVectorSearch, AzureCosmosDBVectorSearch,
CosmosDBSimilarityType, CosmosDBSimilarityType,
@ -24,6 +24,7 @@ model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
INDEX_NAME = "langchain-test-index" INDEX_NAME = "langchain-test-index"
INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw" INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw"
INDEX_NAME_VECTOR_DISKANN = "langchain-test-index-diskann"
NAMESPACE = "langchain_test_db.langchain_test_collection" NAMESPACE = "langchain_test_db.langchain_test_collection"
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "") CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".") DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
@ -36,6 +37,9 @@ m = 16
ef_construction = 64 ef_construction = 64
ef_search = 40 ef_search = 40
score_threshold = 0.1 score_threshold = 0.1
maxDegree = 50
lBuild = 40
lSearch = 100
application_name = "LANGCHAIN_PYTHON" application_name = "LANGCHAIN_PYTHON"
@ -53,8 +57,9 @@ def collection() -> Any:
@pytest.fixture() @pytest.fixture()
def azure_openai_embeddings() -> Any: def azure_openai_embeddings() -> Any:
openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings( openai_embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
deployment=model_deployment, model=model_name, chunk_size=1 model=model_name,
chunk_size=1,
) )
return openai_embeddings return openai_embeddings
@ -70,8 +75,12 @@ pytest tests/integration_tests/vectorstores/test_azure_cosmos_db.py
class TestAzureCosmosDBVectorSearch: class TestAzureCosmosDBVectorSearch:
@classmethod @classmethod
def setup_class(cls) -> None: def setup_class(cls) -> None:
if not os.getenv("OPENAI_API_KEY"): if not os.getenv("AZURE_OPENAI_API_KEY"):
raise ValueError("OPENAI_API_KEY environment variable is not set") raise ValueError("AZURE_OPENAI_API_KEY environment variable is not set")
if not os.getenv("AZURE_OPENAI_ENDPOINT"):
raise ValueError("AZURE_OPENAI_ENDPOINT environment variable is not set")
if not os.getenv("AZURE_OPENAI_API_VERSION"):
raise ValueError("AZURE_OPENAI_API_VERSION environment variable is not set")
# ensure the test collection is empty # ensure the test collection is empty
collection = prepare_collection() collection = prepare_collection()
@ -95,7 +104,7 @@ class TestAzureCosmosDBVectorSearch:
return "805.555.1212" return "805.555.1212"
def test_from_documents_cosine_distance( def test_from_documents_cosine_distance(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
documents = [ documents = [
@ -135,7 +144,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_documents_inner_product( def test_from_documents_inner_product(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
documents = [ documents = [
@ -174,7 +183,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_cosine_distance( def test_from_texts_cosine_distance(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -208,7 +217,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_with_metadatas_cosine_distance( def test_from_texts_with_metadatas_cosine_distance(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -246,7 +255,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_with_metadatas_delete_one( def test_from_texts_with_metadatas_delete_one(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -280,7 +289,6 @@ class TestAzureCosmosDBVectorSearch:
assert output assert output
assert output[0].page_content == "What is a sandwich?" assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1 assert output[0].metadata["c"] == 1
first_document_id_object = output[0].metadata["_id"] first_document_id_object = output[0].metadata["_id"]
first_document_id = str(first_document_id_object) first_document_id = str(first_document_id_object)
@ -300,7 +308,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_with_metadatas_delete_multiple( def test_from_texts_with_metadatas_delete_multiple(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -359,7 +367,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_with_metadatas_inner_product( def test_from_texts_with_metadatas_inner_product(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -397,7 +405,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_with_metadatas_euclidean_distance( def test_from_texts_with_metadatas_euclidean_distance(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -435,7 +443,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_max_marginal_relevance_cosine_distance( def test_max_marginal_relevance_cosine_distance(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = ["foo", "foo", "fou", "foy"] texts = ["foo", "foo", "fou", "foy"]
vectorstore = AzureCosmosDBVectorSearch.from_texts( vectorstore = AzureCosmosDBVectorSearch.from_texts(
@ -453,7 +461,12 @@ class TestAzureCosmosDBVectorSearch:
query = "foo" query = "foo"
output = vectorstore.max_marginal_relevance_search( output = vectorstore.max_marginal_relevance_search(
query, k=10, kind=kind, lambda_mult=0.1, score_threshold=score_threshold query,
k=10,
kind=kind,
lambda_mult=0.1,
score_threshold=score_threshold,
with_embedding=True,
) )
assert len(output) == len(texts) assert len(output) == len(texts)
@ -463,7 +476,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_max_marginal_relevance_inner_product( def test_max_marginal_relevance_inner_product(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = ["foo", "foo", "fou", "foy"] texts = ["foo", "foo", "fou", "foy"]
vectorstore = AzureCosmosDBVectorSearch.from_texts( vectorstore = AzureCosmosDBVectorSearch.from_texts(
@ -481,7 +494,12 @@ class TestAzureCosmosDBVectorSearch:
query = "foo" query = "foo"
output = vectorstore.max_marginal_relevance_search( output = vectorstore.max_marginal_relevance_search(
query, k=10, kind=kind, lambda_mult=0.1, score_threshold=score_threshold query,
k=10,
kind=kind,
lambda_mult=0.1,
score_threshold=score_threshold,
with_embedding=True,
) )
assert len(output) == len(texts) assert len(output) == len(texts)
@ -495,7 +513,7 @@ class TestAzureCosmosDBVectorSearch:
""" """
def test_from_documents_cosine_distance_vector_hnsw( def test_from_documents_cosine_distance_vector_hnsw(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
documents = [ documents = [
@ -539,7 +557,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_documents_inner_product_vector_hnsw( def test_from_documents_inner_product_vector_hnsw(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
documents = [ documents = [
@ -583,7 +601,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_cosine_distance_vector_hnsw( def test_from_texts_cosine_distance_vector_hnsw(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -622,7 +640,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_with_metadatas_cosine_distance_vector_hnsw( def test_from_texts_with_metadatas_cosine_distance_vector_hnsw(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -665,7 +683,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_with_metadatas_delete_one_vector_hnsw( def test_from_texts_with_metadatas_delete_one_vector_hnsw(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -724,7 +742,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_with_metadatas_delete_multiple_vector_hnsw( def test_from_texts_with_metadatas_delete_multiple_vector_hnsw(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -788,7 +806,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_from_texts_with_metadatas_inner_product_vector_hnsw( def test_from_texts_with_metadatas_inner_product_vector_hnsw(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -831,7 +849,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_max_marginal_relevance_cosine_distance_vector_hnsw( def test_max_marginal_relevance_cosine_distance_vector_hnsw(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = ["foo", "foo", "fou", "foy"] texts = ["foo", "foo", "fou", "foy"]
vectorstore = AzureCosmosDBVectorSearch.from_texts( vectorstore = AzureCosmosDBVectorSearch.from_texts(
@ -859,6 +877,7 @@ class TestAzureCosmosDBVectorSearch:
kind=CosmosDBVectorSearchType.VECTOR_HNSW, kind=CosmosDBVectorSearchType.VECTOR_HNSW,
lambda_mult=0.1, lambda_mult=0.1,
score_threshold=score_threshold, score_threshold=score_threshold,
with_embedding=True,
) )
assert len(output) == len(texts) assert len(output) == len(texts)
@ -868,7 +887,7 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_index() vectorstore.delete_index()
def test_max_marginal_relevance_inner_product_vector_hnsw( def test_max_marginal_relevance_inner_product_vector_hnsw(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
texts = ["foo", "foo", "fou", "foy"] texts = ["foo", "foo", "fou", "foy"]
vectorstore = AzureCosmosDBVectorSearch.from_texts( vectorstore = AzureCosmosDBVectorSearch.from_texts(
@ -896,6 +915,405 @@ class TestAzureCosmosDBVectorSearch:
kind=CosmosDBVectorSearchType.VECTOR_HNSW, kind=CosmosDBVectorSearchType.VECTOR_HNSW,
lambda_mult=0.1, lambda_mult=0.1,
score_threshold=score_threshold, score_threshold=score_threshold,
with_embedding=True,
)
assert len(output) == len(texts)
assert output[0].page_content == "foo"
assert output[1].page_content != "foo"
vectorstore.delete_index()
"""
Test cases for the similarity algorithm using vector-diskann
"""
def test_from_documents_cosine_distance_vector_diskann(
    self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None:
    """Build a store from documents, index it with DiskANN, and query it.

    The index is created with the shared ``similarity_algorithm`` setting.
    The single nearest hit for "Sandwich" must be the sandwich document,
    returned together with its metadata.
    """
    corpus = [
        ("Dogs are tough.", {"a": 1}),
        ("Cats have fluff.", {"b": 1}),
        ("What is a sandwich?", {"c": 1}),
        ("That fence is purple.", {"d": 1, "e": 2}),
    ]
    docs = [Document(page_content=text, metadata=meta) for text, meta in corpus]
    store = AzureCosmosDBVectorSearch.from_documents(
        docs,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME_VECTOR_DISKANN,
    )
    sleep(1)  # give Cosmos DB time to persist the inserted documents

    # The DiskANN index must exist before the vector search below can use it.
    diskann = CosmosDBVectorSearchType.VECTOR_DISKANN
    index_options = {
        "dimensions": dimensions,
        "similarity": similarity_algorithm,
        "kind": diskann,
        "max_degree": maxDegree,
        "l_build": lBuild,
    }
    store.create_index(**index_options)
    sleep(2)  # give the index time to be set up

    hits = store.similarity_search("Sandwich", k=1, kind=diskann, lSearch=lSearch)

    assert hits
    best = hits[0]
    assert best.page_content == "What is a sandwich?"
    assert best.metadata["c"] == 1
    store.delete_index()
def test_from_documents_inner_product_vector_diskann(
    self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None:
    """Test end to end construction and search with an inner-product index.

    Builds the store from documents, creates a DiskANN index that uses the
    inner-product similarity metric, and checks that the nearest hit for
    "Sandwich" is the sandwich document with its metadata intact.
    """
    # Imported locally so the inner-product metric is in scope regardless of
    # what the module-level import list provides.
    from langchain_community.vectorstores.azure_cosmos_db import (
        CosmosDBSimilarityType,
    )

    documents = [
        Document(page_content="Dogs are tough.", metadata={"a": 1}),
        Document(page_content="Cats have fluff.", metadata={"b": 1}),
        Document(page_content="What is a sandwich?", metadata={"c": 1}),
        Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
    ]

    vectorstore = AzureCosmosDBVectorSearch.from_documents(
        documents,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME_VECTOR_DISKANN,
    )
    sleep(1)  # waits for Cosmos DB to save contents to the collection

    # Create the DiskANN index that will be leveraged later for vector search.
    # The metric is set explicitly to inner product: passing the shared
    # `similarity_algorithm` default would make this test a duplicate of the
    # cosine-distance variant.
    vectorstore.create_index(
        dimensions=dimensions,
        similarity=CosmosDBSimilarityType.IP,
        kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
        max_degree=maxDegree,
        l_build=lBuild,
    )
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search(
        "Sandwich",
        k=1,
        kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
        lSearch=lSearch,
    )

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1
    vectorstore.delete_index()
def test_from_texts_cosine_distance_vector_diskann(
    self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None:
    """Build a store from raw texts, index it with DiskANN, and query it.

    The index is created with the shared ``similarity_algorithm`` setting.
    The nearest hit for "Sandwich" must be the sandwich sentence.
    """
    sentences = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "That fence is purple.",
    ]
    store = AzureCosmosDBVectorSearch.from_texts(
        sentences,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME_VECTOR_DISKANN,
    )

    # The DiskANN index must exist before the vector search below can use it.
    diskann = CosmosDBVectorSearchType.VECTOR_DISKANN
    index_options = {
        "dimensions": dimensions,
        "similarity": similarity_algorithm,
        "kind": diskann,
        "max_degree": maxDegree,
        "l_build": lBuild,
    }
    store.create_index(**index_options)
    sleep(2)  # give the index time to be set up

    hits = store.similarity_search("Sandwich", k=1, kind=diskann, lSearch=lSearch)

    best = hits[0]
    assert best.page_content == "What is a sandwich?"
    store.delete_index()
def test_from_texts_with_metadatas_cosine_distance_vector_diskann(
    self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None:
    """Build a store from texts plus metadata, index with DiskANN, and query.

    The index is created with the shared ``similarity_algorithm`` setting.
    The nearest hit for "Sandwich" must be the sandwich sentence, and the
    metadata supplied for it must come back with the document.
    """
    sentences = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    sentence_metadata = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    store = AzureCosmosDBVectorSearch.from_texts(
        sentences,
        azure_openai_embeddings,
        metadatas=sentence_metadata,
        collection=collection,
        index_name=INDEX_NAME_VECTOR_DISKANN,
    )

    # The DiskANN index must exist before the vector search below can use it.
    diskann = CosmosDBVectorSearchType.VECTOR_DISKANN
    index_options = {
        "dimensions": dimensions,
        "similarity": similarity_algorithm,
        "kind": diskann,
        "max_degree": maxDegree,
        "l_build": lBuild,
    }
    store.create_index(**index_options)
    sleep(2)  # give the index time to be set up

    hits = store.similarity_search("Sandwich", k=1, kind=diskann, lSearch=lSearch)

    assert hits
    best = hits[0]
    assert best.page_content == "What is a sandwich?"
    assert best.metadata["c"] == 1
    store.delete_index()
def test_from_texts_with_metadatas_delete_one_vector_diskann(
    self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None:
    """Delete the best match by id and confirm it no longer ranks first.

    After indexing with DiskANN, the top hit for "Sandwich" is looked up and
    removed via ``delete_document_by_id``. Repeating the same query must
    then return a different document.
    """
    sentences = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    sentence_metadata = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    store = AzureCosmosDBVectorSearch.from_texts(
        sentences,
        azure_openai_embeddings,
        metadatas=sentence_metadata,
        collection=collection,
        index_name=INDEX_NAME_VECTOR_DISKANN,
    )

    # The DiskANN index must exist before the vector searches below can use it.
    diskann = CosmosDBVectorSearchType.VECTOR_DISKANN
    index_options = {
        "dimensions": dimensions,
        "similarity": similarity_algorithm,
        "kind": diskann,
        "max_degree": maxDegree,
        "l_build": lBuild,
    }
    store.create_index(**index_options)
    sleep(2)  # give the index time to be set up

    before = store.similarity_search("Sandwich", k=1, kind=diskann, lSearch=lSearch)

    assert before
    best = before[0]
    assert best.page_content == "What is a sandwich?"
    assert best.metadata["c"] == 1

    # The stored "_id" is stringified before being handed to the delete call.
    store.delete_document_by_id(str(best.metadata["_id"]))
    sleep(2)  # give the index time to reflect the deletion

    after = store.similarity_search("Sandwich", k=1, kind=diskann, lSearch=lSearch)
    assert after
    assert after[0].page_content != "What is a sandwich?"

    store.delete_index()
def test_from_texts_with_metadatas_delete_multiple_vector_diskann(
    self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None:
    """Delete three of four documents and confirm only one remains searchable.

    After indexing with DiskANN, a ``k=5`` query must return all four stored
    documents. The first three hits are then deleted by id, and repeating
    the query must return exactly one document.
    """
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME_VECTOR_DISKANN,
    )

    # Create the DiskANN index that will be leveraged later for vector search
    vectorstore.create_index(
        dimensions=dimensions,
        similarity=similarity_algorithm,
        kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
        max_degree=maxDegree,
        l_build=lBuild,
    )
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search(
        "Sandwich",
        k=5,
        kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
        lSearch=lSearch,
    )

    # Validate the first result set before indexing into it, so a short or
    # empty result fails as a readable assertion (and before anything is
    # deleted) rather than as an IndexError.
    assert output
    assert len(output) == 4  # we should see all the four documents

    document_ids = [str(document.metadata["_id"]) for document in output[:3]]
    vectorstore.delete(document_ids)
    sleep(2)  # waits for the index to be updated

    output_2 = vectorstore.similarity_search(
        "Sandwich",
        k=5,
        kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
        lSearch=lSearch,
    )
    assert output_2
    assert (
        len(output_2) == 1
    )  # we should see only one document left after three have been deleted

    vectorstore.delete_index()
def test_from_texts_with_metadatas_inner_product_vector_diskann(
    self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None:
    """Test texts-with-metadata search against an inner-product DiskANN index.

    Builds the store from texts and metadata, creates a DiskANN index that
    uses the inner-product similarity metric, and checks that the nearest
    hit for "Sandwich" is the sandwich sentence with its metadata intact.
    """
    # Imported locally so the inner-product metric is in scope regardless of
    # what the module-level import list provides.
    from langchain_community.vectorstores.azure_cosmos_db import (
        CosmosDBSimilarityType,
    )

    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME_VECTOR_DISKANN,
    )

    # Create the DiskANN index that will be leveraged later for vector search.
    # The metric is set explicitly to inner product: passing the shared
    # `similarity_algorithm` default would make this test a duplicate of the
    # cosine-distance variant.
    vectorstore.create_index(
        dimensions=dimensions,
        similarity=CosmosDBSimilarityType.IP,
        kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
        max_degree=maxDegree,
        l_build=lBuild,
    )
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search(
        "Sandwich",
        k=1,
        kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
        lSearch=lSearch,
    )

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1
    vectorstore.delete_index()
def test_max_marginal_relevance_cosine_distance_vector_diskann(
    self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None:
    """Run a max-marginal-relevance search against a DiskANN index.

    The corpus holds a duplicated "foo" plus two near-variants. With
    ``lambda_mult=0.1`` the search must return every text, with "foo" first
    and something other than "foo" second.
    """
    corpus = ["foo", "foo", "fou", "foy"]
    store = AzureCosmosDBVectorSearch.from_texts(
        corpus,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME_VECTOR_DISKANN,
    )

    # The DiskANN index must exist before the vector search below can use it.
    diskann = CosmosDBVectorSearchType.VECTOR_DISKANN
    index_options = {
        "dimensions": dimensions,
        "similarity": similarity_algorithm,
        "kind": diskann,
        "max_degree": maxDegree,
        "l_build": lBuild,
    }
    store.create_index(**index_options)
    sleep(2)  # give the index time to be set up

    hits = store.max_marginal_relevance_search(
        "foo",
        k=10,
        kind=diskann,
        lambda_mult=0.1,
        lSearch=lSearch,
        with_embedding=True,
    )

    assert len(hits) == len(corpus)
    first, second = hits[0], hits[1]
    assert first.page_content == "foo"
    assert second.page_content != "foo"
    store.delete_index()
def test_max_marginal_relevance_inner_product_vector_diskann(
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None:
texts = ["foo", "foo", "fou", "foy"]
vectorstore = AzureCosmosDBVectorSearch.from_texts(
texts,
azure_openai_embeddings,
collection=collection,
index_name=INDEX_NAME_VECTOR_DISKANN,
)
# Create the DiskANN index that will be leveraged later for vector search
vectorstore.create_index(
dimensions=dimensions,
similarity=similarity_algorithm,
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
max_degree=maxDegree,
l_build=lBuild,
)
sleep(2) # waits for the index to be set up
query = "foo"
output = vectorstore.max_marginal_relevance_search(
query,
k=10,
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
lambda_mult=0.1,
lSearch=lSearch,
with_embedding=True,
) )
assert len(output) == len(texts) assert len(output) == len(texts)
@ -906,7 +1324,7 @@ class TestAzureCosmosDBVectorSearch:
@staticmethod @staticmethod
def invoke_delete_with_no_args( def invoke_delete_with_no_args(
azure_openai_embeddings: OpenAIEmbeddings, collection: Any azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> Optional[bool]: ) -> Optional[bool]:
vectorstore: AzureCosmosDBVectorSearch = ( vectorstore: AzureCosmosDBVectorSearch = (
AzureCosmosDBVectorSearch.from_connection_string( AzureCosmosDBVectorSearch.from_connection_string(
@ -922,7 +1340,7 @@ class TestAzureCosmosDBVectorSearch:
@staticmethod @staticmethod
def invoke_delete_by_id_with_no_args( def invoke_delete_by_id_with_no_args(
azure_openai_embeddings: OpenAIEmbeddings, collection: Any azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
vectorstore: AzureCosmosDBVectorSearch = ( vectorstore: AzureCosmosDBVectorSearch = (
AzureCosmosDBVectorSearch.from_connection_string( AzureCosmosDBVectorSearch.from_connection_string(
@ -937,14 +1355,14 @@ class TestAzureCosmosDBVectorSearch:
vectorstore.delete_document_by_id() vectorstore.delete_document_by_id()
def test_invalid_arguments_to_delete( def test_invalid_arguments_to_delete(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
with pytest.raises(ValueError) as exception_info: with pytest.raises(ValueError) as exception_info:
self.invoke_delete_with_no_args(azure_openai_embeddings, collection) self.invoke_delete_with_no_args(azure_openai_embeddings, collection)
assert str(exception_info.value) == "No document ids provided to delete." assert str(exception_info.value) == "No document ids provided to delete."
def test_no_arguments_to_delete_by_id( def test_no_arguments_to_delete_by_id(
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
) -> None: ) -> None:
with pytest.raises(Exception) as exception_info: with pytest.raises(Exception) as exception_info:
self.invoke_delete_by_id_with_no_args( self.invoke_delete_by_id_with_no_args(