community: VectorStores: Azure Cosmos DB Mongo vCore with DiskANN (#27329)

# Description
Add a new vector index type, `diskann`, to the Azure Cosmos DB Mongo vCore
vector store. The DiskANN paper is available here: [DiskANN: Fast Accurate
Billion-point Nearest Neighbor Search on a Single
Node](https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf).

## Sample Usage
```python
import os

from pymongo import MongoClient

# Example identifiers; replace them with your own index, database, and
# collection names. NAMESPACE has the form "<database>.<collection>".
INDEX_NAME = "izzy-test-index-2"
NAMESPACE = "izzy_test_db.izzy_test_collection"
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")

# NOTE: CONNECTION_STRING, docs, openai_embeddings, AzureCosmosDBVectorSearch,
# CosmosDBSimilarityType, and CosmosDBVectorSearchType are not defined in this
# snippet; they are assumed to be defined or imported beforehand.
client: MongoClient = MongoClient(CONNECTION_STRING)
collection = client[DB_NAME][COLLECTION_NAME]

# Embedding settings read from the environment, with fallback defaults.
# NOTE(review): neither value is consumed below; presumably they are meant for
# constructing `openai_embeddings` -- confirm against the full example.
model_deployment = os.getenv(
    "OPENAI_EMBEDDINGS_DEPLOYMENT", "smart-agent-embedding-ada"
)
model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")

# Embed the documents and store them in the collection under INDEX_NAME.
vectorstore = AzureCosmosDBVectorSearch.from_documents(
    docs,
    openai_embeddings,
    collection=collection,
    index_name=INDEX_NAME,
)

# Read more about these parameters here:
# https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search
max_degree = 40
dimensions = 1536
similarity_algorithm = CosmosDBSimilarityType.COS
kind = CosmosDBVectorSearchType.VECTOR_DISKANN
l_build = 20

# Create the DiskANN vector index on the collection.
vectorstore.create_index(
    dimensions=dimensions,
    similarity=similarity_algorithm,
    kind=kind,
    max_degree=max_degree,
    l_build=l_build,
)
```

## Dependencies
No additional dependencies were added

---------

Co-authored-by: Yang Qiao (from Dev Box) <yangqiao@microsoft.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
fatmelon
2024-12-12 09:54:04 +08:00
committed by GitHub
parent ba9b95cd23
commit d1e0ec7b55
3 changed files with 703 additions and 59 deletions

View File

@@ -38,9 +38,6 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
@@ -74,7 +71,7 @@
"id": "f2e66b097c6ce2e3",
"metadata": {},
"source": [
"We want to use `OpenAIEmbeddings` so we need to set up our Azure OpenAI API Key alongside other environment variables. "
"We want to use `AzureOpenAIEmbeddings` so we need to set up our Azure OpenAI API Key alongside other environment variables. "
]
},
{
@@ -90,15 +87,10 @@
"outputs": [],
"source": [
"# Set up the OpenAI Environment Variables\n",
"os.environ[\"OPENAI_API_TYPE\"] = \"azure\"\n",
"os.environ[\"OPENAI_API_VERSION\"] = \"2023-05-15\"\n",
"os.environ[\"OPENAI_API_BASE\"] = (\n",
" \"YOUR_OPEN_AI_ENDPOINT\" # https://example.openai.azure.com/\n",
")\n",
"os.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY\"\n",
"os.environ[\"OPENAI_EMBEDDINGS_DEPLOYMENT\"] = (\n",
" \"smart-agent-embedding-ada\" # the deployment name for the embedding model\n",
")\n",
"\n",
"os.environ[\"AZURE_OPENAI_API_KEY\"] = \"YOUR_AZURE_OPENAI_API_KEY\"\n",
"os.environ[\"AZURE_OPENAI_ENDPOINT\"] = \"YOUR_AZURE_OPENAI_ENDPOINT\"\n",
"os.environ[\"AZURE_OPENAI_API_VERSION\"] = \"2023-05-15\"\n",
"os.environ[\"OPENAI_EMBEDDINGS_MODEL_NAME\"] = \"text-embedding-ada-002\" # the model name"
]
},
@@ -130,7 +122,7 @@
" CosmosDBSimilarityType,\n",
" CosmosDBVectorSearchType,\n",
")\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_openai import AzureOpenAIEmbeddings\n",
"from langchain_text_splitters import CharacterTextSplitter\n",
"\n",
"SOURCE_FILE_NAME = \"../../how_to/state_of_the_union.txt\"\n",
@@ -147,14 +139,35 @@
"model_name = os.getenv(\"OPENAI_EMBEDDINGS_MODEL_NAME\", \"text-embedding-ada-002\")\n",
"\n",
"\n",
"openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings(\n",
" deployment=model_deployment, model=model_name, chunk_size=1\n",
"openai_embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(\n",
" model=model_name, chunk_size=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f6c6ed80-7b91-4833-bab5-c9b2b5edcdec",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': '../../how_to/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\nWith a duty to one another to the American people to the Constitution. \\n\\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \\n\\nSix days ago, Russias Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \\n\\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \\n\\nHe met the Ukrainian people. \\n\\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "39ae6058c2f7fdf1",
"metadata": {
"ExecuteTime": {
@@ -166,14 +179,10 @@
{
"data": {
"text/plain": [
"{'raw': {'defaultShard': {'numIndexesBefore': 1,\n",
" 'numIndexesAfter': 2,\n",
" 'createdCollectionAutomatically': False,\n",
" 'ok': 1}},\n",
" 'ok': 1}"
"'\\n# DiskANN vectorstore\\nmaxDegree = 40\\ndimensions = 1536\\nsimilarity_algorithm = CosmosDBSimilarityType.COS\\nkind = CosmosDBVectorSearchType.VECTOR_DISKANN\\nlBuild = 20\\n\\nvectorstore.create_index(\\n dimensions=dimensions,\\n similarity=similarity_algorithm,\\n kind=kind ,\\n max_degree=maxDegree,\\n l_build=lBuild,\\n )\\n\\n# -----------------------------------------------------------\\n\\n# HNSW vectorstore\\ndimensions = 1536\\nsimilarity_algorithm = CosmosDBSimilarityType.COS\\nkind = CosmosDBVectorSearchType.VECTOR_HNSW\\nm = 16\\nef_construction = 64\\n\\nvectorstore.create_index(\\n dimensions=dimensions,\\n similarity=similarity_algorithm,\\n kind=kind ,\\n m=m,\\n ef_construction=ef_construction,\\n )\\n'"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -212,12 +221,46 @@
"\n",
"vectorstore.create_index(\n",
" num_lists, dimensions, similarity_algorithm, kind, m, ef_construction\n",
")"
")\n",
"\n",
"\"\"\"\n",
"# DiskANN vectorstore\n",
"maxDegree = 40\n",
"dimensions = 1536\n",
"similarity_algorithm = CosmosDBSimilarityType.COS\n",
"kind = CosmosDBVectorSearchType.VECTOR_DISKANN\n",
"lBuild = 20\n",
"\n",
"vectorstore.create_index(\n",
" dimensions=dimensions,\n",
" similarity=similarity_algorithm,\n",
" kind=kind ,\n",
" max_degree=maxDegree,\n",
" l_build=lBuild,\n",
" )\n",
"\n",
"# -----------------------------------------------------------\n",
"\n",
"# HNSW vectorstore\n",
"dimensions = 1536\n",
"similarity_algorithm = CosmosDBSimilarityType.COS\n",
"kind = CosmosDBVectorSearchType.VECTOR_HNSW\n",
"m = 16\n",
"ef_construction = 64\n",
"\n",
"vectorstore.create_index(\n",
" dimensions=dimensions,\n",
" similarity=similarity_algorithm,\n",
" kind=kind ,\n",
" m=m,\n",
" ef_construction=ef_construction,\n",
" )\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "32c68d3246adc21f",
"metadata": {
"ExecuteTime": {
@@ -234,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "8feeeb4364efb204",
"metadata": {
"ExecuteTime": {
@@ -271,7 +314,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"id": "3c218ab6f59301f7",
"metadata": {
"ExecuteTime": {
@@ -308,7 +351,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"id": "fd67e4d92c9ab32f",
"metadata": {
"ExecuteTime": {
@@ -352,10 +395,106 @@
"Azure Cosmos DB for MongoDB supports pre-filtering with $lt, $lte, $eq, $ne, $gte, $gt, $in, $nin, and $regex. To use this feature, enable \"filtering vector search\" in the \"Preview Features\" tab of your Azure Subscription. Learn more about preview features [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search#filtered-vector-search-preview)."
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "19c43de6-47f9-45f0-a422-8d852a5d191f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw': {'defaultShard': {'numIndexesBefore': 3,\n",
" 'numIndexesAfter': 4,\n",
" 'createdCollectionAutomatically': False,\n",
" 'ok': 1}},\n",
" 'ok': 1}"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create a filter index\n",
"vectorstore.create_filter_index(\n",
" property_to_filter=\"metadata.source\", index_name=\"filter_index\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "c7031279-dfb8-43f2-a7a8-d10a3786023b",
"metadata": {},
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = vectorstore.similarity_search(\n",
" query, pre_filter={\"metadata.source\": {\"$ne\": \"filter content\"}}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "3860be72-d293-43b9-a727-425f166ff6c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "b7fb9800-b1cf-4315-af9d-e8c572d3e05f",
"metadata": {},
"outputs": [],
"source": [
"docs = vectorstore.similarity_search(\n",
" query,\n",
" pre_filter={\"metadata.source\": {\"$ne\": \"../../how_to/state_of_the_union.txt\"}},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "dba9d39e-6220-4fad-84fa-e123aa7ca6e4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "50bb4346",
"id": "25ea7250-6e8f-48e6-aac9-196effbdc8d8",
"metadata": {},
"outputs": [],
"source": []