mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-11 07:50:47 +00:00
community: VectorStores: Azure Cosmos DB Mongo vCore with DiskANN (#27329)
# Description Add a new vector index type `diskann` to Azure Cosmos DB Mongo vCore vector store. Paper of DiskANN can be found here [DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node](https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf). ## Sample Usage ```python from pymongo import MongoClient # INDEX_NAME = "izzy-test-index-2" # NAMESPACE = "izzy_test_db.izzy_test_collection" # DB_NAME, COLLECTION_NAME = NAMESPACE.split(".") client: MongoClient = MongoClient(CONNECTION_STRING) collection = client[DB_NAME][COLLECTION_NAME] model_deployment = os.getenv( "OPENAI_EMBEDDINGS_DEPLOYMENT", "smart-agent-embedding-ada" ) model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002") vectorstore = AzureCosmosDBVectorSearch.from_documents( docs, openai_embeddings, collection=collection, index_name=INDEX_NAME, ) # Read more about these variables in detail here. https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search maxDegree = 40 dimensions = 1536 similarity_algorithm = CosmosDBSimilarityType.COS kind = CosmosDBVectorSearchType.VECTOR_DISKANN lBuild = 20 vectorstore.create_index( dimensions=dimensions, similarity=similarity_algorithm, kind=kind , max_degree=maxDegree, l_build=lBuild, ) ``` ## Dependencies No additional dependencies were added --------- Co-authored-by: Yang Qiao (from Dev Box) <yangqiao@microsoft.com> Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -44,6 +44,8 @@ class CosmosDBVectorSearchType(str, Enum):
|
||||
"""IVF vector index"""
|
||||
VECTOR_HNSW = "vector-hnsw"
|
||||
"""HNSW vector index"""
|
||||
VECTOR_DISKANN = "vector-diskann"
|
||||
"""DISKANN vector index"""
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -181,6 +183,8 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
kind: str = "vector-ivf",
|
||||
m: int = 16,
|
||||
ef_construction: int = 64,
|
||||
max_degree: int = 32,
|
||||
l_build: int = 50,
|
||||
) -> dict[str, Any]:
|
||||
"""Creates an index using the index name specified at
|
||||
instance construction
|
||||
@@ -215,6 +219,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
- vector-ivf
|
||||
- vector-hnsw: available as a preview feature only,
|
||||
to enable visit https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/preview-features
|
||||
- vector-diskann: available as a preview feature only
|
||||
num_lists: This integer is the number of clusters that the
|
||||
inverted file (IVF) index uses to group the vector data.
|
||||
We recommend that numLists is set to documentCount/1000
|
||||
@@ -239,6 +244,12 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
better index quality and higher accuracy, but it will
|
||||
also increase the time required to build the index.
|
||||
ef_construction has to be at least 2 * m
|
||||
max_degree: Max number of neighbors.
|
||||
Default value is 32, range from 20 to 2048.
|
||||
Only vector-diskann search supports this for now.
|
||||
l_build: l value for index building.
|
||||
Default value is 50, range from 10 to 500.
|
||||
Only vector-diskann search supports this for now.
|
||||
Returns:
|
||||
An object describing the created index
|
||||
|
||||
@@ -254,6 +265,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
create_index_commands = self._get_vector_index_hnsw(
|
||||
kind, m, ef_construction, similarity, dimensions
|
||||
)
|
||||
elif kind == CosmosDBVectorSearchType.VECTOR_DISKANN:
|
||||
create_index_commands = self._get_vector_index_diskann(
|
||||
kind, max_degree, l_build, similarity, dimensions
|
||||
)
|
||||
|
||||
# retrieve the database object
|
||||
current_database = self._collection.database
|
||||
@@ -306,6 +321,27 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
}
|
||||
return command
|
||||
|
||||
def _get_vector_index_diskann(
|
||||
self, kind: str, max_degree: int, l_build: int, similarity: str, dimensions: int
|
||||
) -> Dict[str, Any]:
|
||||
command = {
|
||||
"createIndexes": self._collection.name,
|
||||
"indexes": [
|
||||
{
|
||||
"name": self._index_name,
|
||||
"key": {self._embedding_key: "cosmosSearch"},
|
||||
"cosmosSearchOptions": {
|
||||
"kind": kind,
|
||||
"maxDegree": max_degree,
|
||||
"lBuild": l_build,
|
||||
"similarity": similarity,
|
||||
"dimensions": dimensions,
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
return command
|
||||
|
||||
def create_filter_index(
|
||||
self,
|
||||
property_to_filter: str,
|
||||
@@ -421,6 +457,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
pre_filter: Optional[Dict] = None,
|
||||
ef_search: int = 40,
|
||||
score_threshold: float = 0.0,
|
||||
l_search: int = 40,
|
||||
with_embedding: bool = False,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Returns a list of documents with their scores
|
||||
@@ -433,12 +470,16 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
- vector-ivf
|
||||
- vector-hnsw: available as a preview feature only,
|
||||
to enable visit https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/preview-features
|
||||
- vector-diskann: available as a preview feature only
|
||||
ef_search: The size of the dynamic candidate list for search
|
||||
(40 by default). A higher value provides better
|
||||
recall at the cost of speed.
|
||||
score_threshold: (Optional[float], optional): Maximum vector distance
|
||||
between selected documents and the query vector. Defaults to None.
|
||||
Only vector-ivf search supports this for now.
|
||||
l_search: l value for index searching.
|
||||
Default value is 40, range from 10 to 10000.
|
||||
Only vector-diskann search supports this.
|
||||
|
||||
Returns:
|
||||
A list of documents closest to the query vector
|
||||
@@ -450,6 +491,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
pipeline = self._get_pipeline_vector_hnsw(
|
||||
embeddings, k, ef_search, pre_filter
|
||||
)
|
||||
elif kind == CosmosDBVectorSearchType.VECTOR_DISKANN:
|
||||
pipeline = self._get_pipeline_vector_diskann(
|
||||
embeddings, k, l_search, pre_filter
|
||||
)
|
||||
|
||||
cursor = self._collection.aggregate(pipeline)
|
||||
|
||||
@@ -461,6 +506,9 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
document_object_field = res.pop("document")
|
||||
text = document_object_field.pop(self._text_key)
|
||||
metadata = document_object_field.pop("metadata", {})
|
||||
metadata["_id"] = document_object_field.pop(
|
||||
"_id"
|
||||
) # '_id' is in new position
|
||||
if with_embedding:
|
||||
metadata[self._embedding_key] = document_object_field.pop(
|
||||
self._embedding_key
|
||||
@@ -527,6 +575,37 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
]
|
||||
return pipeline
|
||||
|
||||
def _get_pipeline_vector_diskann(
|
||||
self,
|
||||
embeddings: List[float],
|
||||
k: int = 4,
|
||||
l_search: int = 40,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
) -> List[dict[str, Any]]:
|
||||
params = {
|
||||
"vector": embeddings,
|
||||
"path": self._embedding_key,
|
||||
"k": k,
|
||||
"lSearch": l_search,
|
||||
}
|
||||
if pre_filter:
|
||||
params["filter"] = pre_filter
|
||||
|
||||
pipeline: List[dict[str, Any]] = [
|
||||
{
|
||||
"$search": {
|
||||
"cosmosSearch": params,
|
||||
}
|
||||
},
|
||||
{
|
||||
"$project": {
|
||||
"similarityScore": {"$meta": "searchScore"},
|
||||
"document": "$$ROOT",
|
||||
}
|
||||
},
|
||||
]
|
||||
return pipeline
|
||||
|
||||
def similarity_search_with_score(
|
||||
self,
|
||||
query: str,
|
||||
@@ -535,6 +614,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
pre_filter: Optional[Dict] = None,
|
||||
ef_search: int = 40,
|
||||
score_threshold: float = 0.0,
|
||||
l_search: int = 40,
|
||||
with_embedding: bool = False,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
embeddings = self._embedding.embed_query(query)
|
||||
@@ -545,6 +625,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
pre_filter=pre_filter,
|
||||
ef_search=ef_search,
|
||||
score_threshold=score_threshold,
|
||||
l_search=l_search,
|
||||
with_embedding=with_embedding,
|
||||
)
|
||||
return docs
|
||||
@@ -557,6 +638,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
pre_filter: Optional[Dict] = None,
|
||||
ef_search: int = 40,
|
||||
score_threshold: float = 0.0,
|
||||
l_search: int = 40,
|
||||
with_embedding: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
@@ -567,6 +649,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
pre_filter=pre_filter,
|
||||
ef_search=ef_search,
|
||||
score_threshold=score_threshold,
|
||||
l_search=l_search,
|
||||
with_embedding=with_embedding,
|
||||
)
|
||||
return [doc for doc, _ in docs_and_scores]
|
||||
@@ -581,6 +664,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
pre_filter: Optional[Dict] = None,
|
||||
ef_search: int = 40,
|
||||
score_threshold: float = 0.0,
|
||||
l_search: int = 40,
|
||||
with_embedding: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
@@ -593,6 +677,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
pre_filter=pre_filter,
|
||||
ef_search=ef_search,
|
||||
score_threshold=score_threshold,
|
||||
l_search=l_search,
|
||||
with_embedding=with_embedding,
|
||||
)
|
||||
|
||||
@@ -616,6 +701,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
pre_filter: Optional[Dict] = None,
|
||||
ef_search: int = 40,
|
||||
score_threshold: float = 0.0,
|
||||
l_search: int = 40,
|
||||
with_embedding: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
@@ -631,6 +717,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
pre_filter=pre_filter,
|
||||
ef_search=ef_search,
|
||||
score_threshold=score_threshold,
|
||||
l_search=l_search,
|
||||
with_embedding=with_embedding,
|
||||
)
|
||||
return docs
|
||||
|
@@ -8,7 +8,7 @@ from typing import Any, Generator, Optional, Union
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.embeddings import OpenAIEmbeddings
|
||||
from langchain_community.embeddings import AzureOpenAIEmbeddings
|
||||
from langchain_community.vectorstores.azure_cosmos_db import (
|
||||
AzureCosmosDBVectorSearch,
|
||||
CosmosDBSimilarityType,
|
||||
@@ -24,6 +24,7 @@ model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
|
||||
|
||||
INDEX_NAME = "langchain-test-index"
|
||||
INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw"
|
||||
INDEX_NAME_VECTOR_DISKANN = "langchain-test-index-diskann"
|
||||
NAMESPACE = "langchain_test_db.langchain_test_collection"
|
||||
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
|
||||
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
|
||||
@@ -36,6 +37,9 @@ m = 16
|
||||
ef_construction = 64
|
||||
ef_search = 40
|
||||
score_threshold = 0.1
|
||||
maxDegree = 50
|
||||
lBuild = 40
|
||||
lSearch = 100
|
||||
application_name = "LANGCHAIN_PYTHON"
|
||||
|
||||
|
||||
@@ -53,8 +57,9 @@ def collection() -> Any:
|
||||
|
||||
@pytest.fixture()
|
||||
def azure_openai_embeddings() -> Any:
|
||||
openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
|
||||
deployment=model_deployment, model=model_name, chunk_size=1
|
||||
openai_embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
|
||||
model=model_name,
|
||||
chunk_size=1,
|
||||
)
|
||||
return openai_embeddings
|
||||
|
||||
@@ -70,8 +75,12 @@ pytest tests/integration_tests/vectorstores/test_azure_cosmos_db.py
|
||||
class TestAzureCosmosDBVectorSearch:
|
||||
@classmethod
|
||||
def setup_class(cls) -> None:
|
||||
if not os.getenv("OPENAI_API_KEY"):
|
||||
raise ValueError("OPENAI_API_KEY environment variable is not set")
|
||||
if not os.getenv("AZURE_OPENAI_API_KEY"):
|
||||
raise ValueError("AZURE_OPENAI_API_KEY environment variable is not set")
|
||||
if not os.getenv("AZURE_OPENAI_ENDPOINT"):
|
||||
raise ValueError("AZURE_OPENAI_ENDPOINT environment variable is not set")
|
||||
if not os.getenv("AZURE_OPENAI_API_VERSION"):
|
||||
raise ValueError("AZURE_OPENAI_API_VERSION environment variable is not set")
|
||||
|
||||
# insure the test collection is empty
|
||||
collection = prepare_collection()
|
||||
@@ -95,7 +104,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
return "805.555.1212"
|
||||
|
||||
def test_from_documents_cosine_distance(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
documents = [
|
||||
@@ -135,7 +144,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_documents_inner_product(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
documents = [
|
||||
@@ -174,7 +183,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_cosine_distance(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -208,7 +217,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_cosine_distance(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -246,7 +255,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_delete_one(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -280,7 +289,6 @@ class TestAzureCosmosDBVectorSearch:
|
||||
assert output
|
||||
assert output[0].page_content == "What is a sandwich?"
|
||||
assert output[0].metadata["c"] == 1
|
||||
|
||||
first_document_id_object = output[0].metadata["_id"]
|
||||
first_document_id = str(first_document_id_object)
|
||||
|
||||
@@ -300,7 +308,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_delete_multiple(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -359,7 +367,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_inner_product(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -397,7 +405,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_euclidean_distance(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -435,7 +443,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_max_marginal_relevance_cosine_distance(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = ["foo", "foo", "fou", "foy"]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
@@ -453,7 +461,12 @@ class TestAzureCosmosDBVectorSearch:
|
||||
|
||||
query = "foo"
|
||||
output = vectorstore.max_marginal_relevance_search(
|
||||
query, k=10, kind=kind, lambda_mult=0.1, score_threshold=score_threshold
|
||||
query,
|
||||
k=10,
|
||||
kind=kind,
|
||||
lambda_mult=0.1,
|
||||
score_threshold=score_threshold,
|
||||
with_embedding=True,
|
||||
)
|
||||
|
||||
assert len(output) == len(texts)
|
||||
@@ -463,7 +476,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_max_marginal_relevance_inner_product(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = ["foo", "foo", "fou", "foy"]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
@@ -481,7 +494,12 @@ class TestAzureCosmosDBVectorSearch:
|
||||
|
||||
query = "foo"
|
||||
output = vectorstore.max_marginal_relevance_search(
|
||||
query, k=10, kind=kind, lambda_mult=0.1, score_threshold=score_threshold
|
||||
query,
|
||||
k=10,
|
||||
kind=kind,
|
||||
lambda_mult=0.1,
|
||||
score_threshold=score_threshold,
|
||||
with_embedding=True,
|
||||
)
|
||||
|
||||
assert len(output) == len(texts)
|
||||
@@ -495,7 +513,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
"""
|
||||
|
||||
def test_from_documents_cosine_distance_vector_hnsw(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
documents = [
|
||||
@@ -539,7 +557,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_documents_inner_product_vector_hnsw(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
documents = [
|
||||
@@ -583,7 +601,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_cosine_distance_vector_hnsw(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -622,7 +640,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_cosine_distance_vector_hnsw(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -665,7 +683,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_delete_one_vector_hnsw(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -724,7 +742,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_delete_multiple_vector_hnsw(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -788,7 +806,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_inner_product_vector_hnsw(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
@@ -831,7 +849,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_max_marginal_relevance_cosine_distance_vector_hnsw(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = ["foo", "foo", "fou", "foy"]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
@@ -859,6 +877,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
kind=CosmosDBVectorSearchType.VECTOR_HNSW,
|
||||
lambda_mult=0.1,
|
||||
score_threshold=score_threshold,
|
||||
with_embedding=True,
|
||||
)
|
||||
|
||||
assert len(output) == len(texts)
|
||||
@@ -868,7 +887,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_max_marginal_relevance_inner_product_vector_hnsw(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = ["foo", "foo", "fou", "foy"]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
@@ -896,6 +915,405 @@ class TestAzureCosmosDBVectorSearch:
|
||||
kind=CosmosDBVectorSearchType.VECTOR_HNSW,
|
||||
lambda_mult=0.1,
|
||||
score_threshold=score_threshold,
|
||||
with_embedding=True,
|
||||
)
|
||||
|
||||
assert len(output) == len(texts)
|
||||
assert output[0].page_content == "foo"
|
||||
assert output[1].page_content != "foo"
|
||||
|
||||
vectorstore.delete_index()
|
||||
|
||||
"""
|
||||
Test cases for the similarity algorithm using vector-diskann
|
||||
"""
|
||||
|
||||
def test_from_documents_cosine_distance_vector_diskann(
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
documents = [
|
||||
Document(page_content="Dogs are tough.", metadata={"a": 1}),
|
||||
Document(page_content="Cats have fluff.", metadata={"b": 1}),
|
||||
Document(page_content="What is a sandwich?", metadata={"c": 1}),
|
||||
Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
|
||||
]
|
||||
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_documents(
|
||||
documents,
|
||||
azure_openai_embeddings,
|
||||
collection=collection,
|
||||
index_name=INDEX_NAME_VECTOR_DISKANN,
|
||||
)
|
||||
sleep(1) # waits for Cosmos DB to save contents to the collection
|
||||
|
||||
# Create the DiskANN index that will be leveraged later for vector search
|
||||
vectorstore.create_index(
|
||||
dimensions=dimensions,
|
||||
similarity=similarity_algorithm,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
max_degree=maxDegree,
|
||||
l_build=lBuild,
|
||||
)
|
||||
sleep(2) # waits for the index to be set up
|
||||
|
||||
output = vectorstore.similarity_search(
|
||||
"Sandwich",
|
||||
k=1,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lSearch=lSearch,
|
||||
)
|
||||
|
||||
assert output
|
||||
assert output[0].page_content == "What is a sandwich?"
|
||||
assert output[0].metadata["c"] == 1
|
||||
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_documents_inner_product_vector_diskann(
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
"""Test end to end construction and search."""
|
||||
documents = [
|
||||
Document(page_content="Dogs are tough.", metadata={"a": 1}),
|
||||
Document(page_content="Cats have fluff.", metadata={"b": 1}),
|
||||
Document(page_content="What is a sandwich?", metadata={"c": 1}),
|
||||
Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
|
||||
]
|
||||
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_documents(
|
||||
documents,
|
||||
azure_openai_embeddings,
|
||||
collection=collection,
|
||||
index_name=INDEX_NAME_VECTOR_DISKANN,
|
||||
)
|
||||
sleep(1) # waits for Cosmos DB to save contents to the collection
|
||||
|
||||
# Create the DiskANN index that will be leveraged later for vector search
|
||||
vectorstore.create_index(
|
||||
dimensions=dimensions,
|
||||
similarity=similarity_algorithm,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
max_degree=maxDegree,
|
||||
l_build=lBuild,
|
||||
)
|
||||
sleep(2) # waits for the index to be set up
|
||||
|
||||
output = vectorstore.similarity_search(
|
||||
"Sandwich",
|
||||
k=1,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lSearch=lSearch,
|
||||
)
|
||||
|
||||
assert output
|
||||
assert output[0].page_content == "What is a sandwich?"
|
||||
assert output[0].metadata["c"] == 1
|
||||
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_cosine_distance_vector_diskann(
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
"Cats have fluff.",
|
||||
"What is a sandwich?",
|
||||
"That fence is purple.",
|
||||
]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
texts,
|
||||
azure_openai_embeddings,
|
||||
collection=collection,
|
||||
index_name=INDEX_NAME_VECTOR_DISKANN,
|
||||
)
|
||||
|
||||
# Create the DiskANN index that will be leveraged later for vector search
|
||||
vectorstore.create_index(
|
||||
dimensions=dimensions,
|
||||
similarity=similarity_algorithm,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
max_degree=maxDegree,
|
||||
l_build=lBuild,
|
||||
)
|
||||
sleep(2) # waits for the index to be set up
|
||||
|
||||
output = vectorstore.similarity_search(
|
||||
"Sandwich",
|
||||
k=1,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lSearch=lSearch,
|
||||
)
|
||||
|
||||
assert output[0].page_content == "What is a sandwich?"
|
||||
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_cosine_distance_vector_diskann(
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
"Cats have fluff.",
|
||||
"What is a sandwich?",
|
||||
"The fence is purple.",
|
||||
]
|
||||
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
texts,
|
||||
azure_openai_embeddings,
|
||||
metadatas=metadatas,
|
||||
collection=collection,
|
||||
index_name=INDEX_NAME_VECTOR_DISKANN,
|
||||
)
|
||||
|
||||
# Create the DiskANN index that will be leveraged later for vector search
|
||||
vectorstore.create_index(
|
||||
dimensions=dimensions,
|
||||
similarity=similarity_algorithm,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
max_degree=maxDegree,
|
||||
l_build=lBuild,
|
||||
)
|
||||
sleep(2) # waits for the index to be set up
|
||||
|
||||
output = vectorstore.similarity_search(
|
||||
"Sandwich",
|
||||
k=1,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lSearch=lSearch,
|
||||
)
|
||||
|
||||
assert output
|
||||
assert output[0].page_content == "What is a sandwich?"
|
||||
assert output[0].metadata["c"] == 1
|
||||
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_delete_one_vector_diskann(
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
"Cats have fluff.",
|
||||
"What is a sandwich?",
|
||||
"The fence is purple.",
|
||||
]
|
||||
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
texts,
|
||||
azure_openai_embeddings,
|
||||
metadatas=metadatas,
|
||||
collection=collection,
|
||||
index_name=INDEX_NAME_VECTOR_DISKANN,
|
||||
)
|
||||
|
||||
# Create the DiskANN index that will be leveraged later for vector search
|
||||
vectorstore.create_index(
|
||||
dimensions=dimensions,
|
||||
similarity=similarity_algorithm,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
max_degree=maxDegree,
|
||||
l_build=lBuild,
|
||||
)
|
||||
sleep(2) # waits for the index to be set up
|
||||
|
||||
output = vectorstore.similarity_search(
|
||||
"Sandwich",
|
||||
k=1,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lSearch=lSearch,
|
||||
)
|
||||
|
||||
assert output
|
||||
assert output[0].page_content == "What is a sandwich?"
|
||||
assert output[0].metadata["c"] == 1
|
||||
|
||||
first_document_id_object = output[0].metadata["_id"]
|
||||
first_document_id = str(first_document_id_object)
|
||||
|
||||
vectorstore.delete_document_by_id(first_document_id)
|
||||
sleep(2) # waits for the index to be updated
|
||||
|
||||
output2 = vectorstore.similarity_search(
|
||||
"Sandwich",
|
||||
k=1,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lSearch=lSearch,
|
||||
)
|
||||
assert output2
|
||||
assert output2[0].page_content != "What is a sandwich?"
|
||||
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_delete_multiple_vector_diskann(
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
"Cats have fluff.",
|
||||
"What is a sandwich?",
|
||||
"The fence is purple.",
|
||||
]
|
||||
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
texts,
|
||||
azure_openai_embeddings,
|
||||
metadatas=metadatas,
|
||||
collection=collection,
|
||||
index_name=INDEX_NAME_VECTOR_DISKANN,
|
||||
)
|
||||
|
||||
# Create the DiskANN index that will be leveraged later for vector search
|
||||
vectorstore.create_index(
|
||||
dimensions=dimensions,
|
||||
similarity=similarity_algorithm,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
max_degree=maxDegree,
|
||||
l_build=lBuild,
|
||||
)
|
||||
sleep(2) # waits for the index to be set up
|
||||
|
||||
output = vectorstore.similarity_search(
|
||||
"Sandwich",
|
||||
k=5,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lSearch=lSearch,
|
||||
)
|
||||
|
||||
first_document_id = str(output[0].metadata["_id"])
|
||||
|
||||
second_document_id = str(output[1].metadata["_id"])
|
||||
|
||||
third_document_id = str(output[2].metadata["_id"])
|
||||
|
||||
document_ids = [first_document_id, second_document_id, third_document_id]
|
||||
vectorstore.delete(document_ids)
|
||||
sleep(2) # waits for the index to be updated
|
||||
|
||||
output_2 = vectorstore.similarity_search(
|
||||
"Sandwich",
|
||||
k=5,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lSearch=lSearch,
|
||||
)
|
||||
assert output
|
||||
assert output_2
|
||||
|
||||
assert len(output) == 4 # we should see all the four documents
|
||||
assert (
|
||||
len(output_2) == 1
|
||||
) # we should see only one document left after three have been deleted
|
||||
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_inner_product_vector_diskann(
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = [
|
||||
"Dogs are tough.",
|
||||
"Cats have fluff.",
|
||||
"What is a sandwich?",
|
||||
"The fence is purple.",
|
||||
]
|
||||
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
texts,
|
||||
azure_openai_embeddings,
|
||||
metadatas=metadatas,
|
||||
collection=collection,
|
||||
index_name=INDEX_NAME_VECTOR_DISKANN,
|
||||
)
|
||||
|
||||
# Create the DiskANN index that will be leveraged later for vector search
|
||||
vectorstore.create_index(
|
||||
dimensions=dimensions,
|
||||
similarity=similarity_algorithm,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
max_degree=maxDegree,
|
||||
l_build=lBuild,
|
||||
)
|
||||
sleep(2) # waits for the index to be set up
|
||||
|
||||
output = vectorstore.similarity_search(
|
||||
"Sandwich",
|
||||
k=1,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lSearch=lSearch,
|
||||
)
|
||||
|
||||
assert output
|
||||
assert output[0].page_content == "What is a sandwich?"
|
||||
assert output[0].metadata["c"] == 1
|
||||
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_max_marginal_relevance_cosine_distance_vector_diskann(
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = ["foo", "foo", "fou", "foy"]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
texts,
|
||||
azure_openai_embeddings,
|
||||
collection=collection,
|
||||
index_name=INDEX_NAME_VECTOR_DISKANN,
|
||||
)
|
||||
|
||||
# Create the IVF index that will be leveraged later for vector search
|
||||
vectorstore.create_index(
|
||||
dimensions=dimensions,
|
||||
similarity=similarity_algorithm,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
max_degree=maxDegree,
|
||||
l_build=lBuild,
|
||||
)
|
||||
sleep(2) # waits for the index to be set up
|
||||
|
||||
query = "foo"
|
||||
output = vectorstore.max_marginal_relevance_search(
|
||||
query,
|
||||
k=10,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lambda_mult=0.1,
|
||||
lSearch=lSearch,
|
||||
with_embedding=True,
|
||||
)
|
||||
|
||||
assert len(output) == len(texts)
|
||||
assert output[0].page_content == "foo"
|
||||
assert output[1].page_content != "foo"
|
||||
|
||||
vectorstore.delete_index()
|
||||
|
||||
def test_max_marginal_relevance_inner_product_vector_diskann(
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
texts = ["foo", "foo", "fou", "foy"]
|
||||
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
||||
texts,
|
||||
azure_openai_embeddings,
|
||||
collection=collection,
|
||||
index_name=INDEX_NAME_VECTOR_DISKANN,
|
||||
)
|
||||
|
||||
# Create the DiskANN index that will be leveraged later for vector search
|
||||
vectorstore.create_index(
|
||||
dimensions=dimensions,
|
||||
similarity=similarity_algorithm,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
max_degree=maxDegree,
|
||||
l_build=lBuild,
|
||||
)
|
||||
sleep(2) # waits for the index to be set up
|
||||
|
||||
query = "foo"
|
||||
output = vectorstore.max_marginal_relevance_search(
|
||||
query,
|
||||
k=10,
|
||||
kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
|
||||
lambda_mult=0.1,
|
||||
lSearch=lSearch,
|
||||
with_embedding=True,
|
||||
)
|
||||
|
||||
assert len(output) == len(texts)
|
||||
@@ -906,7 +1324,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
|
||||
@staticmethod
|
||||
def invoke_delete_with_no_args(
|
||||
azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> Optional[bool]:
|
||||
vectorstore: AzureCosmosDBVectorSearch = (
|
||||
AzureCosmosDBVectorSearch.from_connection_string(
|
||||
@@ -922,7 +1340,7 @@ class TestAzureCosmosDBVectorSearch:
|
||||
|
||||
@staticmethod
|
||||
def invoke_delete_by_id_with_no_args(
|
||||
azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
vectorstore: AzureCosmosDBVectorSearch = (
|
||||
AzureCosmosDBVectorSearch.from_connection_string(
|
||||
@@ -937,14 +1355,14 @@ class TestAzureCosmosDBVectorSearch:
|
||||
vectorstore.delete_document_by_id()
|
||||
|
||||
def test_invalid_arguments_to_delete(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
with pytest.raises(ValueError) as exception_info:
|
||||
self.invoke_delete_with_no_args(azure_openai_embeddings, collection)
|
||||
assert str(exception_info.value) == "No document ids provided to delete."
|
||||
|
||||
def test_no_arguments_to_delete_by_id(
|
||||
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
||||
self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any
|
||||
) -> None:
|
||||
with pytest.raises(Exception) as exception_info:
|
||||
self.invoke_delete_by_id_with_no_args(
|
||||
|
Reference in New Issue
Block a user