mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-07 12:06:43 +00:00
LangChain Community: VectorStores: Azure Cosmos DB Filtered Vector Search (#24087)
Thank you for contributing to LangChain!

- This PR adds vector search filtering for Azure Cosmos DB Mongo vCore and NoSQL.

- [ ] **PR message**: ***Delete this entire checklist*** and replace with
    - **Description:** a description of the change
    - **Issue:** the issue # it fixes, if applicable
    - **Dependencies:** any dependencies required for this change
    - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out!

- [ ] **Add tests and docs**: If you're adding a new integration, please include
    1. a test for the integration, preferably unit tests that do not rely on network access,
    2. an example notebook showing its use. It lives in the `docs/docs/integrations` directory.

- [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in langchain.

If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
This commit is contained in:
parent
ac41c97d21
commit
0f45ac4088
@ -306,6 +306,27 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
}
|
||||
return command
|
||||
|
||||
def create_filter_index(
    self,
    property_to_filter: str,
    index_name: str,
) -> dict[str, Any]:
    """Issue a ``createIndexes`` command for an ascending single-property index.

    Args:
        property_to_filter: Name of the document property used as the index
            key (indexed with direction ``1``).
        index_name: Name assigned to the new index.

    Returns:
        The response returned by the database for the ``createIndexes``
        command.
    """
    command = {
        "createIndexes": self._collection.name,
        "indexes": [
            {
                "key": {property_to_filter: 1},
                "name": index_name,
            }
        ],
    }

    # The command is run on the database object that owns the collection,
    # not on the collection itself.
    current_database = self._collection.database
    create_index_responses: dict[str, Any] = current_database.command(command)
    return create_index_responses
|
||||
|
||||
def add_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
@ -345,7 +366,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
# Embed and create the documents
|
||||
embeddings = self._embedding.embed_documents(texts)
|
||||
to_insert = [
|
||||
{self._text_key: t, self._embedding_key: embedding, **m}
|
||||
{self._text_key: t, self._embedding_key: embedding, "metadata": m}
|
||||
for t, m, embedding in zip(texts, metadatas, embeddings)
|
||||
]
|
||||
# insert the documents in Cosmos DB
|
||||
@ -397,8 +418,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
embeddings: List[float],
|
||||
k: int = 4,
|
||||
kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
ef_search: int = 40,
|
||||
score_threshold: float = 0.0,
|
||||
with_embedding: bool = False,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Returns a list of documents with their scores
|
||||
|
||||
@ -422,9 +445,11 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
"""
|
||||
pipeline: List[dict[str, Any]] = []
|
||||
if kind == CosmosDBVectorSearchType.VECTOR_IVF:
|
||||
pipeline = self._get_pipeline_vector_ivf(embeddings, k)
|
||||
pipeline = self._get_pipeline_vector_ivf(embeddings, k, pre_filter)
|
||||
elif kind == CosmosDBVectorSearchType.VECTOR_HNSW:
|
||||
pipeline = self._get_pipeline_vector_hnsw(embeddings, k, ef_search)
|
||||
pipeline = self._get_pipeline_vector_hnsw(
|
||||
embeddings, k, ef_search, pre_filter
|
||||
)
|
||||
|
||||
cursor = self._collection.aggregate(pipeline)
|
||||
|
||||
@ -433,28 +458,32 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
score = res.pop("similarityScore")
|
||||
if score < score_threshold:
|
||||
continue
|
||||
document_object_field = (
|
||||
res.pop("document")
|
||||
if kind == CosmosDBVectorSearchType.VECTOR_IVF
|
||||
else res
|
||||
)
|
||||
document_object_field = res.pop("document")
|
||||
text = document_object_field.pop(self._text_key)
|
||||
docs.append(
|
||||
(Document(page_content=text, metadata=document_object_field), score)
|
||||
)
|
||||
metadata = document_object_field.pop("metadata")
|
||||
if with_embedding:
|
||||
metadata[self._embedding_key] = document_object_field.pop(
|
||||
self._embedding_key
|
||||
)
|
||||
|
||||
docs.append((Document(page_content=text, metadata=metadata), score))
|
||||
return docs
|
||||
|
||||
def _get_pipeline_vector_ivf(
|
||||
self, embeddings: List[float], k: int = 4
|
||||
self, embeddings: List[float], k: int = 4, pre_filter: Optional[Dict] = None
|
||||
) -> List[dict[str, Any]]:
|
||||
params = {
|
||||
"vector": embeddings,
|
||||
"path": self._embedding_key,
|
||||
"k": k,
|
||||
}
|
||||
if pre_filter:
|
||||
params["filter"] = pre_filter
|
||||
|
||||
pipeline: List[dict[str, Any]] = [
|
||||
{
|
||||
"$search": {
|
||||
"cosmosSearch": {
|
||||
"vector": embeddings,
|
||||
"path": self._embedding_key,
|
||||
"k": k,
|
||||
},
|
||||
"cosmosSearch": params,
|
||||
"returnStoredSource": True,
|
||||
}
|
||||
},
|
||||
@ -468,17 +497,25 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
return pipeline
|
||||
|
||||
def _get_pipeline_vector_hnsw(
|
||||
self, embeddings: List[float], k: int = 4, ef_search: int = 40
|
||||
self,
|
||||
embeddings: List[float],
|
||||
k: int = 4,
|
||||
ef_search: int = 40,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
) -> List[dict[str, Any]]:
|
||||
params = {
|
||||
"vector": embeddings,
|
||||
"path": self._embedding_key,
|
||||
"k": k,
|
||||
"efSearch": ef_search,
|
||||
}
|
||||
if pre_filter:
|
||||
params["filter"] = pre_filter
|
||||
|
||||
pipeline: List[dict[str, Any]] = [
|
||||
{
|
||||
"$search": {
|
||||
"cosmosSearch": {
|
||||
"vector": embeddings,
|
||||
"path": self._embedding_key,
|
||||
"k": k,
|
||||
"efSearch": ef_search,
|
||||
},
|
||||
"cosmosSearch": params,
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -495,16 +532,20 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
query: str,
|
||||
k: int = 4,
|
||||
kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
ef_search: int = 40,
|
||||
score_threshold: float = 0.0,
|
||||
with_embedding: bool = False,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
embeddings = self._embedding.embed_query(query)
|
||||
docs = self._similarity_search_with_score(
|
||||
embeddings=embeddings,
|
||||
k=k,
|
||||
kind=kind,
|
||||
pre_filter=pre_filter,
|
||||
ef_search=ef_search,
|
||||
score_threshold=score_threshold,
|
||||
with_embedding=with_embedding,
|
||||
)
|
||||
return docs
|
||||
|
||||
@ -513,16 +554,20 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
query: str,
|
||||
k: int = 4,
|
||||
kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
ef_search: int = 40,
|
||||
score_threshold: float = 0.0,
|
||||
with_embedding: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
docs_and_scores = self.similarity_search_with_score(
|
||||
query,
|
||||
k=k,
|
||||
kind=kind,
|
||||
pre_filter=pre_filter,
|
||||
ef_search=ef_search,
|
||||
score_threshold=score_threshold,
|
||||
with_embedding=with_embedding,
|
||||
)
|
||||
return [doc for doc, _ in docs_and_scores]
|
||||
|
||||
@ -533,8 +578,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
ef_search: int = 40,
|
||||
score_threshold: float = 0.0,
|
||||
with_embedding: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
# Retrieves the docs with similarity scores
|
||||
@ -543,8 +590,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
embedding,
|
||||
k=fetch_k,
|
||||
kind=kind,
|
||||
pre_filter=pre_filter,
|
||||
ef_search=ef_search,
|
||||
score_threshold=score_threshold,
|
||||
with_embedding=with_embedding,
|
||||
)
|
||||
|
||||
# Re-ranks the docs using MMR
|
||||
@ -564,8 +613,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
ef_search: int = 40,
|
||||
score_threshold: float = 0.0,
|
||||
with_embedding: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
# compute the embeddings vector from the query string
|
||||
@ -577,8 +628,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
|
||||
fetch_k=fetch_k,
|
||||
lambda_mult=lambda_mult,
|
||||
kind=kind,
|
||||
pre_filter=pre_filter,
|
||||
ef_search=ef_search,
|
||||
score_threshold=score_threshold,
|
||||
with_embedding=with_embedding,
|
||||
)
|
||||
return docs
|
||||
|
||||
|
@ -162,7 +162,12 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
|
||||
text_key = "text"
|
||||
|
||||
to_insert = [
|
||||
{"id": str(uuid.uuid4()), text_key: t, self._embedding_key: embedding, **m}
|
||||
{
|
||||
"id": str(uuid.uuid4()),
|
||||
text_key: t,
|
||||
self._embedding_key: embedding,
|
||||
"metadata": m,
|
||||
}
|
||||
for t, m, embedding in zip(texts, metadatas, embeddings)
|
||||
]
|
||||
# insert the documents in CosmosDB No Sql
|
||||
@ -184,6 +189,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
|
||||
cosmos_database_properties: Dict[str, Any],
|
||||
database_name: str = "vectorSearchDB",
|
||||
container_name: str = "vectorSearchContainer",
|
||||
create_container: bool = True,
|
||||
**kwargs: Any,
|
||||
) -> AzureCosmosDBNoSqlVectorSearch:
|
||||
if kwargs:
|
||||
@ -204,6 +210,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
|
||||
cosmos_database_properties=cosmos_database_properties,
|
||||
database_name=database_name,
|
||||
container_name=container_name,
|
||||
create_container=create_container,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -257,41 +264,83 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
|
||||
self,
|
||||
embeddings: List[float],
|
||||
k: int = 4,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
with_embedding: bool = False,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
query = (
|
||||
"SELECT TOP {} c.id, c.{}, c.text, VectorDistance(c.{}, {}) AS "
|
||||
"SimilarityScore FROM c ORDER BY VectorDistance(c.{}, {})".format(
|
||||
k,
|
||||
self._embedding_key,
|
||||
self._embedding_key,
|
||||
embeddings,
|
||||
self._embedding_key,
|
||||
embeddings,
|
||||
)
|
||||
query = "SELECT "
|
||||
|
||||
# If limit_offset_clause is not specified, add TOP clause
|
||||
if pre_filter is None or pre_filter.get("limit_offset_clause") is None:
|
||||
query += "TOP @limit "
|
||||
|
||||
query += (
|
||||
"c.id, c.{}, c.text, c.metadata, "
|
||||
"VectorDistance(c.@embeddingKey, @embeddings) AS SimilarityScore FROM c"
|
||||
)
|
||||
|
||||
# Add where_clause if specified
|
||||
if pre_filter is not None and pre_filter.get("where_clause") is not None:
|
||||
query += " {}".format(pre_filter["where_clause"])
|
||||
|
||||
query += " ORDER BY VectorDistance(c.@embeddingKey, @embeddings)"
|
||||
|
||||
# Add limit_offset_clause if specified
|
||||
if pre_filter is not None and pre_filter.get("limit_offset_clause") is not None:
|
||||
query += " {}".format(pre_filter["limit_offset_clause"])
|
||||
parameters = [
|
||||
{"name": "@limit", "value": k},
|
||||
{"name": "@embeddingKey", "value": self._embedding_key},
|
||||
{"name": "@embeddings", "value": embeddings},
|
||||
]
|
||||
|
||||
docs_and_scores = []
|
||||
|
||||
items = list(
|
||||
self._container.query_items(query=query, enable_cross_partition_query=True)
|
||||
self._container.query_items(
|
||||
query=query, parameters=parameters, enable_cross_partition_query=True
|
||||
)
|
||||
)
|
||||
for item in items:
|
||||
text = item["text"]
|
||||
metadata = item["metadata"]
|
||||
score = item["SimilarityScore"]
|
||||
docs_and_scores.append((Document(page_content=text, metadata=item), score))
|
||||
if with_embedding:
|
||||
metadata[self._embedding_key] = item[self._embedding_key]
|
||||
docs_and_scores.append(
|
||||
(Document(page_content=text, metadata=metadata), score)
|
||||
)
|
||||
return docs_and_scores
|
||||
|
||||
def similarity_search_with_score(
    self,
    query: str,
    k: int = 4,
    pre_filter: Optional[Dict] = None,
    with_embedding: bool = False,
) -> List[Tuple[Document, float]]:
    """Embed ``query`` and return the matching documents with their scores.

    Args:
        query: Text to embed with the store's embedding model.
        k: Number of results requested from the underlying search.
        pre_filter: Optional filter specification forwarded unchanged to
            ``_similarity_search_with_score``.
        with_embedding: Forwarded unchanged to
            ``_similarity_search_with_score``.

    Returns:
        Whatever ``_similarity_search_with_score`` returns: a list of
        ``(document, score)`` pairs.
    """
    embeddings = self._embedding.embed_query(query)
    # Query the store exactly once, with the filter options applied; an
    # additional unfiltered call would only be discarded.
    return self._similarity_search_with_score(
        embeddings=embeddings,
        k=k,
        pre_filter=pre_filter,
        with_embedding=with_embedding,
    )
|
||||
|
||||
def similarity_search(
    self,
    query: str,
    k: int = 4,
    pre_filter: Optional[Dict] = None,
    with_embedding: bool = False,
    **kwargs: Any,
) -> List[Document]:
    """Return only the documents from ``similarity_search_with_score``.

    Args:
        query: Text to search for.
        k: Number of results requested.
        pre_filter: Optional filter specification, forwarded unchanged.
        with_embedding: Forwarded unchanged to
            ``similarity_search_with_score``.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The documents of the scored results, in the same order, with the
        scores dropped.
    """
    docs_and_scores = self.similarity_search_with_score(
        query,
        k=k,
        pre_filter=pre_filter,
        with_embedding=with_embedding,
    )
    return [doc for doc, _ in docs_and_scores]
|
||||
|
||||
@ -304,7 +353,18 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
# Retrieves the docs with similarity scores
|
||||
docs = self._similarity_search_with_score(embeddings=embedding, k=fetch_k)
|
||||
pre_filter = {}
|
||||
with_embedding = False
|
||||
if kwargs["pre_filter"]:
|
||||
pre_filter = kwargs["pre_filter"]
|
||||
if kwargs["with_embedding"]:
|
||||
with_embedding = kwargs["with_embedding"]
|
||||
docs = self._similarity_search_with_score(
|
||||
embeddings=embedding,
|
||||
k=fetch_k,
|
||||
pre_filter=pre_filter,
|
||||
with_embedding=with_embedding,
|
||||
)
|
||||
|
||||
# Re-ranks the docs using MMR
|
||||
mmr_doc_indexes = maximal_marginal_relevance(
|
||||
@ -326,6 +386,12 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
# compute the embeddings vector from the query string
|
||||
pre_filter = {}
|
||||
with_embedding = False
|
||||
if kwargs["pre_filter"]:
|
||||
pre_filter = kwargs["pre_filter"]
|
||||
if kwargs["with_embedding"]:
|
||||
with_embedding = kwargs["with_embedding"]
|
||||
embeddings = self._embedding.embed_query(query)
|
||||
|
||||
docs = self.max_marginal_relevance_search_by_vector(
|
||||
@ -333,5 +399,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
|
||||
k=k,
|
||||
fetch_k=fetch_k,
|
||||
lambda_mult=lambda_mult,
|
||||
pre_filter=pre_filter,
|
||||
with_embedding=with_embedding,
|
||||
)
|
||||
return docs
|
||||
|
@ -25,7 +25,7 @@ model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
|
||||
INDEX_NAME = "langchain-test-index"
INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw"
NAMESPACE = "langchain_test_db.langchain_test_collection"
# The connection string is read from the environment only. Never commit a
# literal URI here: it embeds the cluster user name and password.
# NOTE(review): a credential was previously committed on this line; it must be
# treated as compromised and rotated.
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")

num_lists = 3
|
||||
|
@ -104,6 +104,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
|
||||
),
|
||||
indexing_policy=get_vector_indexing_policy("flat"),
|
||||
cosmos_container_properties={"partition_key": partition_key},
|
||||
cosmos_database_properties={},
|
||||
)
|
||||
sleep(1) # waits for Cosmos DB to save contents to the collection
|
||||
|
||||
@ -139,6 +140,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
|
||||
),
|
||||
indexing_policy=get_vector_indexing_policy("flat"),
|
||||
cosmos_container_properties={"partition_key": partition_key},
|
||||
cosmos_database_properties={},
|
||||
)
|
||||
sleep(1) # waits for Cosmos DB to save contents to the collection
|
||||
|
||||
@ -154,3 +156,60 @@ class TestAzureCosmosDBNoSqlVectorSearch:
|
||||
assert output2
|
||||
assert output2[0].page_content != "Dogs are tough."
|
||||
safe_delete_database(cosmos_client)
|
||||
|
||||
def test_from_documents_cosine_distance_with_filtering(
    self,
    cosmos_client: Any,
    partition_key: Any,
    azure_openai_embeddings: OpenAIEmbeddings,
) -> None:
    """Test end to end construction and search, with and without pre_filter.

    Three searches are run against the same four documents: unfiltered,
    filtered by a WHERE clause, and filtered by a WHERE clause plus an
    OFFSET/LIMIT clause.
    """
    documents = [
        Document(page_content="Dogs are tough.", metadata={"a": 1}),
        Document(page_content="Cats have fluff.", metadata={"a": 1}),
        Document(page_content="What is a sandwich?", metadata={"c": 1}),
        Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
    ]

    try:
        store = AzureCosmosDBNoSqlVectorSearch.from_documents(
            documents,
            azure_openai_embeddings,
            cosmos_client=cosmos_client,
            database_name=database_name,
            container_name=container_name,
            vector_embedding_policy=get_vector_embedding_policy(
                "cosine", "float32", 400
            ),
            indexing_policy=get_vector_indexing_policy("flat"),
            cosmos_container_properties={"partition_key": partition_key},
            cosmos_database_properties={},
        )
        sleep(1)  # waits for Cosmos DB to save contents to the collection

        # Unfiltered: all four documents come back.
        output = store.similarity_search("Dogs", k=4)
        assert len(output) == 4
        assert output[0].page_content == "Dogs are tough."
        assert output[0].metadata["a"] == 1

        # WHERE clause only: the two documents whose metadata has a == 1.
        pre_filter = {
            "where_clause": "WHERE c.metadata.a=1",
        }
        output = store.similarity_search(
            "Dogs", k=4, pre_filter=pre_filter, with_embedding=True
        )
        assert len(output) == 2
        assert output[0].page_content == "Dogs are tough."
        assert output[0].metadata["a"] == 1

        # WHERE plus OFFSET/LIMIT: the limit clause caps the result at one.
        pre_filter = {
            "where_clause": "WHERE c.metadata.a=1",
            "limit_offset_clause": "OFFSET 0 LIMIT 1",
        }
        output = store.similarity_search("Dogs", k=4, pre_filter=pre_filter)
        assert len(output) == 1
        assert output[0].page_content == "Dogs are tough."
        assert output[0].metadata["a"] == 1
    finally:
        # Clean up even when an assertion fails, so a failed run does not
        # leave the test database behind for the next test.
        # NOTE(review): assumes safe_delete_database tolerates a database
        # that was never created - confirm against the helper.
        safe_delete_database(cosmos_client)
|
||||
|
Loading…
Reference in New Issue
Block a user