community: Fix bug in Azure Search Vectorstore asynchronous search (#24081)

Thank you for contributing to LangChain!

**Description**:
This PR fixes the bug described in issue #24064: the AzureSearch Vectorstore fails when searching with the asynchronous methods, which are also the methods used by the retriever. The proposed change simply makes access to the embedding (content vector) field optional, since it is not used anywhere to retrieve documents; the synchronous retrieval methods do not use the embedding either.
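
To make the "optional" part concrete, here is a minimal sketch using hypothetical result dicts (not the library's actual code): plain retrieval only needs the document content and the search score, so a result that lacks the content-vector field no longer breaks the conversion.

```python
from langchain_core.documents import Document

# Hypothetical search hits: neither carries a content-vector field, which is
# fine because plain (non-MMR) retrieval never reads it after this fix.
hits = [
    {"content": "Paris is the capital of France.", "@search.score": 0.92},
    {"content": "France is in Western Europe.", "@search.score": 0.85},
]

# Build (Document, score) pairs from content and score only.
docs_with_scores = [
    (Document(page_content=hit["content"]), float(hit["@search.score"]))
    for hit in hits
]
print(docs_with_scores)
```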

With this PR, the code given by the user in the issue works:

```python
import os

from langchain_community.vectorstores.azuresearch import AzureSearch

# `fields` and `encoder` (the embedding function) are defined as in the issue;
# the `await` calls below run inside an async context (e.g. a notebook).
vectorstore = AzureSearch(
    azure_search_endpoint=os.getenv("AI_SEARCH_ENDPOINT_SECRET"),
    azure_search_key=os.getenv("AI_SEARCH_API_KEY"),
    index_name=os.getenv("AI_SEARCH_INDEX_NAME_SECRET"),
    fields=fields,
    embedding_function=encoder,
)

retriever = vectorstore.as_retriever(search_type="hybrid", k=2)

await vectorstore.avector_search("what is the capital of France")
await retriever.ainvoke("what is the capital of France")
```

**Issue**:
The Azure Search Vectorstore does not work when searching for documents with the asynchronous methods, as described in issue #24064.

**Dependencies**:
There are no extra dependencies required for this change.

---------

Co-authored-by: isaac hershenson <ihershenson@hmc.edu>

```diff
@@ -327,6 +327,21 @@ class AzureSearch(VectorStore):
             cors_options=cors_options,
             additional_search_client_options=additional_search_client_options,
         )
+        self.async_client = _get_search_client(
+            azure_search_endpoint,
+            azure_search_key,
+            index_name,
+            semantic_configuration_name=semantic_configuration_name,
+            fields=fields,
+            vector_search=vector_search,
+            semantic_configurations=semantic_configurations,
+            scoring_profiles=scoring_profiles,
+            default_scoring_profile=default_scoring_profile,
+            default_fields=default_fields,
+            user_agent=user_agent,
+            cors_options=cors_options,
+            async_=True,
+        )
         self.search_type = search_type
         self.semantic_configuration_name = semantic_configuration_name
         self.fields = fields if fields else default_fields
@@ -344,23 +359,6 @@ class AzureSearch(VectorStore):
         self._user_agent = user_agent
         self._cors_options = cors_options
 
-    def _async_client(self) -> AsyncSearchClient:
-        return _get_search_client(
-            self._azure_search_endpoint,
-            self._azure_search_key,
-            self._index_name,
-            semantic_configuration_name=self._semantic_configuration_name,
-            fields=self._fields,
-            vector_search=self._vector_search,
-            semantic_configurations=self._semantic_configurations,
-            scoring_profiles=self._scoring_profiles,
-            default_scoring_profile=self._default_scoring_profile,
-            default_fields=self._default_fields,
-            user_agent=self._user_agent,
-            cors_options=self._cors_options,
-            async_=True,
-        )
-
     @property
     def embeddings(self) -> Optional[Embeddings]:
         # TODO: Support embedding object directly
@@ -519,7 +517,7 @@ class AzureSearch(VectorStore):
                 ids.append(key)
             # Upload data in batches
             if len(data) == MAX_UPLOAD_BATCH_SIZE:
-                async with self._async_client() as async_client:
+                async with self.async_client as async_client:
                     response = await async_client.upload_documents(documents=data)
                 # Check if all documents were successfully uploaded
                 if not all(r.succeeded for r in response):
@@ -532,7 +530,7 @@ class AzureSearch(VectorStore):
             return ids
 
         # Upload data to index
-        async with self._async_client() as async_client:
+        async with self.async_client as async_client:
             response = await async_client.upload_documents(documents=data)
         # Check if all documents were successfully uploaded
         if all(r.succeeded for r in response):
@@ -567,7 +565,7 @@ class AzureSearch(VectorStore):
             False otherwise.
         """
         if ids:
-            async with self._async_client() as async_client:
+            async with self.async_client as async_client:
                 res = await async_client.delete_documents([{"id": i} for i in ids])
                 return len(res) > 0
         else:
@@ -745,11 +743,11 @@ class AzureSearch(VectorStore):
                 to the query and score for each
         """
         embedding = await self._aembed_query(query)
-        docs, scores, _ = await self._asimple_search(
+        results = await self._asimple_search(
             embedding, "", k, filters=filters, **kwargs
         )
-        return list(zip(docs, scores))
+        return _results_to_documents(results)
 
     def max_marginal_relevance_search_with_score(
         self,
@@ -813,14 +811,12 @@ class AzureSearch(VectorStore):
                 to the query and score for each
         """
         embedding = await self._aembed_query(query)
-        docs, scores, vectors = await self._asimple_search(
+        results = await self._asimple_search(
             embedding, "", fetch_k, filters=filters, **kwargs
         )
-        return await self._areorder_results_with_maximal_marginal_relevance(
-            docs,
-            scores,
-            vectors,
+        return await _areorder_results_with_maximal_marginal_relevance(
+            results,
             query_embedding=np.array(embedding),
             lambda_mult=lambda_mult,
             k=k,
@@ -896,11 +892,11 @@ class AzureSearch(VectorStore):
         """
         embedding = await self._aembed_query(query)
-        docs, scores, _ = await self._asimple_search(
+        results = await self._asimple_search(
             embedding, query, k, filters=filters, **kwargs
         )
-        return list(zip(docs, scores))
+        return _results_to_documents(results)
 
     def hybrid_search_with_relevance_scores(
         self,
@@ -998,14 +994,12 @@ class AzureSearch(VectorStore):
         """
         embedding = await self._aembed_query(query)
-        docs, scores, vectors = await self._asimple_search(
+        results = await self._asimple_search(
             embedding, query, fetch_k, filters=filters, **kwargs
         )
-        return await self._areorder_results_with_maximal_marginal_relevance(
-            docs,
-            scores,
-            vectors,
+        return await _areorder_results_with_maximal_marginal_relevance(
+            results,
             query_embedding=np.array(embedding),
             lambda_mult=lambda_mult,
             k=k,
@@ -1055,7 +1049,7 @@ class AzureSearch(VectorStore):
         *,
         filters: Optional[str] = None,
         **kwargs: Any,
-    ) -> Tuple[List[Document], List[float], List[List[float]]]:
+    ) -> SearchItemPaged[dict]:
         """Perform vector or hybrid search in the Azure search index.
 
         Args:
@@ -1069,8 +1063,8 @@ class AzureSearch(VectorStore):
         """
         from azure.search.documents.models import VectorizedQuery
 
-        async with self._async_client() as async_client:
-            results = await async_client.search(
+        async with self.async_client as async_client:
+            return await async_client.search(
                 search_text=text_query,
                 vector_queries=[
                     VectorizedQuery(
@@ -1083,18 +1077,6 @@ class AzureSearch(VectorStore):
                 top=k,
                 **kwargs,
             )
-            docs = [
-                (
-                    _result_to_document(result),
-                    float(result["@search.score"]),
-                    result[FIELDS_CONTENT_VECTOR],
-                )
-                async for result in results
-            ]
-            if not docs:
-                raise ValueError(f"No {docs=}")
-            documents, scores, vectors = map(list, zip(*docs))
-            return documents, scores, vectors
 
     def semantic_hybrid_search(
         self, query: str, k: int = 4, **kwargs: Any
@@ -1306,7 +1288,7 @@ class AzureSearch(VectorStore):
         from azure.search.documents.models import VectorizedQuery
 
         vector = await self._aembed_query(query)
-        async with self._async_client() as async_client:
+        async with self.async_client as async_client:
             results = await async_client.search(
                 search_text=query,
                 vector_queries=[
@@ -1481,30 +1463,6 @@ class AzureSearch(VectorStore):
             azure_search.add_embeddings(text_embeddings, metadatas, **kwargs)
         return azure_search
 
-    async def _areorder_results_with_maximal_marginal_relevance(
-        self,
-        documents: List[Document],
-        scores: List[float],
-        vectors: List[List[float]],
-        query_embedding: np.ndarray,
-        lambda_mult: float = 0.5,
-        k: int = 4,
-    ) -> List[Tuple[Document, float]]:
-        # Get the new order of results.
-        new_ordering = maximal_marginal_relevance(
-            query_embedding, vectors, k=k, lambda_mult=lambda_mult
-        )
-        # Reorder the values and return.
-        ret: List[Tuple[Document, float]] = []
-        for x in new_ordering:
-            # Function can return -1 index
-            if x == -1:
-                break
-            ret.append((documents[x], scores[x]))  # type: ignore
-        return ret
-
     def as_retriever(self, **kwargs: Any) -> AzureSearchVectorStoreRetriever:  # type: ignore
         """Return AzureSearchVectorStoreRetriever initialized from this VectorStore.
@@ -1672,6 +1630,39 @@ def _results_to_documents(
     return docs
 
 
+async def _areorder_results_with_maximal_marginal_relevance(
+    results: SearchItemPaged[Dict],
+    query_embedding: np.ndarray,
+    lambda_mult: float = 0.5,
+    k: int = 4,
+) -> List[Tuple[Document, float]]:
+    # Convert results to Document objects
+    docs = [
+        (
+            _result_to_document(result),
+            float(result["@search.score"]),
+            result[FIELDS_CONTENT_VECTOR],
+        )
+        for result in results
+    ]
+    documents, scores, vectors = map(list, zip(*docs))
+
+    # Get the new order of results.
+    new_ordering = maximal_marginal_relevance(
+        query_embedding, vectors, k=k, lambda_mult=lambda_mult
+    )
+    # Reorder the values and return.
+    ret: List[Tuple[Document, float]] = []
+    for x in new_ordering:
+        # Function can return -1 index
+        if x == -1:
+            break
+        ret.append((documents[x], scores[x]))  # type: ignore
+    return ret
+
+
 def _reorder_results_with_maximal_marginal_relevance(
     results: SearchItemPaged[Dict],
     query_embedding: np.ndarray,
```