From ffe6ca986ee5b439e85c82781c1d8ce3578a3e88 Mon Sep 17 00:00:00 2001 From: thedavgar <87140293+thedavgar@users.noreply.github.com> Date: Fri, 12 Jul 2024 03:32:19 +0200 Subject: [PATCH] community: Fix Bug in Azure Search Vectorstore search asynchronously (#24081) Thank you for contributing to LangChain! **Description**: This PR fixes the bug described in issue #24064, which occurs when using the AzureSearch Vectorstore with the asynchronous search methods, which are also the methods used by the retriever. The proposed change simply makes access to the embedding optional, because it is not used anywhere to retrieve documents. In fact, the synchronous retrieval methods do not use the embedding either. With this PR the code given by the user in the issue works. ```python vectorstore = AzureSearch( azure_search_endpoint=os.getenv("AI_SEARCH_ENDPOINT_SECRET"), azure_search_key=os.getenv("AI_SEARCH_API_KEY"), index_name=os.getenv("AI_SEARCH_INDEX_NAME_SECRET"), fields=fields, embedding_function=encoder, ) retriever = vectorstore.as_retriever(search_type="hybrid", k=2) await vectorstore.avector_search("what is the capital of France") await retriever.ainvoke("what is the capital of France") ``` **Issue**: The Azure Search Vectorstore is not working when searching for documents with asynchronous methods, as described in issue #24064. **Dependencies**: There are no extra dependencies required for this change. 
--------- Co-authored-by: isaac hershenson --- .../vectorstores/azuresearch.py | 139 ++++++++---------- 1 file changed, 65 insertions(+), 74 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azuresearch.py b/libs/community/langchain_community/vectorstores/azuresearch.py index 8f1946a7f50..65bdae78c12 100644 --- a/libs/community/langchain_community/vectorstores/azuresearch.py +++ b/libs/community/langchain_community/vectorstores/azuresearch.py @@ -327,6 +327,21 @@ class AzureSearch(VectorStore): cors_options=cors_options, additional_search_client_options=additional_search_client_options, ) + self.async_client = _get_search_client( + azure_search_endpoint, + azure_search_key, + index_name, + semantic_configuration_name=semantic_configuration_name, + fields=fields, + vector_search=vector_search, + semantic_configurations=semantic_configurations, + scoring_profiles=scoring_profiles, + default_scoring_profile=default_scoring_profile, + default_fields=default_fields, + user_agent=user_agent, + cors_options=cors_options, + async_=True, + ) self.search_type = search_type self.semantic_configuration_name = semantic_configuration_name self.fields = fields if fields else default_fields @@ -344,23 +359,6 @@ class AzureSearch(VectorStore): self._user_agent = user_agent self._cors_options = cors_options - def _async_client(self) -> AsyncSearchClient: - return _get_search_client( - self._azure_search_endpoint, - self._azure_search_key, - self._index_name, - semantic_configuration_name=self._semantic_configuration_name, - fields=self._fields, - vector_search=self._vector_search, - semantic_configurations=self._semantic_configurations, - scoring_profiles=self._scoring_profiles, - default_scoring_profile=self._default_scoring_profile, - default_fields=self._default_fields, - user_agent=self._user_agent, - cors_options=self._cors_options, - async_=True, - ) - @property def embeddings(self) -> Optional[Embeddings]: # TODO: Support embedding object directly @@ 
-519,7 +517,7 @@ class AzureSearch(VectorStore): ids.append(key) # Upload data in batches if len(data) == MAX_UPLOAD_BATCH_SIZE: - async with self._async_client() as async_client: + async with self.async_client as async_client: response = await async_client.upload_documents(documents=data) # Check if all documents were successfully uploaded if not all(r.succeeded for r in response): @@ -532,7 +530,7 @@ class AzureSearch(VectorStore): return ids # Upload data to index - async with self._async_client() as async_client: + async with self.async_client as async_client: response = await async_client.upload_documents(documents=data) # Check if all documents were successfully uploaded if all(r.succeeded for r in response): @@ -567,7 +565,7 @@ class AzureSearch(VectorStore): False otherwise. """ if ids: - async with self._async_client() as async_client: + async with self.async_client as async_client: res = await async_client.delete_documents([{"id": i} for i in ids]) return len(res) > 0 else: @@ -745,11 +743,11 @@ class AzureSearch(VectorStore): to the query and score for each """ embedding = await self._aembed_query(query) - docs, scores, _ = await self._asimple_search( + results = await self._asimple_search( embedding, "", k, filters=filters, **kwargs ) - return list(zip(docs, scores)) + return _results_to_documents(results) def max_marginal_relevance_search_with_score( self, @@ -813,14 +811,12 @@ class AzureSearch(VectorStore): to the query and score for each """ embedding = await self._aembed_query(query) - docs, scores, vectors = await self._asimple_search( + results = await self._asimple_search( embedding, "", fetch_k, filters=filters, **kwargs ) - return await self._areorder_results_with_maximal_marginal_relevance( - docs, - scores, - vectors, + return await _areorder_results_with_maximal_marginal_relevance( + results, query_embedding=np.array(embedding), lambda_mult=lambda_mult, k=k, @@ -896,11 +892,11 @@ class AzureSearch(VectorStore): """ embedding = await 
self._aembed_query(query) - docs, scores, _ = await self._asimple_search( + results = await self._asimple_search( embedding, query, k, filters=filters, **kwargs ) - return list(zip(docs, scores)) + return _results_to_documents(results) def hybrid_search_with_relevance_scores( self, @@ -998,14 +994,12 @@ class AzureSearch(VectorStore): """ embedding = await self._aembed_query(query) - docs, scores, vectors = await self._asimple_search( + results = await self._asimple_search( embedding, query, fetch_k, filters=filters, **kwargs ) - return await self._areorder_results_with_maximal_marginal_relevance( - docs, - scores, - vectors, + return await _areorder_results_with_maximal_marginal_relevance( + results, query_embedding=np.array(embedding), lambda_mult=lambda_mult, k=k, @@ -1055,7 +1049,7 @@ class AzureSearch(VectorStore): *, filters: Optional[str] = None, **kwargs: Any, - ) -> Tuple[List[Document], List[float], List[List[float]]]: + ) -> SearchItemPaged[dict]: """Perform vector or hybrid search in the Azure search index. 
Args: @@ -1069,8 +1063,8 @@ class AzureSearch(VectorStore): """ from azure.search.documents.models import VectorizedQuery - async with self._async_client() as async_client: - results = await async_client.search( + async with self.async_client as async_client: + return await async_client.search( search_text=text_query, vector_queries=[ VectorizedQuery( @@ -1083,18 +1077,6 @@ class AzureSearch(VectorStore): top=k, **kwargs, ) - docs = [ - ( - _result_to_document(result), - float(result["@search.score"]), - result[FIELDS_CONTENT_VECTOR], - ) - async for result in results - ] - if not docs: - raise ValueError(f"No {docs=}") - documents, scores, vectors = map(list, zip(*docs)) - return documents, scores, vectors def semantic_hybrid_search( self, query: str, k: int = 4, **kwargs: Any @@ -1306,7 +1288,7 @@ class AzureSearch(VectorStore): from azure.search.documents.models import VectorizedQuery vector = await self._aembed_query(query) - async with self._async_client() as async_client: + async with self.async_client as async_client: results = await async_client.search( search_text=query, vector_queries=[ @@ -1481,30 +1463,6 @@ class AzureSearch(VectorStore): azure_search.add_embeddings(text_embeddings, metadatas, **kwargs) return azure_search - async def _areorder_results_with_maximal_marginal_relevance( - self, - documents: List[Document], - scores: List[float], - vectors: List[List[float]], - query_embedding: np.ndarray, - lambda_mult: float = 0.5, - k: int = 4, - ) -> List[Tuple[Document, float]]: - # Get the new order of results. - new_ordering = maximal_marginal_relevance( - query_embedding, vectors, k=k, lambda_mult=lambda_mult - ) - - # Reorder the values and return. 
- ret: List[Tuple[Document, float]] = [] - for x in new_ordering: - # Function can return -1 index - if x == -1: - break - ret.append((documents[x], scores[x])) # type: ignore - - return ret - def as_retriever(self, **kwargs: Any) -> AzureSearchVectorStoreRetriever: # type: ignore """Return AzureSearchVectorStoreRetriever initialized from this VectorStore. @@ -1672,6 +1630,39 @@ def _results_to_documents( return docs +async def _areorder_results_with_maximal_marginal_relevance( + results: SearchItemPaged[Dict], + query_embedding: np.ndarray, + lambda_mult: float = 0.5, + k: int = 4, +) -> List[Tuple[Document, float]]: + # Convert results to Document objects + docs = [ + ( + _result_to_document(result), + float(result["@search.score"]), + result[FIELDS_CONTENT_VECTOR], + ) + for result in results + ] + documents, scores, vectors = map(list, zip(*docs)) + + # Get the new order of results. + new_ordering = maximal_marginal_relevance( + query_embedding, vectors, k=k, lambda_mult=lambda_mult + ) + + # Reorder the values and return. + ret: List[Tuple[Document, float]] = [] + for x in new_ordering: + # Function can return -1 index + if x == -1: + break + ret.append((documents[x], scores[x])) # type: ignore + + return ret + + def _reorder_results_with_maximal_marginal_relevance( results: SearchItemPaged[Dict], query_embedding: np.ndarray,