community : [bugfix] Use document ids as keys in AzureSearch vectorstore (#25486)

# Description [Vector store base class](4cdaca67dc/libs/core/langchain_core/vectorstores/base.py (L65)) currently expects `ids` to be passed in and that is what it passes along to the AzureSearch vector store when attempting to `add_texts()`. However AzureSearch expects `keys` to be passed in. When they are not present, AzureSearch `add_embeddings()` makes up new uuids. This is a problem when trying to run indexing. [Indexing code expects](b297af5482/libs/core/langchain_core/indexing/api.py (L371)) the documents to be uploaded using provided ids. Currently AzureSearch ignores `ids` passed from `indexing` and makes up new ones. Later when `indexer` attempts to delete removed file, it uses the `id` it had stored when uploading the document, however it was uploaded under different `id`. **Twitter handle: @martintriska1**
2025-09-16 15:04:13 +00:00 · 2024-09-19 15:37:18 +02:00
parent a8561bc303
commit 3fc0ea510e
2 changed files with 56 additions and 3 deletions
--- a/libs/community/langchain_community/vectorstores/azuresearch.py
+++ b/libs/community/langchain_community/vectorstores/azuresearch.py
@@ -443,6 +443,12 @@ class AzureSearch(VectorStore):
            logger.debug("Nothing to insert, skipping.")
            return []

+        # when `keys` are not passed in and there is `ids` in kwargs, use those instead
+        # base class expects `ids` passed in rather than `keys`
+        # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65
+        if (not keys) and ("ids" in kwargs) and (len(kwargs["ids"]) == len(embeddings)):
+            keys = kwargs["ids"]
+
        return self.add_embeddings(zip(texts, embeddings), metadatas, keys=keys)

    async def aadd_texts(
@@ -467,6 +473,12 @@ class AzureSearch(VectorStore):
            logger.debug("Nothing to insert, skipping.")
            return []

+        # when `keys` are not passed in and there is `ids` in kwargs, use those instead
+        # base class expects `ids` passed in rather than `keys`
+        # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65
+        if (not keys) and ("ids" in kwargs) and (len(kwargs["ids"]) == len(embeddings)):
+            keys = kwargs["ids"]
+
        return await self.aadd_embeddings(zip(texts, embeddings), metadatas, keys=keys)

    def add_embeddings(
@@ -483,9 +495,13 @@ class AzureSearch(VectorStore):
        data = []
        for i, (text, embedding) in enumerate(text_embeddings):
            # Use provided key otherwise use default key
-            key = keys[i] if keys else str(uuid.uuid4())
-            # Encoding key for Azure Search valid characters
-            key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii")
+            if keys:
+                key = keys[i]
+            else:
+                key = str(uuid.uuid4())
+                # Encoding key for Azure Search valid characters
+                key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii")
+
            metadata = metadatas[i] if metadatas else {}
            # Add data to index
            # Additional metadata to fields mapping