core[minor]: add upsert, streaming_upsert, aupsert, astreaming_upsert methods to the VectorStore abstraction (#23774)

This PR rolls out part of the new proposed interface for vectorstores
(https://github.com/langchain-ai/langchain/pull/23544) to existing store
implementations.

The PR makes the following changes:

1. Adds standard upsert, streaming_upsert, aupsert, astreaming_upsert
methods to the vectorstore.
2. Updates `add_texts` and `aadd_texts` to be non required with a
default implementation that delegates to `upsert` and `aupsert` if those
have been implemented. The original `add_texts` and `aadd_texts` methods
are problematic as they spread object specific information across
document and **kwargs. (e.g., ids are not a part of the document)
3. Adds a default implementation to `add_documents` and `aadd_documents`
that delegates to `upsert` and `aupsert` respectively.
4. Adds standard unit tests to verify that a given vectorstore
implements a correct read/write API.

A downside of this implementation is that it creates `upsert` with a
very similar signature to `add_documents`.
The reason for introducing `upsert` is to:
* Remove any ambiguities about what information is allowed in `kwargs`.
Specifically, kwargs should only be used for information common to all
indexed data. (e.g., indexing timeout).
* Allow inheriting from an anticipated generalized interface for indexing
that will allow indexing `BaseMedia` (i.e., allow making a vectorstore
for images/audio etc.)
 
`add_documents` can be deprecated in the future in favor of `upsert` to
make sure that users have a single correct way of indexing content.

---------

Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
Eugene Yurtsev
2024-07-05 12:21:40 -04:00
committed by GitHub
parent 3c752238c5
commit 6f08e11d7c
14 changed files with 667 additions and 83 deletions

View File

@@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tupl
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.indexing import UpsertResponse
from langchain_core.load import dumpd, load
from langchain_core.vectorstores import VectorStore
@@ -37,27 +38,41 @@ class InMemoryVectorStore(VectorStore):
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
self.delete(ids)
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[Sequence[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Add texts to the store."""
vectors = self.embedding.embed_documents(list(texts))
ids_ = []
for i, text in enumerate(texts):
doc_id = ids[i] if ids else str(uuid.uuid4())
ids_.append(doc_id)
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
vectors = self.embedding.embed_documents([item.page_content for item in items])
ids = []
for item, vector in zip(items, vectors):
doc_id = item.id if item.id else str(uuid.uuid4())
ids.append(doc_id)
self.store[doc_id] = {
"id": doc_id,
"vector": vectors[i],
"text": text,
"metadata": metadatas[i] if metadatas else {},
"vector": vector,
"text": item.page_content,
"metadata": item.metadata,
}
return ids_
return {
"succeeded": ids,
"failed": [],
}
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
    """Return the stored documents for *ids*, skipping any unknown ids.

    The result preserves the order of ``ids``; ids with no matching
    record in ``self.store`` are silently omitted.
    """
    # Look up each id; keep only truthy records (missing ids yield None).
    return [
        Document(
            id=record["id"],
            page_content=record["text"],
            metadata=record["metadata"],
        )
        for record in (self.store.get(doc_id) for doc_id in ids)
        if record
    ]
async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
    """Async counterpart of ``get_by_ids``; delegates to the sync version."""
    documents = self.get_by_ids(ids)
    return documents
async def aadd_texts(
self,
@@ -80,7 +95,9 @@ class InMemoryVectorStore(VectorStore):
similarity = float(cosine_similarity([embedding], [vector]).item(0))
result.append(
(
Document(page_content=doc["text"], metadata=doc["metadata"]),
Document(
id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
),
similarity,
vector,
)

View File

@@ -1053,7 +1053,7 @@ class Milvus(VectorStore):
pks = [item.get(self._primary_field) for item in query_result]
return pks
def upsert(
def upsert( # type: ignore[override]
self,
ids: Optional[List[str]] = None,
documents: List[Document] | None = None,

View File

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import Any
import pytest
from langchain_core.documents import Document
@@ -13,6 +14,11 @@ from tests.integration_tests.vectorstores.fake_embeddings import (
)
class AnyStr(str):
    """A ``str`` subclass that compares equal to any string.

    Test helper: lets assertions match fields that are *some* string
    (e.g. auto-generated uuids) without pinning the exact value.
    """

    def __eq__(self, other: Any) -> bool:
        # Equal to anything that is a string, regardless of content.
        return isinstance(other, str)

    # Defining __eq__ on a subclass implicitly sets __hash__ to None,
    # which would make AnyStr unhashable; restore str's hash so AnyStr
    # instances stay usable in sets and as dict keys.
    __hash__ = str.__hash__
class TestInMemoryReadWriteTestSuite(ReadWriteTestSuite):
@pytest.fixture
def vectorstore(self) -> InMemoryVectorStore:
@@ -31,10 +37,13 @@ async def test_inmemory() -> None:
["foo", "bar", "baz"], ConsistentFakeEmbeddings()
)
output = await store.asimilarity_search("foo", k=1)
assert output == [Document(page_content="foo")]
assert output == [Document(page_content="foo", id=AnyStr())]
output = await store.asimilarity_search("bar", k=2)
assert output == [Document(page_content="bar"), Document(page_content="baz")]
assert output == [
Document(page_content="bar", id=AnyStr()),
Document(page_content="baz", id=AnyStr()),
]
output2 = await store.asimilarity_search_with_score("bar", k=2)
assert output2[0][1] > output2[1][1]
@@ -61,8 +70,8 @@ async def test_inmemory_mmr() -> None:
"foo", k=10, lambda_mult=0.1
)
assert len(output) == len(texts)
assert output[0] == Document(page_content="foo")
assert output[1] == Document(page_content="foy")
assert output[0] == Document(page_content="foo", id=AnyStr())
assert output[1] == Document(page_content="foy", id=AnyStr())
async def test_inmemory_dump_load(tmp_path: Path) -> None:
@@ -90,4 +99,4 @@ async def test_inmemory_filter() -> None:
output = await store.asimilarity_search(
"baz", filter=lambda doc: doc.metadata["id"] == 1
)
assert output == [Document(page_content="foo", metadata={"id": 1})]
assert output == [Document(page_content="foo", metadata={"id": 1}, id=AnyStr())]