core[minor]: add upsert, streaming_upsert, aupsert, astreaming_upsert methods to the VectorStore abstraction (#23774)

This PR rolls out part of the new proposed interface for vectorstores (https://github.com/langchain-ai/langchain/pull/23544) to existing store implementations. The PR makes the following changes: 1. Adds standard upsert, streaming_upsert, aupsert, astreaming_upsert methods to the vectorstore. 2. Updates `add_texts` and `aadd_texts` to be non required with a default implementation that delegates to `upsert` and `aupsert` if those have been implemented. The original `add_texts` and `aadd_texts` methods are problematic as they spread object specific information across document and **kwargs. (e.g., ids are not a part of the document) 3. Adds a default implementation to `add_documents` and `aadd_documents` that delegates to `upsert` and `aupsert` respectively. 4. Adds standard unit tests to verify that a given vectorstore implements a correct read/write API. A downside of this implementation is that it creates `upsert` with a very similar signature to `add_documents`. The reason for introducing `upsert` is to: * Remove any ambiguities about what information is allowed in `kwargs`. Specifically kwargs should only be used for information common to all indexed data. (e.g., indexing timeout). *Allow inheriting from an anticipated generalized interface for indexing that will allow indexing `BaseMedia` (i.e., allow making a vectorstore for images/audio etc.) `add_documents` can be deprecated in the future in favor of `upsert` to make sure that users have a single correct way of indexing content. --------- Co-authored-by: ccurme <chester.curme@gmail.com>
2025-06-21 14:18:52 +00:00 · 2024-07-05 12:21:40 -04:00 · 2024-07-05 12:21:40 -04:00 · 6f08e11d7c
commit 6f08e11d7c
parent 3c752238c5
14 changed files with 667 additions and 83 deletions
--- a/libs/community/langchain_community/vectorstores/inmemory.py
+++ b/libs/community/langchain_community/vectorstores/inmemory.py
@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tupl
 import numpy as np
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_core.indexing import UpsertResponse
 from langchain_core.load import dumpd, load
 from langchain_core.vectorstores import VectorStore
@ -37,27 +38,41 @@ class InMemoryVectorStore(VectorStore):
    async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        self.delete(ids)
-    def add_texts(
+    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
-        self,
+        vectors = self.embedding.embed_documents([item.page_content for item in items])
-        texts: Iterable[str],
+        ids = []
-        metadatas: Optional[List[dict]] = None,
+        for item, vector in zip(items, vectors):
-        ids: Optional[Sequence[str]] = None,
+            doc_id = item.id if item.id else str(uuid.uuid4())
-        **kwargs: Any,
+            ids.append(doc_id)
    ) -> List[str]:
        """Add texts to the store."""
        vectors = self.embedding.embed_documents(list(texts))
        ids_ = []
        for i, text in enumerate(texts):
            doc_id = ids[i] if ids else str(uuid.uuid4())
            ids_.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
-                "vector": vectors[i],
+                "vector": vector,
-                "text": text,
+                "text": item.page_content,
-                "metadata": metadatas[i] if metadatas else {},
+                "metadata": item.metadata,
            }
-        return ids_
+        return {
            "succeeded": ids,
            "failed": [],
        }
    def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Get documents by their ids."""
        documents = []
        for doc_id in ids:
            doc = self.store.get(doc_id)
            if doc:
                documents.append(
                    Document(
                        id=doc["id"],
                        page_content=doc["text"],
                        metadata=doc["metadata"],
                    )
                )
        return documents
    async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        return self.get_by_ids(ids)
    async def aadd_texts(
        self,
@ -80,7 +95,9 @@ class InMemoryVectorStore(VectorStore):
            similarity = float(cosine_similarity([embedding], [vector]).item(0))
            result.append(
                (
-                    Document(page_content=doc["text"], metadata=doc["metadata"]),
+                    Document(
                        id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
                    ),
                    similarity,
                    vector,
                )
--- a/libs/community/langchain_community/vectorstores/milvus.py
+++ b/libs/community/langchain_community/vectorstores/milvus.py
@ -1053,7 +1053,7 @@ class Milvus(VectorStore):
        pks = [item.get(self._primary_field) for item in query_result]
        return pks
-    def upsert(
+    def upsert(  # type: ignore[override]
        self,
        ids: Optional[List[str]] = None,
        documents: List[Document] | None = None,
--- a/libs/community/tests/unit_tests/vectorstores/test_inmemory.py
+++ b/libs/community/tests/unit_tests/vectorstores/test_inmemory.py
@ -1,4 +1,5 @@
 from pathlib import Path
 from typing import Any
 import pytest
 from langchain_core.documents import Document
@ -13,6 +14,11 @@ from tests.integration_tests.vectorstores.fake_embeddings import (
 )
 class AnyStr(str):
    def __eq__(self, other: Any) -> bool:
        return isinstance(other, str)
 class TestInMemoryReadWriteTestSuite(ReadWriteTestSuite):
    @pytest.fixture
    def vectorstore(self) -> InMemoryVectorStore:
@ -31,10 +37,13 @@ async def test_inmemory() -> None:
        ["foo", "bar", "baz"], ConsistentFakeEmbeddings()
    )
    output = await store.asimilarity_search("foo", k=1)
-    assert output == [Document(page_content="foo")]
+    assert output == [Document(page_content="foo", id=AnyStr())]
    output = await store.asimilarity_search("bar", k=2)
-    assert output == [Document(page_content="bar"), Document(page_content="baz")]
+    assert output == [
        Document(page_content="bar", id=AnyStr()),
        Document(page_content="baz", id=AnyStr()),
    ]
    output2 = await store.asimilarity_search_with_score("bar", k=2)
    assert output2[0][1] > output2[1][1]
@ -61,8 +70,8 @@ async def test_inmemory_mmr() -> None:
        "foo", k=10, lambda_mult=0.1
    )
    assert len(output) == len(texts)
-    assert output[0] == Document(page_content="foo")
+    assert output[0] == Document(page_content="foo", id=AnyStr())
-    assert output[1] == Document(page_content="foy")
+    assert output[1] == Document(page_content="foy", id=AnyStr())
 async def test_inmemory_dump_load(tmp_path: Path) -> None:
@ -90,4 +99,4 @@ async def test_inmemory_filter() -> None:
    output = await store.asimilarity_search(
        "baz", filter=lambda doc: doc.metadata["id"] == 1
    )
-    assert output == [Document(page_content="foo", metadata={"id": 1})]
+    assert output == [Document(page_content="foo", metadata={"id": 1}, id=AnyStr())]
--- a/libs/core/langchain_core/indexing/init.py
+++ b/libs/core/langchain_core/indexing/init.py
@ -6,7 +6,11 @@ if it's unchanged.
 """
 from langchain_core.indexing.api import IndexingResult, aindex, index
-from langchain_core.indexing.base import InMemoryRecordManager, RecordManager
+from langchain_core.indexing.base import (
    InMemoryRecordManager,
    RecordManager,
    UpsertResponse,
 )
 __all__ = [
    "aindex",
@ -14,4 +18,5 @@ __all__ = [
    "IndexingResult",
    "InMemoryRecordManager",
    "RecordManager",
    "UpsertResponse",
 ]
--- a/libs/core/langchain_core/indexing/base.py
+++ b/libs/core/langchain_core/indexing/base.py
@ -421,3 +421,16 @@ class InMemoryRecordManager(RecordManager):
            keys: A list of keys to delete.
        """
        self.delete_keys(keys)
 class UpsertResponse(TypedDict):
    """A generic response for upsert operations.
    The upsert response will be used by abstractions that implement an upsert
    operation for content that can be upserted by ID.
    """
    succeeded: List[str]
    """The IDs that were successfully indexed."""
    failed: List[str]
    """The IDs that failed to index."""
--- a/libs/core/langchain_core/utils/init.py
+++ b/libs/core/langchain_core/utils/init.py
@ -5,6 +5,7 @@ These functions do not depend on any other LangChain module.
 """
 from langchain_core.utils import image
 from langchain_core.utils.aiter import abatch_iterate
 from langchain_core.utils.env import get_from_dict_or_env, get_from_env
 from langchain_core.utils.formatting import StrictFormatter, formatter
 from langchain_core.utils.input import (
@ -13,6 +14,7 @@ from langchain_core.utils.input import (
    get_colored_text,
    print_text,
 )
 from langchain_core.utils.iter import batch_iterate
 from langchain_core.utils.loading import try_load_from_hub
 from langchain_core.utils.strings import comma_list, stringify_dict, stringify_value
 from langchain_core.utils.utils import (
@ -48,4 +50,6 @@ __all__ = [
    "stringify_dict",
    "comma_list",
    "stringify_value",
    "batch_iterate",
    "abatch_iterate",
 ]
--- a/libs/core/langchain_core/utils/aiter.py
+++ b/libs/core/langchain_core/utils/aiter.py
@ -11,6 +11,7 @@ from typing import (
    Any,
    AsyncContextManager,
    AsyncGenerator,
    AsyncIterable,
    AsyncIterator,
    Awaitable,
    Callable,
@ -245,3 +246,28 @@ class aclosing(AbstractAsyncContextManager):
    ) -> None:
        if hasattr(self.thing, "aclose"):
            await self.thing.aclose()
 async def abatch_iterate(
    size: int, iterable: AsyncIterable[T]
 ) -> AsyncIterator[List[T]]:
    """Utility batching function for async iterables.
    Args:
        size: The size of the batch.
        iterable: The async iterable to batch.
    Returns:
        An async iterator over the batches
    """
    batch: List[T] = []
    async for element in iterable:
        if len(batch) < size:
            batch.append(element)
        if len(batch) >= size:
            yield batch
            batch = []
    if batch:
        yield batch
--- a/libs/core/langchain_core/vectorstores.py
+++ b/libs/core/langchain_core/vectorstores.py
@ -25,26 +25,34 @@ import logging
 import math
 import warnings
 from abc import ABC, abstractmethod
 from itertools import cycle
 from typing import (
    TYPE_CHECKING,
    Any,
    AsyncIterable,
    AsyncIterator,
    Callable,
    ClassVar,
    Collection,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Tuple,
    Type,
    TypeVar,
    Union,
 )
 from langchain_core._api import beta
 from langchain_core.embeddings import Embeddings
 from langchain_core.pydantic_v1 import Field, root_validator
 from langchain_core.retrievers import BaseRetriever
 from langchain_core.runnables.config import run_in_executor
 from langchain_core.utils.aiter import abatch_iterate
 from langchain_core.utils.iter import batch_iterate
 if TYPE_CHECKING:
    from langchain_core.callbacks.manager import (
@ -52,6 +60,7 @@ if TYPE_CHECKING:
        CallbackManagerForRetrieverRun,
    )
    from langchain_core.documents import Document
    from langchain_core.indexing.base import UpsertResponse
 logger = logging.getLogger(__name__)
@ -61,11 +70,14 @@ VST = TypeVar("VST", bound="VectorStore")
 class VectorStore(ABC):
    """Interface for vector store."""
    @abstractmethod
    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        # One of the kwargs should be `ids` which is a list of ids
        # associated with the texts.
        # This is not yet enforced in the type signature for backwards compatibility
        # with existing implementations.
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.
@ -74,16 +86,205 @@ class VectorStore(ABC):
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            **kwargs: vectorstore specific parameters.
                One of the kwargs should be `ids` which is a list of ids
                associated with the texts.
        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        if type(self).upsert != VectorStore.upsert:
            # Import document in local scope to avoid circular imports
            from langchain_core.documents import Document
            # This condition is triggered if the subclass has provided
            # an implementation of the upsert method.
            # The existing add_texts
            texts_: Sequence[str] = (
                texts if isinstance(texts, (list, tuple)) else list(texts)
            )
            if metadatas and len(metadatas) != len(texts_):
                raise ValueError(
                    "The number of metadatas must match the number of texts."
                    "Got {len(metadatas)} metadatas and {len(texts_)} texts."
                )
            if "ids" in kwargs:
                ids = kwargs.pop("ids")
                if ids and len(ids) != len(texts_):
                    raise ValueError(
                        "The number of ids must match the number of texts."
                        "Got {len(ids)} ids and {len(texts_)} texts."
                    )
            else:
                ids = None
            metadatas_ = iter(metadatas) if metadatas else cycle([{}])
            ids_: Iterable[Union[str, None]] = ids if ids is not None else cycle([None])
            docs = [
                Document(page_content=text, metadata=metadata_, id=id_)
                for text, metadata_, id_ in zip(texts, metadatas_, ids_)
            ]
            upsert_response = self.upsert(docs, **kwargs)
            return upsert_response["succeeded"]
        raise NotImplementedError(
            f"`add_texts` has not been implemented for {self.__class__.__name__} "
        )
    # Developer guidelines:
    # Do not override streaming_upsert!
    @beta(message="Added in 0.2.11. The API is subject to change.")
    def streaming_upsert(
        self, items: Iterable[Document], /, batch_size: int, **kwargs: Any
    ) -> Iterator[UpsertResponse]:
        """Upsert documents in a streaming fashion.
        Args:
            items: Iterable of Documents to add to the vectorstore.
            batch_size: The size of each batch to upsert.
            **kwargs: Additional keyword arguments.
                kwargs should only include parameters that are common to all
                documents. (e.g., timeout for indexing, retry policy, etc.)
                kwargs should not include ids to avoid ambiguous semantics.
                Instead the ID should be provided as part of the Document object.
        .. versionadded:: 0.2.11
        """
        # The default implementation of this method breaks the input into
        # batches of size `batch_size` and calls the `upsert` method on each batch.
        # Subclasses can override this method to provide a more efficient
        # implementation.
        for item_batch in batch_iterate(batch_size, items):
            yield self.upsert(item_batch, **kwargs)
    # Please note that we've added a new method `upsert` instead of re-using the
    # existing `add_documents` method.
    # This was done to resolve potential ambiguities around the behavior of **kwargs
    # in existing add_documents / add_texts methods which could include per document
    # information (e.g., the `ids` parameter).
    # Over time the `add_documents` could be denoted as legacy and deprecated
    # in favor of the `upsert` method.
    @beta(message="Added in 0.2.11. The API is subject to change.")
    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
        """Add or update documents in the vectorstore.
        The upsert functionality should utilize the ID field of the Document object
        if it is provided. If the ID is not provided, the upsert method is free
        to generate an ID for the document.
        When an ID is specified and the document already exists in the vectorstore,
        the upsert method should update the document with the new data. If the document
        does not exist, the upsert method should add the document to the vectorstore.
        Args:
            items: Sequence of Documents to add to the vectorstore.
            **kwargs: Additional keyword arguments.
        Returns:
            UpsertResponse: A response object that contains the list of IDs that were
            successfully added or updated in the vectorstore and the list of IDs that
            failed to be added or updated.
        .. versionadded:: 0.2.11
        """
        #  Developer guidelines:
        #
        #  Vectorstores implementations are free to extend `upsert` implementation
        #  to take in additional data per document.
        #
        #  This data **SHOULD NOT** be part of the **kwargs** parameter, instead
        #  sub-classes can use a Union type on `documents` to include additional
        #  supported formats for the input data stream.
        #
        #  For example,
        #
        #  .. code-block:: python
        #  from typing import TypedDict
        #
        #  class DocumentWithVector(TypedDict):
        #      document: Document
        #      vector: List[float]
        #
        #  def upsert(
        #          self,
        #          documents: Union[Iterable[Document], Iterable[DocumentWithVector]],
        #          /,
        #          **kwargs
        #  ) -> UpsertResponse:
        #      \"\"\"Add or update documents in the vectorstore.\"\"\"
        #      # Implementation should check if documents is an
        #      # iterable of DocumentWithVector or Document
        #      pass
        #
        #  Implementations that override upsert should include a new doc-string
        #  that explains the semantics of upsert and includes in code
        #  examples of how to insert using the alternate data formats.
        # The implementation does not delegate to the `add_texts` method or
        # the `add_documents` method by default since those implementations
        raise NotImplementedError(
            f"upsert has not been implemented for {self.__class__.__name__}"
        )
    @beta(message="Added in 0.2.11. The API is subject to change.")
    async def astreaming_upsert(
        self,
        items: AsyncIterable[Document],
        /,
        batch_size: int,
        **kwargs: Any,
    ) -> AsyncIterator[UpsertResponse]:
        """Upsert documents in a streaming fashion. Async version of streaming_upsert.
        Args:
            items: Iterable of Documents to add to the vectorstore.
            batch_size: The size of each batch to upsert.
            **kwargs: Additional keyword arguments.
                kwargs should only include parameters that are common to all
                documents. (e.g., timeout for indexing, retry policy, etc.)
                kwargs should not include ids to avoid ambiguous semantics.
                Instead the ID should be provided as part of the Document object.
        .. versionadded:: 0.2.11
        """
        async for batch in abatch_iterate(batch_size, items):
            yield await self.aupsert(batch, **kwargs)
    @beta(message="Added in 0.2.11. The API is subject to change.")
    async def aupsert(
        self, items: Sequence[Document], /, **kwargs: Any
    ) -> UpsertResponse:
        """Add or update documents in the vectorstore. Async version of upsert.
        The upsert functionality should utilize the ID field of the Document object
        if it is provided. If the ID is not provided, the upsert method is free
        to generate an ID for the document.
        When an ID is specified and the document already exists in the vectorstore,
        the upsert method should update the document with the new data. If the document
        does not exist, the upsert method should add the document to the vectorstore.
        Args:
            items: Sequence of Documents to add to the vectorstore.
            **kwargs: Additional keyword arguments.
        Returns:
            UpsertResponse: A response object that contains the list of IDs that were
            successfully added or updated in the vectorstore and the list of IDs that
            failed to be added or updated.
        .. versionadded:: 0.2.11
        """
        #  Developer guidelines: See guidelines for the `upsert` method.
        # The implementation does not delegate to the `add_texts` method or
        # the `add_documents` method by default since those implementations
        return await run_in_executor(None, self.upsert, items, **kwargs)
    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Access the query embedding object if available."""
        logger.debug(
-            f"{Embeddings.__name__} is not implemented for {self.__class__.__name__}"
+            f"The embeddings property has not been "
            f"implemented for {self.__class__.__name__}"
        )
        return None
@ -187,17 +388,81 @@ class VectorStore(ABC):
        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        if type(self).aupsert != VectorStore.aupsert:
            # Import document in local scope to avoid circular imports
            from langchain_core.documents import Document
            # This condition is triggered if the subclass has provided
            # an implementation of the upsert method.
            # The existing add_texts
            texts_: Sequence[str] = (
                texts if isinstance(texts, (list, tuple)) else list(texts)
            )
            if metadatas and len(metadatas) != len(texts_):
                raise ValueError(
                    "The number of metadatas must match the number of texts."
                    "Got {len(metadatas)} metadatas and {len(texts_)} texts."
                )
            if "ids" in kwargs:
                ids = kwargs.pop("ids")
                if ids and len(ids) != len(texts_):
                    raise ValueError(
                        "The number of ids must match the number of texts."
                        "Got {len(ids)} ids and {len(texts_)} texts."
                    )
            else:
                ids = None
            metadatas_ = iter(metadatas) if metadatas else cycle([{}])
            ids_: Iterable[Union[str, None]] = ids if ids is not None else cycle([None])
            docs = [
                Document(page_content=text, metadata=metadata_, id=id_)
                for text, metadata_, id_ in zip(texts, metadatas_, ids_)
            ]
            upsert_response = await self.aupsert(docs, **kwargs)
            return upsert_response["succeeded"]
        return await run_in_executor(None, self.add_texts, texts, metadatas, **kwargs)
    def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
-        """Run more documents through the embeddings and add to the vectorstore.
+        """Add or update documents in the vectorstore.
        Args:
            documents: Documents to add to the vectorstore.
            kwargs: Additional keyword arguments.
                if kwargs contains ids and documents contain ids,
                the ids in the kwargs will receive precedence.
        Returns:
            List of IDs of the added texts.
        """
        if type(self).upsert != VectorStore.upsert:
            from langchain_core.documents import Document
            if "ids" in kwargs:
                ids = kwargs.pop("ids")
                if ids and len(ids) != len(documents):
                    raise ValueError(
                        "The number of ids must match the number of documents. "
                        "Got {len(ids)} ids and {len(documents)} documents."
                    )
                documents_ = []
                for id_, document in zip(ids, documents):
                    doc_with_id = Document(
                        page_content=document.page_content,
                        metadata=document.metadata,
                        id=id_,
                    )
                    documents_.append(doc_with_id)
            else:
                documents_ = documents
            # If upsert has been implemented, we can use it to add documents
            return self.upsert(documents_, **kwargs)["succeeded"]
        # Code path that delegates to add_text for backwards compatibility
        # TODO: Handle the case where the user doesn't provide ids on the Collection
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
@ -214,6 +479,38 @@ class VectorStore(ABC):
        Returns:
            List of IDs of the added texts.
        """
        # If either upsert or aupsert has been implemented, we delegate to them!
        if (
            type(self).aupsert != VectorStore.aupsert
            or type(self).upsert != VectorStore.upsert
        ):
            # If aupsert has been implemented, we can use it to add documents
            from langchain_core.documents import Document
            if "ids" in kwargs:
                ids = kwargs.pop("ids")
                if ids and len(ids) != len(documents):
                    raise ValueError(
                        "The number of ids must match the number of documents."
                        "Got {len(ids)} ids and {len(documents)} documents."
                    )
                documents_ = []
                for id_, document in zip(ids, documents):
                    doc_with_id = Document(
                        page_content=document.page_content,
                        metadata=document.metadata,
                        id=id_,
                    )
                    documents_.append(doc_with_id)
            else:
                documents_ = documents
            # If upsert has been implemented, we can use it to add documents
            upsert_response = await self.aupsert(documents_, **kwargs)
            return upsert_response["succeeded"]
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        return await self.aadd_texts(texts, metadatas, **kwargs)
--- a/libs/core/tests/unit_tests/indexing/test_public_api.py
+++ b/libs/core/tests/unit_tests/indexing/test_public_api.py
@ -10,4 +10,5 @@ def test_all() -> None:
        "IndexingResult",
        "InMemoryRecordManager",
        "RecordManager",
        "UpsertResponse",
    ]
--- a/libs/core/tests/unit_tests/utils/test_aiter.py
+++ b/libs/core/tests/unit_tests/utils/test_aiter.py
@ -0,0 +1,31 @@
 from typing import AsyncIterator, List
 import pytest
 from langchain_core.utils.aiter import abatch_iterate
@pytest.mark.parametrize(
    "input_size, input_iterable, expected_output",
    [
        (2, [1, 2, 3, 4, 5], [[1, 2], [3, 4], [5]]),
        (3, [10, 20, 30, 40, 50], [[10, 20, 30], [40, 50]]),
        (1, [100, 200, 300], [[100], [200], [300]]),
        (4, [], []),
    ],
 )
 async def test_abatch_iterate(
    input_size: int, input_iterable: List[str], expected_output: List[str]
 ) -> None:
    """Test batching function."""
    async def _to_async_iterable(iterable: List[str]) -> AsyncIterator[str]:
        for item in iterable:
            yield item
    iterator_ = abatch_iterate(input_size, _to_async_iterable(input_iterable))
    assert isinstance(iterator_, AsyncIterator)
    output = [el async for el in iterator_]
    assert output == expected_output
--- a/libs/core/tests/unit_tests/utils/test_imports.py
+++ b/libs/core/tests/unit_tests/utils/test_imports.py
@ -6,6 +6,8 @@ EXPECTED_ALL = [
    "convert_to_secret_str",
    "formatter",
    "get_bolded_text",
    "abatch_iterate",
    "batch_iterate",
    "get_color_mapping",
    "get_colored_text",
    "get_pydantic_field_names",
--- a/libs/core/tests/unit_tests/vectorstores/init.py
+++ b/libs/core/tests/unit_tests/vectorstores/init.py
--- a/libs/core/tests/unit_tests/vectorstores/test_vectorstore.py
+++ b/libs/core/tests/unit_tests/vectorstores/test_vectorstore.py
@ -0,0 +1,194 @@
 from __future__ import annotations
 import uuid
 from typing import Any, Dict, List, Optional, Sequence, Union
 from typing_extensions import TypedDict
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_core.indexing.base import UpsertResponse
 from langchain_core.vectorstores import VectorStore
 def test_custom_upsert_type() -> None:
    """Test that we can override the signature of the upsert method
    of the VectorStore class without creating typing issues by violating
    the Liskov Substitution Principle.
    """
    class ByVector(TypedDict):
        document: Document
        vector: List[float]
    class CustomVectorStore(VectorStore):
        def upsert(
            # This unit test verifies that the signature of the upsert method
            # specifically the items parameter can be overridden without
            # violating the Liskov Substitution Principle (and getting
            # typing errors).
            self,
            items: Union[Sequence[Document], Sequence[ByVector]],
            /,
            **kwargs: Any,
        ) -> UpsertResponse:
            raise NotImplementedError()
 class CustomSyncVectorStore(VectorStore):
    """A vectorstore that only implements the synchronous methods."""
    def __init__(self) -> None:
        self.store: Dict[str, Document] = {}
    def upsert(
        self,
        items: Sequence[Document],
        /,
        **kwargs: Any,
    ) -> UpsertResponse:
        ids = []
        for item in items:
            if item.id is None:
                new_item = item.copy()
                id_: str = str(uuid.uuid4())
                new_item.id = id_
            else:
                id_ = item.id
                new_item = item
            self.store[id_] = new_item
            ids.append(id_)
        return {
            "succeeded": ids,
            "failed": [],
        }
    def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        return [self.store[id] for id in ids if id in self.store]
    def from_texts(  # type: ignore
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> CustomSyncVectorStore:
        vectorstore = CustomSyncVectorStore()
        vectorstore.add_texts(texts, metadatas=metadatas, **kwargs)
        return vectorstore
    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        raise NotImplementedError()
 def test_implement_upsert() -> None:
    """Test that we can implement the upsert method of the CustomVectorStore
    class without violating the Liskov Substitution Principle.
    """
    store = CustomSyncVectorStore()
    # Check upsert with id
    assert store.upsert([Document(id="1", page_content="hello")]) == {
        "succeeded": ["1"],
        "failed": [],
    }
    assert store.get_by_ids(["1"]) == [Document(id="1", page_content="hello")]
    # Check upsert without id
    response = store.upsert([Document(page_content="world")])
    assert len(response["succeeded"]) == 1
    id_ = response["succeeded"][0]
    assert id_ is not None
    assert store.get_by_ids([id_]) == [Document(id=id_, page_content="world")]
    # Check that default implementation of add_texts works
    assert store.add_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
    assert store.get_by_ids(["3", "4"]) == [
        Document(id="3", page_content="hello"),
        Document(id="4", page_content="world"),
    ]
    # Add texts without ids
    ids_ = store.add_texts(["foo", "bar"])
    assert len(ids_) == 2
    assert store.get_by_ids(ids_) == [
        Document(id=ids_[0], page_content="foo"),
        Document(id=ids_[1], page_content="bar"),
    ]
    # Add texts with metadatas
    ids_2 = store.add_texts(["foo", "bar"], metadatas=[{"foo": "bar"}] * 2)
    assert len(ids_2) == 2
    assert store.get_by_ids(ids_2) == [
        Document(id=ids_2[0], page_content="foo", metadata={"foo": "bar"}),
        Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
    ]
    # Check that add_documents works
    assert store.add_documents([Document(id="5", page_content="baz")]) == ["5"]
    # Test add documents with id specified in both document and ids
    original_document = Document(id="7", page_content="baz")
    assert store.add_documents([original_document], ids=["6"]) == ["6"]
    assert original_document.id == "7"  # original document should not be modified
    assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")]
 async def test_aupsert_delegation_to_upsert() -> None:
    """Test delegation to the synchronous upsert method in async execution
    if async methods are not implemented.
    """
    store = CustomSyncVectorStore()
    # Check upsert with id
    assert await store.aupsert([Document(id="1", page_content="hello")]) == {
        "succeeded": ["1"],
        "failed": [],
    }
    assert await store.aget_by_ids(["1"]) == [Document(id="1", page_content="hello")]
    # Check upsert without id
    response = await store.aupsert([Document(page_content="world")])
    assert len(response["succeeded"]) == 1
    id_ = response["succeeded"][0]
    assert id_ is not None
    assert await store.aget_by_ids([id_]) == [Document(id=id_, page_content="world")]
    # Check that default implementation of add_texts works
    assert await store.aadd_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
    assert await store.aget_by_ids(["3", "4"]) == [
        Document(id="3", page_content="hello"),
        Document(id="4", page_content="world"),
    ]
    # Add texts without ids
    ids_ = await store.aadd_texts(["foo", "bar"])
    assert len(ids_) == 2
    assert await store.aget_by_ids(ids_) == [
        Document(id=ids_[0], page_content="foo"),
        Document(id=ids_[1], page_content="bar"),
    ]
    # Add texts with metadatas
    ids_2 = await store.aadd_texts(["foo", "bar"], metadatas=[{"foo": "bar"}] * 2)
    assert len(ids_2) == 2
    assert await store.aget_by_ids(ids_2) == [
        Document(id=ids_2[0], page_content="foo", metadata={"foo": "bar"}),
        Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
    ]
    # Check that add_documents works
    assert await store.aadd_documents([Document(id="5", page_content="baz")]) == ["5"]
    # Test add documents with id specified in both document and ids
    original_document = Document(id="7", page_content="baz")
    assert await store.aadd_documents([original_document], ids=["6"]) == ["6"]
    assert original_document.id == "7"  # original document should not be modified
    assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")]
--- a/libs/standard-tests/langchain_standard_tests/integration_tests/vectorstores.py
+++ b/libs/standard-tests/langchain_standard_tests/integration_tests/vectorstores.py
@ -46,15 +46,21 @@ class ReadWriteTestSuite(ABC):
    def test_add_documents(self, vectorstore: VectorStore) -> None:
        """Test adding documents into the vectorstore."""
-        documents = [
+        original_documents = [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
-        vectorstore.add_documents(documents)
+        ids = vectorstore.add_documents(original_documents)
        documents = vectorstore.similarity_search("bar", k=2)
        assert documents == [
-            Document(page_content="bar", metadata={"id": 2}),
+            Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
            Document(page_content="foo", metadata={"id": 1}, id=ids[0]),
        ]
        # Verify that the original document object does not get mutated!
        # (e.g., an ID is added to the original document object)
        assert original_documents == [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
    def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None:
@ -71,10 +77,11 @@ class ReadWriteTestSuite(ABC):
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
-        vectorstore.add_documents(documents, ids=["1", "2"])
+        ids = vectorstore.add_documents(documents, ids=["1", "2"])
        assert ids == ["1", "2"]
        vectorstore.delete(["1"])
        documents = vectorstore.similarity_search("foo", k=1)
-        assert documents == [Document(page_content="bar", metadata={"id": 2})]
+        assert documents == [Document(page_content="bar", metadata={"id": 2}, id="2")]
    def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None:
        """Test that we can delete several documents at once."""
@ -87,7 +94,7 @@ class ReadWriteTestSuite(ABC):
        vectorstore.add_documents(documents, ids=["1", "2", "3"])
        vectorstore.delete(["1", "2"])
        documents = vectorstore.similarity_search("foo", k=1)
-        assert documents == [Document(page_content="baz", metadata={"id": 3})]
+        assert documents == [Document(page_content="baz", metadata={"id": 3}, id="3")]
    def test_delete_missing_content(self, vectorstore: VectorStore) -> None:
        """Deleting missing content should not raise an exception."""
@ -106,25 +113,8 @@ class ReadWriteTestSuite(ABC):
        vectorstore.add_documents(documents, ids=["1", "2"])
        documents = vectorstore.similarity_search("bar", k=2)
        assert documents == [
-            Document(page_content="bar", metadata={"id": 2}),
+            Document(page_content="bar", metadata={"id": 2}, id="2"),
-            Document(page_content="foo", metadata={"id": 1}),
+            Document(page_content="foo", metadata={"id": 1}, id="1"),
        ]
    def test_add_documents_without_ids_gets_duplicated(
        self, vectorstore: VectorStore
    ) -> None:
        """Adding documents without specifying IDs should duplicate content."""
        documents = [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        vectorstore.add_documents(documents)
        vectorstore.add_documents(documents)
        documents = vectorstore.similarity_search("bar", k=2)
        assert documents == [
            Document(page_content="bar", metadata={"id": 2}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
    def test_add_documents_by_id_with_mutation(self, vectorstore: VectorStore) -> None:
@ -149,9 +139,11 @@ class ReadWriteTestSuite(ABC):
        documents = vectorstore.similarity_search("new foo", k=2)
        assert documents == [
            Document(
-                page_content="new foo", metadata={"id": 1, "some_other_field": "foo"}
+                id="1",
                page_content="new foo",
                metadata={"id": 1, "some_other_field": "foo"},
            ),
-            Document(page_content="bar", metadata={"id": 2}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
        ]
@ -190,15 +182,22 @@ class AsyncReadWriteTestSuite(ABC):
    async def test_add_documents(self, vectorstore: VectorStore) -> None:
        """Test adding documents into the vectorstore."""
-        documents = [
+        original_documents = [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
-        await vectorstore.aadd_documents(documents)
+        ids = await vectorstore.aadd_documents(original_documents)
        documents = await vectorstore.asimilarity_search("bar", k=2)
        assert documents == [
-            Document(page_content="bar", metadata={"id": 2}),
+            Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
            Document(page_content="foo", metadata={"id": 1}, id=ids[0]),
        ]
        # Verify that the original document object does not get mutated!
        # (e.g., an ID is added to the original document object)
        assert original_documents == [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
    async def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None:
@ -215,10 +214,11 @@ class AsyncReadWriteTestSuite(ABC):
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
-        await vectorstore.aadd_documents(documents, ids=["1", "2"])
+        ids = await vectorstore.aadd_documents(documents, ids=["1", "2"])
        assert ids == ["1", "2"]
        await vectorstore.adelete(["1"])
        documents = await vectorstore.asimilarity_search("foo", k=1)
-        assert documents == [Document(page_content="bar", metadata={"id": 2})]
+        assert documents == [Document(page_content="bar", metadata={"id": 2}, id="2")]
    async def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None:
        """Test that we can delete several documents at once."""
@ -231,7 +231,7 @@ class AsyncReadWriteTestSuite(ABC):
        await vectorstore.aadd_documents(documents, ids=["1", "2", "3"])
        await vectorstore.adelete(["1", "2"])
        documents = await vectorstore.asimilarity_search("foo", k=1)
-        assert documents == [Document(page_content="baz", metadata={"id": 3})]
+        assert documents == [Document(page_content="baz", metadata={"id": 3}, id="3")]
    async def test_delete_missing_content(self, vectorstore: VectorStore) -> None:
        """Deleting missing content should not raise an exception."""
@ -250,25 +250,8 @@ class AsyncReadWriteTestSuite(ABC):
        await vectorstore.aadd_documents(documents, ids=["1", "2"])
        documents = await vectorstore.asimilarity_search("bar", k=2)
        assert documents == [
-            Document(page_content="bar", metadata={"id": 2}),
+            Document(page_content="bar", metadata={"id": 2}, id="2"),
-            Document(page_content="foo", metadata={"id": 1}),
+            Document(page_content="foo", metadata={"id": 1}, id="1"),
        ]
    async def test_add_documents_without_ids_gets_duplicated(
        self, vectorstore: VectorStore
    ) -> None:
        """Adding documents without specifying IDs should duplicate content."""
        documents = [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        await vectorstore.aadd_documents(documents)
        await vectorstore.aadd_documents(documents)
        documents = await vectorstore.asimilarity_search("bar", k=2)
        assert documents == [
            Document(page_content="bar", metadata={"id": 2}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
    async def test_add_documents_by_id_with_mutation(
@ -295,7 +278,9 @@ class AsyncReadWriteTestSuite(ABC):
        documents = await vectorstore.asimilarity_search("new foo", k=2)
        assert documents == [
            Document(
-                page_content="new foo", metadata={"id": 1, "some_other_field": "foo"}
+                id="1",
                page_content="new foo",
                metadata={"id": 1, "some_other_field": "foo"},
            ),
-            Document(page_content="bar", metadata={"id": 2}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
        ]