x

qxqx
2026-01-24 05:50:18 +00:00 · 2024-07-22 09:39:18 -04:00 · 2024-07-19 17:21:46 -04:00 · 2024-07-19 17:06:58 -04:00 · 2024-07-19 15:30:04 -04:00 · 2024-07-19 15:29:47 -04:00
9 changed files with 787 additions and 33 deletions
--- a/libs/core/langchain_core/indexing/__init__.py
+++ b/libs/core/langchain_core/indexing/__init__.py
@@ -7,6 +7,8 @@ if it's unchanged.

 from langchain_core.indexing.api import IndexingResult, aindex, index
 from langchain_core.indexing.base import (
+    AsyncDocumentIndexer,
+    DocumentIndexer,
    InMemoryRecordManager,
    RecordManager,
    UpsertResponse,
@@ -14,6 +16,8 @@ from langchain_core.indexing.base import (

 __all__ = [
    "aindex",
+    "AsyncDocumentIndexer",
+    "DocumentIndexer",
    "index",
    "IndexingResult",
    "InMemoryRecordManager",
--- a/libs/core/langchain_core/indexing/api.py
+++ b/libs/core/langchain_core/indexing/api.py
@@ -7,6 +7,7 @@ import json
 import uuid
 from itertools import islice
 from typing import (
+    TYPE_CHECKING,
    Any,
    AsyncIterable,
    AsyncIterator,
@@ -29,7 +30,9 @@ from langchain_core.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from langchain_core.indexing.base import RecordManager
 from langchain_core.pydantic_v1 import root_validator
-from langchain_core.vectorstores import VectorStore
+
+if TYPE_CHECKING:
+    from langchain_core.vectorstores import VectorStore

 # Magic UUID to use as a namespace for hashing.
 # Used to try and generate a unique UUID for each document
@@ -265,6 +268,9 @@ def index(
            "delete" and "add_documents" required methods.
        ValueError: If source_id_key is not None, but is not a string or callable.
    """
+    # Local scope to avoid circular imports
+    from langchain_core.vectorstores import VectorStore
+
    if cleanup not in {"incremental", "full", None}:
        raise ValueError(
            f"cleanup should be one of 'incremental', 'full' or None. "
@@ -478,6 +484,8 @@ async def aindex(
            "adelete" and "aadd_documents" required methods.
        ValueError: If source_id_key is not None, but is not a string or callable.
    """
+    # Local scope to avoid circular imports
+    from langchain_core.vectorstores import VectorStore

    if cleanup not in {"incremental", "full", None}:
        raise ValueError(
--- a/libs/core/langchain_core/indexing/base.py
+++ b/libs/core/langchain_core/indexing/base.py
@@ -1,8 +1,263 @@
 from __future__ import annotations

+import abc
 import time
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional, Sequence, TypedDict
+from typing import (
+    Any,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+)
+
+from typing_extensions import TypedDict
+
+from langchain_core._api import beta
+from langchain_core.documents.base import Document
+
+
+class UpsertResponse(TypedDict):
+    """A generic response for upsert operations.
+
+    The upsert response will be used by abstractions that implement an upsert
+    operation for content that can be upserted by ID.
+
+    Upsert APIs that accept inputs with IDs and generate IDs internally
+    will return a response that includes the IDs that succeeded and the IDs
+    that failed.
+
+    If there are no failures, the failed list will be empty, and the order
+    of the IDs in the succeeded list will match the order of the input documents.
+
+    If there are failures, the response becomes ill defined, and a user of the API
+    cannot determine which generated ID corresponds to which input document.
+
+    It is recommended that users explicitly attach IDs to the items being
+    indexed to avoid this issue.
+    """
+
+    succeeded: List[str]
+    """The IDs that were successfully indexed."""
+    failed: List[str]
+    """The IDs that failed to index."""
+
+
+class DeleteResponse(TypedDict, total=False):
+    """A generic response for delete operations.
+
+    The fields in this response are optional and whether the vectorstore
+    returns them or not is up to the implementation.
+    """
+
+    num_deleted: int
+    """The number of items that were successfully deleted.
+    
+    If returned, this should only include *actual* deletions.
+    
+    If the ID did not exist to begin with, 
+    it should not be included in this count.
+    """
+
+    succeeded: Sequence[str]
+    """The IDs that were successfully deleted.
+    
+    If returned, this should only include *actual* deletions.
+    
+    If the ID did not exist to begin with,
+    it should not be included in this list.
+    """
+
+    failed: Sequence[str]
+    """The IDs that failed to be deleted.
+    
+    Please note that deleting an ID that 
+    does not exist is **NOT** considered a failure.
+    """
+
+    num_failed: int
+    """The number of items that failed to be deleted."""
+
+
+@beta(message="Added in ___version___. The API is subject to change.")
+class DocumentIndexer(abc.ABC):
+    """An abstraction for indexing documents.
+
+    This indexing interface is designed to be a generic abstraction for storing and
+    querying documents that have an ID and metadata associated with them.
+
+    The interface is designed to be agnostic to the underlying implementation of the
+    indexing system.
+
+    The interface is designed to support the following operations:
+
+    1. Storing content in the index.
+    2. Retrieving content by ID.
+
+    .. versionadded:: ___version___
+    """
+
+    @abc.abstractmethod
+    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
+        """Upsert documents into the index.
+
+        The upsert functionality should utilize the ID field of the content object
+        if it is provided. If the ID is not provided, the upsert method is free
+        to generate an ID for the content.
+
+        When an ID is specified and the content already exists in the vectorstore,
+        the upsert method should update the content with the new data. If the content
+        does not exist, the upsert method should add the item to the vectorstore.
+
+        Args:
+            items: Sequence of documents to add to the vectorstore.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            UpsertResponse: A response object that contains the list of IDs that were
+            successfully added or updated in the vectorstore and the list of IDs that
+            failed to be added or updated.
+
+        .. versionadded:: ___version___
+        """
+
+    @abc.abstractmethod
+    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> DeleteResponse:
+        """Delete by IDs or other criteria.
+
+        Calling delete without any input parameters should raise a ValueError!
+
+        Args:
+            ids: List of ids to delete.
+            kwargs: Additional keyword arguments. This is up to the implementation.
+                For example, can include an option to delete the entire index,
+                or else issue a non blocking delete etc.
+
+        Returns:
+            DeleteResponse: A response object that contains the list of IDs that were
+            successfully deleted and the list of IDs that failed to be deleted.
+        """
+
+    @abc.abstractmethod
+    def get(
+        self,
+        ids: Sequence[str],
+        /,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Get documents by id.
+
+        Fewer documents may be returned than requested if some IDs are not found or
+        if there are duplicated IDs.
+
+        Users should not assume that the order of the returned documents matches
+        the order of the input IDs. Instead, users should rely on the ID field of the
+        returned documents.
+
+        This method should **NOT** raise exceptions if no documents are found for
+        some IDs.
+
+        Args:
+            ids: List of IDs to get.
+            kwargs: Additional keyword arguments. These are up to the implementation.
+
+        Returns:
+            List[Document]: List of documents that were found.
+
+        .. versionadded:: ___version___
+        """
+
+
+@beta(message="Added in ___version___. The API is subject to change.")
+class AsyncDocumentIndexer(abc.ABC):
+    """An abstraction for indexing documents. Async Variant.
+
+    This indexing interface is designed to be a generic abstraction for storing and
+    querying documents that have an ID and metadata associated with them.
+
+    The interface is designed to be agnostic to the underlying implementation of the
+    indexing system.
+
+    The interface is designed to support the following operations:
+
+    1. Storing content in the index.
+    2. Retrieving content by ID.
+
+    .. versionadded:: ___version___
+    """
+
+    @abc.abstractmethod
+    async def upsert(
+        self, items: Sequence[Document], /, **kwargs: Any
+    ) -> UpsertResponse:
+        """Add or update documents in the vectorstore. Async version of upsert.
+
+        The upsert functionality should utilize the ID field of the item
+        if it is provided. If the ID is not provided, the upsert method is free
+        to generate an ID for the item.
+
+        When an ID is specified and the item already exists in the vectorstore,
+        the upsert method should update the item with the new data. If the item
+        does not exist, the upsert method should add the item to the vectorstore.
+
+        Args:
+            items: Sequence of documents to add to the vectorstore.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            UpsertResponse: A response object that contains the list of IDs that were
+            successfully added or updated in the vectorstore and the list of IDs that
+            failed to be added or updated.
+
+        .. versionadded:: ___version___
+        """
+
+    @abc.abstractmethod
+    async def delete(
+        self, ids: Optional[List[str]] = None, **kwargs: Any
+    ) -> DeleteResponse:
+        """Delete by IDs or other criteria. Async variant.
+
+        Calling delete without any input parameters should raise a ValueError!
+
+        Args:
+            ids: List of ids to delete.
+            kwargs: Additional keyword arguments. This is up to the implementation.
+                For example, can include an option to delete the entire index.
+
+        Returns:
+            DeleteResponse: A response object that contains the list of IDs that were
+            successfully deleted and the list of IDs that failed to be deleted.
+        """
+
+    @abc.abstractmethod
+    async def get(
+        self,
+        ids: Sequence[str],
+        /,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Get documents by id.
+
+        Fewer documents may be returned than requested if some IDs are not found or
+        if there are duplicated IDs.
+
+        Users should not assume that the order of the returned documents matches
+        the order of the input IDs. Instead, users should rely on the ID field of the
+        returned documents.
+
+        This method should **NOT** raise exceptions if no documents are found for
+        some IDs.
+
+        Args:
+            ids: List of IDs to get.
+            kwargs: Additional keyword arguments. These are up to the implementation.
+
+        Returns:
+            List[Document]: List of documents that were found.
+
+        .. versionadded:: ___version___
+        """


 class RecordManager(ABC):
@@ -421,29 +676,3 @@ class InMemoryRecordManager(RecordManager):
            keys: A list of keys to delete.
        """
        self.delete_keys(keys)
-
-
-class UpsertResponse(TypedDict):
-    """A generic response for upsert operations.
-
-    The upsert response will be used by abstractions that implement an upsert
-    operation for content that can be upserted by ID.
-
-    Upsert APIs that accept inputs with IDs and generate IDs internally
-    will return a response that includes the IDs that succeeded and the IDs
-    that failed.
-
-    If there are no failures, the failed list will be empty, and the order
-    of the IDs in the succeeded list will match the order of the input documents.
-
-    If there are failures, the response becomes ill defined, and a user of the API
-    cannot determine which generated ID corresponds to which input document.
-
-    It is recommended for users explicitly attach the IDs to the items being
-    indexed to avoid this issue.
-    """
-
-    succeeded: List[str]
-    """The IDs that were successfully indexed."""
-    failed: List[str]
-    """The IDs that failed to index."""
--- a/libs/core/langchain_core/indexing/in_memory.py
+++ b/libs/core/langchain_core/indexing/in_memory.py
@@ -0,0 +1,86 @@
+import uuid
+from typing import Any, Dict, List, Optional, Sequence, cast
+
+from langchain_core.documents import Document
+from langchain_core.indexing import UpsertResponse
+from langchain_core.indexing.base import (
+    AsyncDocumentIndexer,
+    DeleteResponse,
+    DocumentIndexer,
+)
+
+
+class InMemoryDocumentIndexer(DocumentIndexer):
+    """In memory sync indexer."""
+
+    def __init__(self, *, store: Optional[Dict[str, Document]] = None) -> None:
+        """An in memory implementation of a document indexer."""
+        self.store = store if store is not None else {}
+
+    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
+        """Upsert items into the indexer."""
+        ok_ids = []
+
+        for item in items:
+            if item.id is None:
+                id_ = str(uuid.uuid4())
+                item_ = item.copy()
+                item_.id = id_
+            else:
+                item_ = item
+                id_ = item.id
+
+            self.store[id_] = item_
+            ok_ids.append(cast(str, item_.id))
+
+        return UpsertResponse(succeeded=ok_ids, failed=[])
+
+    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> DeleteResponse:
+        """Delete by ID."""
+        if ids is None:
+            raise ValueError("IDs must be provided for deletion")
+
+        ok_ids = []
+
+        for id_ in ids:
+            if id_ in self.store:
+                del self.store[id_]
+                ok_ids.append(id_)
+
+        return DeleteResponse(
+            succeeded=ok_ids, num_deleted=len(ok_ids), num_failed=0, failed=[]
+        )
+
+    def get(self, ids: Sequence[str], /, **kwargs: Any) -> List[Document]:
+        """Get by ids."""
+        found_documents = []
+
+        for id_ in ids:
+            if id_ in self.store:
+                found_documents.append(self.store[id_])
+
+        return found_documents
+
+
+class AsyncInMemoryDocumentIndexer(AsyncDocumentIndexer):
+    """An in memory async indexer implementation."""
+
+    def __init__(self, *, store: Optional[Dict[str, Document]] = None) -> None:
+        """An in memory implementation of a document indexer."""
+        self.indexer = InMemoryDocumentIndexer(store=store)
+
+    async def upsert(
+        self, items: Sequence[Document], /, **kwargs: Any
+    ) -> UpsertResponse:
+        """Upsert items into the indexer."""
+        return self.indexer.upsert(items, **kwargs)
+
+    async def delete(
+        self, ids: Optional[List[str]] = None, **kwargs: Any
+    ) -> DeleteResponse:
+        """Delete by ID."""
+        return self.indexer.delete(ids, **kwargs)
+
+    async def get(self, ids: Sequence[str], /, **kwargs: Any) -> List[Document]:
+        """Get by ids."""
+        return self.indexer.get(ids, **kwargs)
--- a/libs/core/tests/unit_tests/indexing/test_in_memory_indexer.py
+++ b/libs/core/tests/unit_tests/indexing/test_in_memory_indexer.py
@@ -0,0 +1,28 @@
+"""Test in memory indexer"""
+
+from typing import AsyncGenerator, Generator
+
+import pytest
+from langchain_standard_tests.integration_tests.indexer import (
+    AsyncDocumentIndexerTestSuite,
+    DocumentIndexerTestSuite,
+)
+
+from langchain_core.indexing import AsyncDocumentIndexer, DocumentIndexer
+from langchain_core.indexing.in_memory import (
+    AsyncInMemoryDocumentIndexer,
+    InMemoryDocumentIndexer,
+)
+
+
+class TestDocumentIndexerTestSuite(DocumentIndexerTestSuite):
+    @pytest.fixture()
+    def indexer(self) -> Generator[DocumentIndexer, None, None]:
+        yield InMemoryDocumentIndexer()
+
+
+class TestAsyncDocumentIndexerTestSuite(AsyncDocumentIndexerTestSuite):
+    # Something funky is going on with mypy and async pytest fixture
+    @pytest.fixture()
+    async def indexer(self) -> AsyncGenerator[AsyncDocumentIndexer, None]:  # type: ignore
+        yield AsyncInMemoryDocumentIndexer()
--- a/libs/core/tests/unit_tests/indexing/test_public_api.py
+++ b/libs/core/tests/unit_tests/indexing/test_public_api.py
@@ -4,11 +4,13 @@ from langchain_core.indexing import __all__
 def test_all() -> None:
    """Use to catch obvious breaking changes."""
    assert __all__ == sorted(__all__, key=str.lower)
-    assert __all__ == [
+    assert set(__all__) == {
        "aindex",
+        "AsyncDocumentIndexer",
+        "DocumentIndexer",
        "index",
        "IndexingResult",
        "InMemoryRecordManager",
        "RecordManager",
        "UpsertResponse",
-    ]
+    }
--- a/libs/standard-tests/Makefile
+++ b/libs/standard-tests/Makefile
@@ -30,14 +30,14 @@ lint_tests: PYTHON_FILES=tests
 lint_tests: MYPY_CACHE=.mypy_cache_test

 lint lint_diff lint_package lint_tests:
-	poetry run ruff .
+	poetry run ruff check .
 	poetry run ruff format $(PYTHON_FILES) --diff
-	poetry run ruff --select I $(PYTHON_FILES)
+	poetry run ruff check --select I $(PYTHON_FILES)
 	mkdir $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)

 format format_diff:
 	poetry run ruff format $(PYTHON_FILES)
-	poetry run ruff --select I --fix $(PYTHON_FILES)
+	poetry run ruff check --select I --fix $(PYTHON_FILES)

 spell_check:
 	poetry run codespell --toml pyproject.toml
--- a/libs/standard-tests/langchain_standard_tests/integration_tests/__init__.py
+++ b/libs/standard-tests/langchain_standard_tests/integration_tests/__init__.py
@@ -8,6 +8,7 @@ modules = [
    "base_store",
    "cache",
    "chat_models",
+    "indexer",
    "vectorstores",
 ]

--- a/libs/standard-tests/langchain_standard_tests/integration_tests/indexer.py
+++ b/libs/standard-tests/langchain_standard_tests/integration_tests/indexer.py
@@ -0,0 +1,396 @@
+"""Test suite to check indexer implementations."""
+
+import inspect
+import uuid
+from abc import ABC, abstractmethod
+from typing import AsyncGenerator, Generator
+
+import pytest
+from langchain_core.documents import Document
+from langchain_core.indexing import AsyncDocumentIndexer, DocumentIndexer
+
+
+class DocumentIndexerTestSuite(ABC):
+    """Test suite for checking the read-write of a document indexer.
+
+    Implementers should subclass this test suite and provide a fixture
+    that returns an empty indexer for each test.
+    """
+
+    @abstractmethod
+    @pytest.fixture
+    def indexer(self) -> Generator[DocumentIndexer, None, None]:
+        """Get the indexer."""
+
+    def test_upsert_documents_has_no_ids(self, indexer: DocumentIndexer) -> None:
+        """Verify that there is no parameter called ids in upsert."""
+        signature = inspect.signature(indexer.upsert)
+        assert "ids" not in signature.parameters
+
+    def test_upsert_no_ids(self, indexer: DocumentIndexer) -> None:
+        """Upsert works with documents that do not have IDs.
+
+        At the moment, the ID field in documents is optional.
+        """
+        documents = [
+            Document(page_content="foo", metadata={"id": 1}),
+            Document(page_content="bar", metadata={"id": 2}),
+        ]
+        response = indexer.upsert(documents)
+        ids = sorted(response["succeeded"])
+
+        # Ordering is not guaranteed, need to test carefully
+        documents = indexer.get(ids)
+        sorted_documents = sorted(documents, key=lambda x: x.id)  # type: ignore
+
+        if sorted_documents[0].page_content == "bar":
+            assert sorted_documents[0] == Document(
+                page_content="bar", metadata={"id": 2}, id=ids[0]
+            )
+            assert sorted_documents[1] == Document(
+                page_content="foo", metadata={"id": 1}, id=ids[1]
+            )
+        else:
+            assert sorted_documents[0] == Document(
+                page_content="foo", metadata={"id": 1}, id=ids[0]
+            )
+            assert sorted_documents[1] == Document(
+                page_content="bar", metadata={"id": 2}, id=ids[1]
+            )
+
+    def test_upsert_some_ids(self, indexer: DocumentIndexer) -> None:
+        """Test an upsert where some docs have ids and some don't."""
+        foo_uuid = str(uuid.UUID(int=7))
+        documents = [
+            Document(id=foo_uuid, page_content="foo", metadata={"id": 1}),
+            Document(page_content="bar", metadata={"id": 2}),
+        ]
+        response = indexer.upsert(documents)
+        ids = response["succeeded"]
+        other_id = list(set(ids) - {foo_uuid})[0]
+        assert response["failed"] == []
+        assert foo_uuid in ids
+        # Ordering is not guaranteed, so accept either order of the two documents.
+        documents = indexer.get(ids)
+        first_doc = documents[0]
+        if first_doc.id == foo_uuid:
+            assert documents == [
+                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
+                Document(page_content="bar", metadata={"id": 2}, id=other_id),
+            ]
+        else:
+            assert documents == [
+                Document(page_content="bar", metadata={"id": 2}, id=other_id),
+                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
+            ]
+
+    def test_upsert_overwrites(self, indexer: DocumentIndexer) -> None:
+        """Test that upsert overwrites existing content."""
+        foo_uuid = str(uuid.UUID(int=7))
+        documents = [
+            Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}),
+        ]
+        response = indexer.upsert(documents)
+        ids = response["succeeded"]
+        assert response["failed"] == []
+
+        assert indexer.get(ids) == [
+            Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid),
+        ]
+
+        # Now let's overwrite foo
+        indexer.upsert(
+            [Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})]
+        )
+        documents = indexer.get([foo_uuid])
+        assert documents == [
+            Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid)
+        ]
+
+    def test_delete_missing_docs(self, indexer: DocumentIndexer) -> None:
+        """Verify that we can delete docs that aren't there."""
+        assert indexer.get(["1"]) == []  # Should be empty.
+
+        delete_response = indexer.delete(["1"])
+        if "num_deleted" in delete_response:
+            assert delete_response["num_deleted"] == 0
+
+        if "num_failed" in delete_response:
+            # Deleting a missing ID is **not** a failure!!
+            assert delete_response["num_failed"] == 0
+
+        if "succeeded" in delete_response:
+            # There was nothing to delete!
+            assert delete_response["succeeded"] == []
+
+        if "failed" in delete_response:
+            # Nothing should have failed
+            assert delete_response["failed"] == []
+
+    def test_delete_semantics(self, indexer: DocumentIndexer) -> None:
+        """Test deletion of content has appropriate semantics."""
+        # Let's index a document first.
+        foo_uuid = str(uuid.UUID(int=7))
+        upsert_response = indexer.upsert(
+            [Document(id=foo_uuid, page_content="foo", metadata={})]
+        )
+        assert upsert_response == {"succeeded": [foo_uuid], "failed": []}
+
+        delete_response = indexer.delete(["missing_id", foo_uuid])
+
+        if "num_deleted" in delete_response:
+            assert delete_response["num_deleted"] == 1
+
+        if "num_failed" in delete_response:
+            # Deleting a missing ID is **not** a failure!!
+            assert delete_response["num_failed"] == 0
+
+        if "succeeded" in delete_response:
+            # Only the document that actually existed was deleted.
+            assert delete_response["succeeded"] == [foo_uuid]
+
+        if "failed" in delete_response:
+            # Nothing should have failed
+            assert delete_response["failed"] == []
+
+    def test_bulk_delete(self, indexer: DocumentIndexer) -> None:
+        """Test that we can delete several documents at once."""
+        documents = [
+            Document(id="1", page_content="foo", metadata={"id": 1}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
+            Document(id="3", page_content="baz", metadata={"id": 3}),
+        ]
+
+        indexer.upsert(documents)
+        indexer.delete(["1", "2"])
+        assert indexer.get(["1", "2", "3"]) == [
+            Document(page_content="baz", metadata={"id": 3}, id="3")
+        ]
+
+    def test_delete_no_args(self, indexer: DocumentIndexer) -> None:
+        """Test delete with no args raises ValueError."""
+        with pytest.raises(ValueError):
+            indexer.delete()
+
+    def test_delete_missing_content(self, indexer: DocumentIndexer) -> None:
+        """Deleting missing content should not raise an exception."""
+        indexer.delete(["1"])
+        indexer.delete(["1", "2", "3"])
+
+    def test_get_with_missing_ids(self, indexer: DocumentIndexer) -> None:
+        """Test get with missing IDs."""
+        documents = [
+            Document(id="1", page_content="foo", metadata={"id": 1}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
+        ]
+        upsert_response = indexer.upsert(documents)
+        assert upsert_response == {
+            "succeeded": ["1", "2"],
+            "failed": [],
+        }
+        retrieved_documents = indexer.get(["1", "2", "3", "4"])
+        # The ordering is not guaranteed, so we sort by ID before comparing.
+        assert sorted(retrieved_documents, key=lambda x: x.id) == [  # type: ignore
+            Document(page_content="foo", metadata={"id": 1}, id="1"),
+            Document(page_content="bar", metadata={"id": 2}, id="2"),
+        ]
+
+    def test_get_missing(self, indexer: DocumentIndexer) -> None:
+        """Test get by IDs with missing IDs."""
+        # This should not raise an exception
+        documents = indexer.get(["1", "2", "3"])
+        assert documents == []
+
+
+class AsyncDocumentIndexerTestSuite(ABC):
+    """Test suite for checking the read-write of a document indexer.
+
+    Implementers should subclass this test suite and provide a fixture
+    that returns an empty indexer for each test.
+    """
+
+    @abstractmethod
+    @pytest.fixture
+    async def indexer(self) -> AsyncGenerator[AsyncDocumentIndexer, None]:
+        """Get the indexer."""
+
+    async def test_upsert_documents_has_no_ids(
+        self, indexer: AsyncDocumentIndexer
+    ) -> None:
+        """Verify that there is no parameter called ids in upsert."""
+        signature = inspect.signature(indexer.upsert)
+        assert "ids" not in signature.parameters
+
+    async def test_upsert_no_ids(self, indexer: AsyncDocumentIndexer) -> None:
+        """Upsert works with documents that do not have IDs.
+
+        At the moment, the ID field in documents is optional.
+        """
+        documents = [
+            Document(page_content="foo", metadata={"id": 1}),
+            Document(page_content="bar", metadata={"id": 2}),
+        ]
+        response = await indexer.upsert(documents)
+        ids = sorted(response["succeeded"])
+
+        # Ordering is not guaranteed, need to test carefully
+        documents = await indexer.get(ids)
+        sorted_documents = sorted(documents, key=lambda x: x.id)  # type: ignore
+
+        if sorted_documents[0].page_content == "bar":
+            assert sorted_documents[0] == Document(
+                page_content="bar", metadata={"id": 2}, id=ids[0]
+            )
+            assert sorted_documents[1] == Document(
+                page_content="foo", metadata={"id": 1}, id=ids[1]
+            )
+        else:
+            assert sorted_documents[0] == Document(
+                page_content="foo", metadata={"id": 1}, id=ids[0]
+            )
+            assert sorted_documents[1] == Document(
+                page_content="bar", metadata={"id": 2}, id=ids[1]
+            )
+
+    async def test_upsert_some_ids(self, indexer: AsyncDocumentIndexer) -> None:
+        """Test an upsert where some docs have ids and some don't."""
+        foo_uuid = str(uuid.UUID(int=7))
+        documents = [
+            Document(id=foo_uuid, page_content="foo", metadata={"id": 1}),
+            Document(page_content="bar", metadata={"id": 2}),
+        ]
+        response = await indexer.upsert(documents)
+        ids = response["succeeded"]
+        other_id = list(set(ids) - {foo_uuid})[0]
+        assert response["failed"] == []
+        assert foo_uuid in ids
+        # Ordering is not guaranteed, so accept either order of the two documents.
+        documents = await indexer.get(ids)
+        first_doc = documents[0]
+        if first_doc.id == foo_uuid:
+            assert documents == [
+                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
+                Document(page_content="bar", metadata={"id": 2}, id=other_id),
+            ]
+        else:
+            assert documents == [
+                Document(page_content="bar", metadata={"id": 2}, id=other_id),
+                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
+            ]
+
+    async def test_upsert_overwrites(self, indexer: AsyncDocumentIndexer) -> None:
+        """Test that upsert overwrites existing content."""
+        foo_uuid = str(uuid.UUID(int=7))
+        documents = [
+            Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}),
+        ]
+        response = await indexer.upsert(documents)
+        ids = response["succeeded"]
+        assert response["failed"] == []
+
+        assert await indexer.get(ids) == [
+            Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid),
+        ]
+
+        # Now let's overwrite foo
+        await indexer.upsert(
+            [Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})]
+        )
+        documents = await indexer.get([foo_uuid])
+        assert documents == [
+            Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid)
+        ]
+
+    async def test_delete_missing_docs(self, indexer: AsyncDocumentIndexer) -> None:
+        """Verify that we can delete docs that aren't there."""
+        assert await indexer.get(["1"]) == []  # Should be empty.
+
+        delete_response = await indexer.delete(["1"])
+        if "num_deleted" in delete_response:
+            assert delete_response["num_deleted"] == 0
+
+        if "num_failed" in delete_response:
+            # Deleting a missing ID is **not** a failure!!
+            assert delete_response["num_failed"] == 0
+
+        if "succeeded" in delete_response:
+            # There was nothing to delete!
+            assert delete_response["succeeded"] == []
+
+        if "failed" in delete_response:
+            # Nothing should have failed
+            assert delete_response["failed"] == []
+
+    async def test_delete_semantics(self, indexer: AsyncDocumentIndexer) -> None:
+        """Test deletion of content has appropriate semantics."""
+        # Let's index a document first.
+        foo_uuid = str(uuid.UUID(int=7))
+        upsert_response = await indexer.upsert(
+            [Document(id=foo_uuid, page_content="foo", metadata={})]
+        )
+        assert upsert_response == {"succeeded": [foo_uuid], "failed": []}
+
+        delete_response = await indexer.delete(["missing_id", foo_uuid])
+
+        if "num_deleted" in delete_response:
+            assert delete_response["num_deleted"] == 1
+
+        if "num_failed" in delete_response:
+            # Deleting a missing ID is **not** a failure!!
+            assert delete_response["num_failed"] == 0
+
+        if "succeeded" in delete_response:
+            # Only the document that actually existed was deleted.
+            assert delete_response["succeeded"] == [foo_uuid]
+
+        if "failed" in delete_response:
+            # Nothing should have failed
+            assert delete_response["failed"] == []
+
+    async def test_bulk_delete(self, indexer: AsyncDocumentIndexer) -> None:
+        """Test that we can delete several documents at once."""
+        documents = [
+            Document(id="1", page_content="foo", metadata={"id": 1}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
+            Document(id="3", page_content="baz", metadata={"id": 3}),
+        ]
+
+        await indexer.upsert(documents)
+        await indexer.delete(["1", "2"])
+        assert await indexer.get(["1", "2", "3"]) == [
+            Document(page_content="baz", metadata={"id": 3}, id="3")
+        ]
+
+    async def test_delete_no_args(self, indexer: AsyncDocumentIndexer) -> None:
+        """Test delete with no args raises ValueError."""
+        with pytest.raises(ValueError):
+            await indexer.delete()
+
+    async def test_delete_missing_content(self, indexer: AsyncDocumentIndexer) -> None:
+        """Deleting missing content should not raise an exception."""
+        await indexer.delete(["1"])
+        await indexer.delete(["1", "2", "3"])
+
+    async def test_get_with_missing_ids(self, indexer: AsyncDocumentIndexer) -> None:
+        """Test get with missing IDs."""
+        documents = [
+            Document(id="1", page_content="foo", metadata={"id": 1}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
+        ]
+        upsert_response = await indexer.upsert(documents)
+        assert upsert_response == {
+            "succeeded": ["1", "2"],
+            "failed": [],
+        }
+        retrieved_documents = await indexer.get(["1", "2", "3", "4"])
+        # The ordering is not guaranteed, so we sort by ID before comparing.
+        assert sorted(retrieved_documents, key=lambda x: x.id) == [  # type: ignore
+            Document(page_content="foo", metadata={"id": 1}, id="1"),
+            Document(page_content="bar", metadata={"id": 2}, id="2"),
+        ]
+
+    async def test_get_missing(self, indexer: AsyncDocumentIndexer) -> None:
+        """Test get by IDs with missing IDs."""
+        # This should not raise an exception
+        documents = await indexer.get(["1", "2", "3"])
+        assert documents == []
Author	SHA1	Message	Date
Eugene Yurtsev	5560965a61	x	2024-07-22 09:39:18 -04:00
Eugene Yurtsev	6a41664efd	x	2024-07-19 17:21:46 -04:00
Eugene Yurtsev	925109105b	qxqx	2024-07-19 17:06:58 -04:00
Eugene Yurtsev	6dcb7050d9	qxqx	2024-07-19 15:30:04 -04:00
Eugene Yurtsev	2208123432	qxqx	2024-07-19 15:29:47 -04:00
Eugene Yurtsev	73d7bd9282	x	2024-07-19 14:59:53 -04:00
Eugene Yurtsev	2962d91beb	Merge branch 'master' into eugene/document_indexer_v2	2024-07-19 14:31:01 -04:00
Eugene Yurtsev	033adfb8b3	x	2024-07-18 17:07:11 -04:00
Eugene Yurtsev	2c5b1704ff	qxqx	2024-07-18 15:04:18 -04:00
Eugene Yurtsev	ad8a4ee074	update	2024-07-18 14:08:34 -04:00
Eugene Yurtsev	9b7126f962	xt	2024-07-18 12:09:43 -04:00
Eugene Yurtsev	81db15ffa0	x	2024-07-18 11:01:36 -04:00
Eugene Yurtsev	fed2495a31	x	2024-07-17 17:24:43 -04:00
Eugene Yurtsev	f85287c5e7	x	2024-07-17 17:19:01 -04:00
Eugene Yurtsev	10708e856e	update	2024-07-17 17:18:09 -04:00
Eugene Yurtsev	1a41d27b1d	qxqx	2024-07-17 17:05:29 -04:00
Eugene Yurtsev	ab72ad9e36	update	2024-07-17 17:05:09 -04:00
Eugene Yurtsev	428b2409c7	x	2024-07-17 15:30:10 -04:00