diff --git a/libs/core/langchain_core/indexing/__init__.py b/libs/core/langchain_core/indexing/__init__.py
index 305ae7b459d..786914c00e1 100644
--- a/libs/core/langchain_core/indexing/__init__.py
+++ b/libs/core/langchain_core/indexing/__init__.py
@@ -7,6 +7,7 @@ if it's unchanged.
 
 from langchain_core.indexing.api import IndexingResult, aindex, index
 from langchain_core.indexing.base import (
+    DocumentIndex,
     InMemoryRecordManager,
     RecordManager,
     UpsertResponse,
@@ -14,6 +15,7 @@ from langchain_core.indexing.base import (
 
 __all__ = [
     "aindex",
+    "DocumentIndex",
     "index",
     "IndexingResult",
     "InMemoryRecordManager",
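With this export, DocumentIndex joins the package's public surface, and later hunks in this diff (vectorstores/base.py and the vectorstore tests) switch to the same public import path. A minimal illustration of what the new entry allows; note that DeleteResponse, added below, is not re-exported here:

from langchain_core.indexing import DocumentIndex, UpsertResponse  # public path added by this diff
from langchain_core.indexing.base import DeleteResponse  # DeleteResponse stays in the private module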
diff --git a/libs/core/langchain_core/indexing/base.py b/libs/core/langchain_core/indexing/base.py
index c7e549615e7..24683a5f8eb 100644
--- a/libs/core/langchain_core/indexing/base.py
+++ b/libs/core/langchain_core/indexing/base.py
@@ -1,8 +1,14 @@
 from __future__ import annotations
 
+import abc
 import time
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional, Sequence, TypedDict
+from typing import Any, Dict, List, Optional, Sequence, TypedDict
+
+from langchain_core._api import beta
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.runnables import run_in_executor
 
 
 class RecordManager(ABC):
@@ -447,3 +453,209 @@ class UpsertResponse(TypedDict):
     """The IDs that were successfully indexed."""
     failed: List[str]
     """The IDs that failed to index."""
+
+
+class DeleteResponse(TypedDict, total=False):
+    """A generic response for a delete operation.
+
+    The fields in this response are optional, and whether the vectorstore
+    returns them or not is up to the implementation.
+    """
+
+    num_deleted: int
+    """The number of items that were successfully deleted.
+
+    If returned, this should only include *actual* deletions.
+
+    If the ID did not exist to begin with,
+    it should not be included in this count.
+    """
+
+    succeeded: Sequence[str]
+    """The IDs that were successfully deleted.
+
+    If returned, this should only include *actual* deletions.
+
+    If the ID did not exist to begin with,
+    it should not be included in this list.
+    """
+
+    failed: Sequence[str]
+    """The IDs that failed to be deleted.
+
+    Please note that deleting an ID that
+    does not exist is **NOT** considered a failure.
+    """
+
+    num_failed: int
+    """The number of items that failed to be deleted."""
+
+
+@beta(message="Added in 0.2.29. The abstraction is subject to change.")
+class DocumentIndex(BaseRetriever):
+    """A document retriever that supports indexing operations.
+
+    This interface is a generic abstraction for storing and querying documents
+    that have an ID and metadata associated with them.
+
+    The interface is agnostic to the underlying implementation of the
+    indexing system.
+
+    The interface supports the following operations:
+
+    1. Storing documents in the index.
+    2. Fetching documents by ID.
+    3. Searching for documents using a query.
+
+    .. versionadded:: 0.2.29
+    """
+
+    @abc.abstractmethod
+    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
+        """Upsert documents into the index.
+
+        The upsert functionality should utilize the ID field of the content object
+        if it is provided. If the ID is not provided, the upsert method is free
+        to generate an ID for the content.
+
+        When an ID is specified and the content already exists in the vectorstore,
+        the upsert method should update the content with the new data. If the content
+        does not exist, the upsert method should add the item to the vectorstore.
+
+        Args:
+            items: Sequence of documents to add to the vectorstore.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            UpsertResponse: A response object that contains the list of IDs that were
+            successfully added or updated in the vectorstore and the list of IDs that
+            failed to be added or updated.
+        """
+
+    async def aupsert(
+        self, items: Sequence[Document], /, **kwargs: Any
+    ) -> UpsertResponse:
+        """Add or update documents in the vectorstore. Async version of upsert.
+
+        The upsert functionality should utilize the ID field of the item
+        if it is provided. If the ID is not provided, the upsert method is free
+        to generate an ID for the item.
+
+        When an ID is specified and the item already exists in the vectorstore,
+        the upsert method should update the item with the new data. If the item
+        does not exist, the upsert method should add the item to the vectorstore.
+
+        Args:
+            items: Sequence of documents to add to the vectorstore.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            UpsertResponse: A response object that contains the list of IDs that were
+            successfully added or updated in the vectorstore and the list of IDs that
+            failed to be added or updated.
+        """
+        return await run_in_executor(
+            None,
+            self.upsert,
+            items,
+            **kwargs,
+        )
+
+    @abc.abstractmethod
+    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> DeleteResponse:
+        """Delete by IDs or other criteria.
+
+        Calling delete without any input parameters should raise a ValueError!
+
+        Args:
+            ids: List of IDs to delete.
+            kwargs: Additional keyword arguments. This is up to the implementation.
+                For example, it can include an option to delete the entire index,
+                or to issue a non-blocking delete, etc.
+
+        Returns:
+            DeleteResponse: A response object that contains the list of IDs that were
+            successfully deleted and the list of IDs that failed to be deleted.
+        """
+
+    async def adelete(
+        self, ids: Optional[List[str]] = None, **kwargs: Any
+    ) -> DeleteResponse:
+        """Delete by IDs or other criteria. Async variant.
+
+        Calling adelete without any input parameters should raise a ValueError!
+
+        Args:
+            ids: List of IDs to delete.
+            kwargs: Additional keyword arguments. This is up to the implementation.
+                For example, it can include an option to delete the entire index.
+
+        Returns:
+            DeleteResponse: A response object that contains the list of IDs that were
+            successfully deleted and the list of IDs that failed to be deleted.
+        """
+        return await run_in_executor(
+            None,
+            self.delete,
+            ids,
+            **kwargs,
+        )
+
+    @abc.abstractmethod
+    def get(
+        self,
+        ids: Sequence[str],
+        /,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Get documents by ID.
+
+        Fewer documents may be returned than requested if some IDs are not found or
+        if there are duplicated IDs.
+
+        Users should not assume that the order of the returned documents matches
+        the order of the input IDs. Instead, users should rely on the ID field of the
+        returned documents.
+
+        This method should **NOT** raise exceptions if no documents are found for
+        some IDs.
+
+        Args:
+            ids: List of IDs to get.
+            kwargs: Additional keyword arguments. These are up to the implementation.
+
+        Returns:
+            List[Document]: List of documents that were found.
+        """
+
+    async def aget(
+        self,
+        ids: Sequence[str],
+        /,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Get documents by ID. Async variant.
+
+        Fewer documents may be returned than requested if some IDs are not found or
+        if there are duplicated IDs.
+
+        Users should not assume that the order of the returned documents matches
+        the order of the input IDs. Instead, users should rely on the ID field of the
+        returned documents.
+
+        This method should **NOT** raise exceptions if no documents are found for
+        some IDs.
+
+        Args:
+            ids: List of IDs to get.
+            kwargs: Additional keyword arguments. These are up to the implementation.
+
+        Returns:
+            List[Document]: List of documents that were found.
+        """
+        return await run_in_executor(
+            None,
+            self.get,
+            ids,
+            **kwargs,
+        )
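Only the synchronous upsert, delete, and get methods are abstract; aupsert, adelete, and aget fall back to running them in an executor via run_in_executor unless an implementation overrides them. A minimal sketch, not part of this diff, of caller-side code that exercises the contract against any concrete DocumentIndex; the roundtrip helper is hypothetical:

# Illustrative sketch (not part of this diff): caller-side code that works
# against any concrete DocumentIndex implementation.
from typing import List

from langchain_core.documents import Document
from langchain_core.indexing.base import DocumentIndex


def roundtrip(index: DocumentIndex, docs: List[Document]) -> None:
    """Hypothetical helper showing the upsert/get/delete contract."""
    upsert_response = index.upsert(docs)
    assert upsert_response["failed"] == []

    ids = list(upsert_response["succeeded"])
    # Order of the returned documents is not guaranteed; match on Document.id.
    fetched = index.get(ids)
    assert {doc.id for doc in fetched} == set(ids)

    delete_response = index.delete(ids)
    # DeleteResponse is declared with total=False, so check that a field is
    # present before reading it.
    if "num_deleted" in delete_response:
        assert delete_response["num_deleted"] == len(ids)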
diff --git a/libs/core/langchain_core/indexing/in_memory.py b/libs/core/langchain_core/indexing/in_memory.py
new file mode 100644
index 00000000000..01541030312
--- /dev/null
+++ b/libs/core/langchain_core/indexing/in_memory.py
@@ -0,0 +1,81 @@
+import uuid
+from typing import Any, Dict, List, Optional, Sequence, cast
+
+from langchain_core._api import beta
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_core.documents import Document
+from langchain_core.indexing import UpsertResponse
+from langchain_core.indexing.base import DeleteResponse, DocumentIndex
+from langchain_core.pydantic_v1 import Field
+
+
+@beta(message="Introduced in version 0.2.29. Underlying abstraction subject to change.")
+class InMemoryDocumentIndex(DocumentIndex):
+    """In-memory document index.
+
+    This is an in-memory document index that stores documents in a dictionary.
+
+    It provides a simple search API that ranks documents by the number of
+    times the given query appears in the document.
+
+    .. versionadded:: 0.2.29
+    """
+
+    store: Dict[str, Document] = Field(default_factory=dict)
+    top_k: int = 4
+
+    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
+        """Upsert items into the index."""
+        ok_ids = []
+
+        for item in items:
+            if item.id is None:
+                id_ = str(uuid.uuid4())
+                item_ = item.copy()
+                item_.id = id_
+            else:
+                item_ = item
+                id_ = item.id
+
+            self.store[id_] = item_
+            ok_ids.append(cast(str, item_.id))
+
+        return UpsertResponse(succeeded=ok_ids, failed=[])
+
+    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> DeleteResponse:
+        """Delete by ID."""
+        if ids is None:
+            raise ValueError("IDs must be provided for deletion")
+
+        ok_ids = []
+
+        for id_ in ids:
+            if id_ in self.store:
+                del self.store[id_]
+                ok_ids.append(id_)
+
+        return DeleteResponse(
+            succeeded=ok_ids, num_deleted=len(ok_ids), num_failed=0, failed=[]
+        )
+
+    def get(self, ids: Sequence[str], /, **kwargs: Any) -> List[Document]:
+        """Get by IDs."""
+        found_documents = []
+
+        for id_ in ids:
+            if id_ in self.store:
+                found_documents.append(self.store[id_])
+
+        return found_documents
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        counts_by_doc = []
+
+        for document in self.store.values():
+            count = document.page_content.count(query)
+            counts_by_doc.append((document, count))
+
+        counts_by_doc.sort(key=lambda x: x[1], reverse=True)
+        return [doc.copy() for doc, count in counts_by_doc[: self.top_k]]
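A short usage sketch, not part of this diff, of the in-memory index above; document IDs and contents are illustrative:

# Illustrative usage (not part of this diff).
from langchain_core.documents import Document
from langchain_core.indexing.in_memory import InMemoryDocumentIndex

index = InMemoryDocumentIndex(top_k=2)
index.upsert(
    [
        Document(id="a", page_content="cat cat cat"),
        Document(id="b", page_content="cat dog"),
        Document(id="c", page_content="dog only"),
    ]
)

# DocumentIndex subclasses BaseRetriever, so the index can be invoked directly;
# documents are ranked by how often the query string occurs in page_content.
results = index.invoke("cat")
assert [doc.id for doc in results] == ["a", "b"]

# Deleting by ID removes the document from subsequent gets and searches.
index.delete(["a"])
assert index.get(["a"]) == []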
diff --git a/libs/core/langchain_core/vectorstores/base.py b/libs/core/langchain_core/vectorstores/base.py
index 87806ecb3a1..88688894969 100644
--- a/libs/core/langchain_core/vectorstores/base.py
+++ b/libs/core/langchain_core/vectorstores/base.py
@@ -60,7 +60,7 @@ if TYPE_CHECKING:
         CallbackManagerForRetrieverRun,
     )
     from langchain_core.documents import Document
-    from langchain_core.indexing.base import UpsertResponse
+    from langchain_core.indexing import UpsertResponse
 
 
 logger = logging.getLogger(__name__)
diff --git a/libs/core/tests/unit_tests/indexing/test_in_memory_indexer.py b/libs/core/tests/unit_tests/indexing/test_in_memory_indexer.py
new file mode 100644
index 00000000000..9e567628138
--- /dev/null
+++ b/libs/core/tests/unit_tests/indexing/test_in_memory_indexer.py
@@ -0,0 +1,50 @@
+"""Test the in-memory indexer."""
+
+from typing import AsyncGenerator, Generator
+
+import pytest
+from langchain_standard_tests.integration_tests.indexer import (
+    AsyncDocumentIndexTestSuite,
+    DocumentIndexerTestSuite,
+)
+
+from langchain_core.documents import Document
+from langchain_core.indexing.base import DocumentIndex
+from langchain_core.indexing.in_memory import (
+    InMemoryDocumentIndex,
+)
+
+
+class TestDocumentIndexerTestSuite(DocumentIndexerTestSuite):
+    @pytest.fixture()
+    def index(self) -> Generator[DocumentIndex, None, None]:
+        yield InMemoryDocumentIndex()
+
+
+class TestAsyncDocumentIndexerTestSuite(AsyncDocumentIndexTestSuite):
+    # mypy has trouble inferring the type of the async pytest fixture
+    @pytest.fixture()
+    async def index(self) -> AsyncGenerator[DocumentIndex, None]:  # type: ignore
+        yield InMemoryDocumentIndex()
+
+
+def test_sync_retriever() -> None:
+    index = InMemoryDocumentIndex()
+    documents = [
+        Document(id="1", page_content="hello world"),
+        Document(id="2", page_content="goodbye cat"),
+    ]
+    index.upsert(documents)
+    assert index.invoke("hello") == [documents[0], documents[1]]
+    assert index.invoke("cat") == [documents[1], documents[0]]
+
+
+async def test_async_retriever() -> None:
+    index = InMemoryDocumentIndex()
+    documents = [
+        Document(id="1", page_content="hello world"),
+        Document(id="2", page_content="goodbye cat"),
+    ]
+    await index.aupsert(documents)
+    assert (await index.ainvoke("hello")) == [documents[0], documents[1]]
+    assert (await index.ainvoke("cat")) == [documents[1], documents[0]]
diff --git a/libs/core/tests/unit_tests/indexing/test_public_api.py b/libs/core/tests/unit_tests/indexing/test_public_api.py
index 0259017a954..fce3d4f4f96 100644
--- a/libs/core/tests/unit_tests/indexing/test_public_api.py
+++ b/libs/core/tests/unit_tests/indexing/test_public_api.py
@@ -4,11 +4,12 @@ from langchain_core.indexing import __all__
 def test_all() -> None:
     """Use to catch obvious breaking changes."""
     assert __all__ == sorted(__all__, key=str.lower)
-    assert __all__ == [
+    assert set(__all__) == {
         "aindex",
+        "DocumentIndex",
         "index",
         "IndexingResult",
         "InMemoryRecordManager",
         "RecordManager",
         "UpsertResponse",
-    ]
+    }
diff --git a/libs/core/tests/unit_tests/vectorstores/test_vectorstore.py b/libs/core/tests/unit_tests/vectorstores/test_vectorstore.py
index dc4955e70a2..3b5efc7a85b 100644
--- a/libs/core/tests/unit_tests/vectorstores/test_vectorstore.py
+++ b/libs/core/tests/unit_tests/vectorstores/test_vectorstore.py
@@ -7,7 +7,7 @@ from typing_extensions import TypedDict
 
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
-from langchain_core.indexing.base import UpsertResponse
+from langchain_core.indexing import UpsertResponse
 from langchain_core.vectorstores import VectorStore
diff --git a/libs/standard-tests/langchain_standard_tests/integration_tests/indexer.py b/libs/standard-tests/langchain_standard_tests/integration_tests/indexer.py
new file mode 100644
index 00000000000..f1e5d9eee0a
--- /dev/null
+++ b/libs/standard-tests/langchain_standard_tests/integration_tests/indexer.py
@@ -0,0 +1,392 @@
+"""Test suite to check index implementations."""
+
+import inspect
+import uuid
+from abc import ABC, abstractmethod
+from typing import AsyncGenerator, Generator
+
+import pytest
+from langchain_core.documents import Document
+from langchain_core.indexing.base import DocumentIndex
+
+
+class DocumentIndexerTestSuite(ABC):
+    """Test suite for checking the read-write behavior of a document index.
+
+    Implementers should subclass this test suite and provide a fixture
+    that returns an empty index for each test.
+    """
+
+    @abstractmethod
+    @pytest.fixture
+    def index(self) -> Generator[DocumentIndex, None, None]:
+        """Get the index."""
+
+    def test_upsert_documents_has_no_ids(self, index: DocumentIndex) -> None:
+        """Verify that there is no parameter called ids in upsert."""
+        signature = inspect.signature(index.upsert)
+        assert "ids" not in signature.parameters
+
+    def test_upsert_no_ids(self, index: DocumentIndex) -> None:
+        """Upsert works with documents that do not have IDs.
+
+        At the moment, the ID field in documents is optional.
+        """
+        documents = [
+            Document(page_content="foo", metadata={"id": 1}),
+            Document(page_content="bar", metadata={"id": 2}),
+        ]
+        response = index.upsert(documents)
+        ids = sorted(response["succeeded"])
+
+        # Ordering is not guaranteed, need to test carefully
+        documents = index.get(ids)
+        sorted_documents = sorted(documents, key=lambda x: x.id)  # type: ignore
+
+        if sorted_documents[0].page_content == "bar":
+            assert sorted_documents[0] == Document(
+                page_content="bar", metadata={"id": 2}, id=ids[0]
+            )
+            assert sorted_documents[1] == Document(
+                page_content="foo", metadata={"id": 1}, id=ids[1]
+            )
+        else:
+            assert sorted_documents[0] == Document(
+                page_content="foo", metadata={"id": 1}, id=ids[0]
+            )
+            assert sorted_documents[1] == Document(
+                page_content="bar", metadata={"id": 2}, id=ids[1]
+            )
+
+    def test_upsert_some_ids(self, index: DocumentIndex) -> None:
+        """Test an upsert where some docs have IDs and some don't."""
+        foo_uuid = str(uuid.UUID(int=7))
+        documents = [
+            Document(id=foo_uuid, page_content="foo", metadata={"id": 1}),
+            Document(page_content="bar", metadata={"id": 2}),
+        ]
+        response = index.upsert(documents)
+        ids = response["succeeded"]
+        other_id = list(set(ids) - {foo_uuid})[0]
+        assert response["failed"] == []
+        assert foo_uuid in ids
+        # Ordering is not guaranteed, so check both possible orders.
+        documents = index.get(ids)
+        first_doc = documents[0]
+        if first_doc.id == foo_uuid:
+            assert documents == [
+                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
+                Document(page_content="bar", metadata={"id": 2}, id=other_id),
+            ]
+        else:
+            assert documents == [
+                Document(page_content="bar", metadata={"id": 2}, id=other_id),
+                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
+            ]
+    def test_upsert_overwrites(self, index: DocumentIndex) -> None:
+        """Test that upsert overwrites existing content."""
+        foo_uuid = str(uuid.UUID(int=7))
+        documents = [
+            Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}),
+        ]
+        response = index.upsert(documents)
+        ids = response["succeeded"]
+        assert response["failed"] == []
+
+        assert index.get(ids) == [
+            Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid),
+        ]
+
+        # Now let's overwrite foo
+        index.upsert([Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})])
+        documents = index.get([foo_uuid])
+        assert documents == [
+            Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid)
+        ]
+
+    def test_delete_missing_docs(self, index: DocumentIndex) -> None:
+        """Verify that we can delete docs that aren't there."""
+        assert index.get(["1"]) == []  # Should be empty.
+
+        delete_response = index.delete(["1"])
+        if "num_deleted" in delete_response:
+            assert delete_response["num_deleted"] == 0
+
+        if "num_failed" in delete_response:
+            # Deleting a missing ID is **not** a failure!
+            assert delete_response["num_failed"] == 0
+
+        if "succeeded" in delete_response:
+            # There was nothing to delete!
+            assert delete_response["succeeded"] == []
+
+        if "failed" in delete_response:
+            # Nothing should have failed
+            assert delete_response["failed"] == []
+
+    def test_delete_semantics(self, index: DocumentIndex) -> None:
+        """Test that deletion of content has appropriate semantics."""
+        # Let's index a document first.
+        foo_uuid = str(uuid.UUID(int=7))
+        upsert_response = index.upsert(
+            [Document(id=foo_uuid, page_content="foo", metadata={})]
+        )
+        assert upsert_response == {"succeeded": [foo_uuid], "failed": []}
+
+        delete_response = index.delete(["missing_id", foo_uuid])
+
+        if "num_deleted" in delete_response:
+            assert delete_response["num_deleted"] == 1
+
+        if "num_failed" in delete_response:
+            # Deleting a missing ID is **not** a failure!
+            assert delete_response["num_failed"] == 0
+
+        if "succeeded" in delete_response:
+            # Only the existing ID should be reported as deleted.
+            assert delete_response["succeeded"] == [foo_uuid]
+
+        if "failed" in delete_response:
+            # Nothing should have failed
+            assert delete_response["failed"] == []
+    def test_bulk_delete(self, index: DocumentIndex) -> None:
+        """Test that we can delete several documents at once."""
+        documents = [
+            Document(id="1", page_content="foo", metadata={"id": 1}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
+            Document(id="3", page_content="baz", metadata={"id": 3}),
+        ]
+
+        index.upsert(documents)
+        index.delete(["1", "2"])
+        assert index.get(["1", "2", "3"]) == [
+            Document(page_content="baz", metadata={"id": 3}, id="3")
+        ]
+
+    def test_delete_no_args(self, index: DocumentIndex) -> None:
+        """Test that delete with no args raises ValueError."""
+        with pytest.raises(ValueError):
+            index.delete()
+
+    def test_delete_missing_content(self, index: DocumentIndex) -> None:
+        """Deleting missing content should not raise an exception."""
+        index.delete(["1"])
+        index.delete(["1", "2", "3"])
+
+    def test_get_with_missing_ids(self, index: DocumentIndex) -> None:
+        """Test get with missing IDs."""
+        documents = [
+            Document(id="1", page_content="foo", metadata={"id": 1}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
+        ]
+        upsert_response = index.upsert(documents)
+        assert upsert_response == {
+            "succeeded": ["1", "2"],
+            "failed": [],
+        }
+        retrieved_documents = index.get(["1", "2", "3", "4"])
+        # The ordering is not guaranteed, so sort by ID before comparing.
+        assert sorted(retrieved_documents, key=lambda x: x.id) == [  # type: ignore
+            Document(page_content="foo", metadata={"id": 1}, id="1"),
+            Document(page_content="bar", metadata={"id": 2}, id="2"),
+        ]
+
+    def test_get_missing(self, index: DocumentIndex) -> None:
+        """Test get by IDs with missing IDs."""
+        # This should not raise an exception
+        documents = index.get(["1", "2", "3"])
+        assert documents == []
+
+
+class AsyncDocumentIndexTestSuite(ABC):
+    """Test suite for checking the read-write behavior of a document index.
+
+    Implementers should subclass this test suite and provide a fixture
+    that returns an empty index for each test.
+    """
+
+    @abstractmethod
+    @pytest.fixture
+    async def index(self) -> AsyncGenerator[DocumentIndex, None]:
+        """Get the index."""
+
+    async def test_upsert_documents_has_no_ids(self, index: DocumentIndex) -> None:
+        """Verify that there is no parameter called ids in upsert."""
+        signature = inspect.signature(index.upsert)
+        assert "ids" not in signature.parameters
+
+    async def test_upsert_no_ids(self, index: DocumentIndex) -> None:
+        """Upsert works with documents that do not have IDs.
+
+        At the moment, the ID field in documents is optional.
+        """
+        documents = [
+            Document(page_content="foo", metadata={"id": 1}),
+            Document(page_content="bar", metadata={"id": 2}),
+        ]
+        response = await index.aupsert(documents)
+        ids = sorted(response["succeeded"])
+
+        # Ordering is not guaranteed, need to test carefully
+        documents = await index.aget(ids)
+        sorted_documents = sorted(documents, key=lambda x: x.id)  # type: ignore
+
+        if sorted_documents[0].page_content == "bar":
+            assert sorted_documents[0] == Document(
+                page_content="bar", metadata={"id": 2}, id=ids[0]
+            )
+            assert sorted_documents[1] == Document(
+                page_content="foo", metadata={"id": 1}, id=ids[1]
+            )
+        else:
+            assert sorted_documents[0] == Document(
+                page_content="foo", metadata={"id": 1}, id=ids[0]
+            )
+            assert sorted_documents[1] == Document(
+                page_content="bar", metadata={"id": 2}, id=ids[1]
+            )
+ """ + documents = [ + Document(page_content="foo", metadata={"id": 1}), + Document(page_content="bar", metadata={"id": 2}), + ] + response = await index.aupsert(documents) + ids = sorted(response["succeeded"]) + + # Ordering is not guaranteed, need to test carefully + documents = await index.aget(ids) + sorted_documents = sorted(documents, key=lambda x: x.id) # type: ignore + + if sorted_documents[0].page_content == "bar": + assert sorted_documents[0] == Document( + page_content="bar", metadata={"id": 2}, id=ids[0] + ) + assert sorted_documents[1] == Document( + page_content="foo", metadata={"id": 1}, id=ids[1] + ) + else: + assert sorted_documents[0] == Document( + page_content="foo", metadata={"id": 1}, id=ids[0] + ) + assert sorted_documents[1] == Document( + page_content="bar", metadata={"id": 2}, id=ids[1] + ) + + async def test_upsert_some_ids(self, index: DocumentIndex) -> None: + """Test an upsert where some docs have ids and some dont.""" + foo_uuid = str(uuid.UUID(int=7)) + documents = [ + Document(id=foo_uuid, page_content="foo", metadata={"id": 1}), + Document(page_content="bar", metadata={"id": 2}), + ] + response = await index.aupsert(documents) + ids = response["succeeded"] + other_id = list(set(ids) - {foo_uuid})[0] + assert response["failed"] == [] + assert foo_uuid in ids + # Ordering is not guaranteed, so we use a set. + documents = await index.aget(ids) + first_doc = documents[0] + if first_doc.id == foo_uuid: + assert documents == [ + Document(page_content="foo", metadata={"id": 1}, id=foo_uuid), + Document(page_content="bar", metadata={"id": 2}, id=other_id), + ] + else: + assert documents == [ + Document(page_content="bar", metadata={"id": 2}, id=other_id), + Document(page_content="foo", metadata={"id": 1}, id=foo_uuid), + ] + + async def test_upsert_overwrites(self, index: DocumentIndex) -> None: + """Test that upsert overwrites existing content.""" + foo_uuid = str(uuid.UUID(int=7)) + documents = [ + Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}), + ] + response = await index.aupsert(documents) + ids = response["succeeded"] + assert response["failed"] == [] + + assert await index.aget(ids) == [ + Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid), + ] + + # Now let's overwrite foo + await index.aupsert( + [Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})] + ) + documents = await index.aget([foo_uuid]) + assert documents == [ + Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid) + ] + + async def test_delete_missing_docs(self, index: DocumentIndex) -> None: + """Verify that we can delete docs that aren't there.""" + assert await index.aget(["1"]) == [] # Should be empty. + + delete_response = await index.adelete(["1"]) + if "num_deleted" in delete_response: + assert delete_response["num_deleted"] == 0 + + if "num_failed" in delete_response: + # Deleting a missing an ID is **not** failure!! + assert delete_response["num_failed"] == 0 + + if "succeeded" in delete_response: + # There was nothing to delete! + assert delete_response["succeeded"] == [] + + if "failed" in delete_response: + # Nothing should have failed + assert delete_response["failed"] == [] + + async def test_delete_semantics(self, index: DocumentIndex) -> None: + """Test deletion of content has appropriate semantics.""" + # Let's index a document first. 
+    async def test_bulk_delete(self, index: DocumentIndex) -> None:
+        """Test that we can delete several documents at once."""
+        documents = [
+            Document(id="1", page_content="foo", metadata={"id": 1}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
+            Document(id="3", page_content="baz", metadata={"id": 3}),
+        ]
+
+        await index.aupsert(documents)
+        await index.adelete(["1", "2"])
+        assert await index.aget(["1", "2", "3"]) == [
+            Document(page_content="baz", metadata={"id": 3}, id="3")
+        ]
+
+    async def test_delete_no_args(self, index: DocumentIndex) -> None:
+        """Test that delete with no args raises ValueError."""
+        with pytest.raises(ValueError):
+            await index.adelete()
+
+    async def test_delete_missing_content(self, index: DocumentIndex) -> None:
+        """Deleting missing content should not raise an exception."""
+        await index.adelete(["1"])
+        await index.adelete(["1", "2", "3"])
+
+    async def test_get_with_missing_ids(self, index: DocumentIndex) -> None:
+        """Test get with missing IDs."""
+        documents = [
+            Document(id="1", page_content="foo", metadata={"id": 1}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
+        ]
+        upsert_response = await index.aupsert(documents)
+        assert upsert_response == {
+            "succeeded": ["1", "2"],
+            "failed": [],
+        }
+        retrieved_documents = await index.aget(["1", "2", "3", "4"])
+        # The ordering is not guaranteed, so sort by ID before comparing.
+        assert sorted(retrieved_documents, key=lambda x: x.id) == [  # type: ignore
+            Document(page_content="foo", metadata={"id": 1}, id="1"),
+            Document(page_content="bar", metadata={"id": 2}, id="2"),
+        ]
+
+    async def test_get_missing(self, index: DocumentIndex) -> None:
+        """Test get by IDs with missing IDs."""
+        # This should not raise an exception
+        documents = await index.aget(["1", "2", "3"])
+        assert documents == []
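Downstream integrations can reuse these suites the same way the core tests above do: subclass a suite in a test module and supply the required index fixture with a fresh, empty index per test. A minimal sketch, assuming a hypothetical MyVendorDocumentIndex implementation of DocumentIndex:

# Sketch only: MyVendorDocumentIndex is a hypothetical DocumentIndex
# implementation provided by an integration package.
from typing import Generator

import pytest
from langchain_standard_tests.integration_tests.indexer import DocumentIndexerTestSuite

from langchain_core.indexing.base import DocumentIndex


class TestMyVendorDocumentIndex(DocumentIndexerTestSuite):
    @pytest.fixture()
    def index(self) -> Generator[DocumentIndex, None, None]:
        # Each test must see an empty index; clean up afterwards if the
        # backend is persistent.
        yield MyVendorDocumentIndex()  # hypothetical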