diff --git a/libs/core/langchain_core/indexing/base.py b/libs/core/langchain_core/indexing/base.py index 0491908967c..6d616f10f0e 100644 --- a/libs/core/langchain_core/indexing/base.py +++ b/libs/core/langchain_core/indexing/base.py @@ -1,7 +1,6 @@ from __future__ import annotations import abc -import time from abc import ABC, abstractmethod from typing import ( Any, @@ -11,6 +10,7 @@ from typing import ( Sequence, ) +import time from typing_extensions import TypedDict from langchain_core._api import beta @@ -52,22 +52,33 @@ class DeleteResponse(TypedDict, total=False): """ num_deleted: int - """The number of items that were successfully deleted.""" - num_failed: int - """The number of items that failed to be deleted.""" + """The number of items that were successfully deleted. + + If returned, this should only include *actual* deletions. + + If the ID did not exist to begin with, + it should not be included in this count. + """ + succeeded: Sequence[str] """The IDs that were successfully deleted. - Should not be returned when using delete_by_filter. + If returned, this should only include *actual* deletions. + + If the ID did not exist to begin with, + it should not be included in this list. """ + failed: Sequence[str] """The IDs that failed to be deleted. - Should not be returned when using delete_by_filter. - - Please note that deleting an ID that does not exist is **NOT** considered a failure. + Please note that deleting an ID that + does not exist is **NOT** considered a failure. """ + num_failed: int + """The number of items that failed to be deleted.""" + @beta(message="Added in ___version___. 
class InMemoryIndexer(DocumentIndexer):
    """Document indexer backed by a plain dictionary held in memory.

    Documents live in ``self.store``, keyed by their ID.
    """

    def __init__(self, *, store: Optional[Dict[str, Document]] = None) -> None:
        """Create the indexer.

        Args:
            store: Optional mapping of ID to document to use as the backing
                storage. A new empty dict is created when omitted.
        """
        self.store = {} if store is None else store

    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
        """Insert or overwrite the given documents.

        A document without an ID is copied and the copy is assigned a freshly
        generated UUID4 string; a document that already has an ID is stored
        as-is (the same object), replacing any entry under that ID.

        Args:
            items: The documents to store.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            An UpsertResponse listing every stored ID under ``succeeded``
            and an empty ``failed`` list.
        """
        stored_ids = []

        for document in items:
            if document.id is not None:
                to_store = document
            else:
                # Copy so that the caller's document is not mutated when
                # an ID has to be generated for it.
                to_store = document.copy()
                to_store.id = str(uuid.uuid4())

            self.store[to_store.id] = to_store
            stored_ids.append(to_store.id)

        return UpsertResponse(succeeded=stored_ids, failed=[])

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> DeleteResponse:
        """Remove documents by ID.

        IDs that are not present in the store are skipped silently; they are
        reported neither as deleted nor as failed.

        Args:
            ids: The IDs to delete.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A DeleteResponse whose ``succeeded`` / ``num_deleted`` cover only
            the IDs that were actually removed.

        Raises:
            ValueError: If ``ids`` is None.
        """
        if ids is None:
            raise ValueError("IDs must be provided for deletion")

        removed_ids = []

        for doc_id in ids:
            if doc_id not in self.store:
                continue
            del self.store[doc_id]
            removed_ids.append(doc_id)

        return DeleteResponse(
            succeeded=removed_ids,
            num_deleted=len(removed_ids),
            num_failed=0,
            failed=[],
        )

    def get(self, ids: Sequence[str], /, **kwargs: Any) -> List[Document]:
        """Fetch the stored documents for the given IDs.

        Args:
            ids: The IDs to look up.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The documents found, in the order their IDs appear in ``ids``.
            IDs that are not in the store are omitted.
        """
        return [self.store[doc_id] for doc_id in ids if doc_id in self.store]
class TestDocumentIndexerTestSuite(BaseDocumentIndexerTestSuite):
    """Run the standard document indexer test suite against InMemoryIndexer."""

    @pytest.fixture()
    def indexer(self) -> Generator[DocumentIndexer, None, None]:
        """Provide a newly constructed InMemoryIndexer to the test."""
        # Yield (rather than return) so that this function really is the
        # generator its return annotation declares.
        yield InMemoryIndexer()
class BaseDocumentIndexerTestSuite(ABC):
    """Test suite for checking the read-write behavior of a document indexer.

    Implementers should subclass this test suite and provide a fixture
    that returns an empty indexer for each test.
    """

    @abstractmethod
    @pytest.fixture
    def indexer(self) -> Generator[DocumentIndexer, None, None]:
        """Get the indexer under test; it is expected to start out empty."""

    def test_upsert_documents_has_no_ids(self, indexer: DocumentIndexer) -> None:
        """Verify that there is no parameter called ids in upsert."""
        signature = inspect.signature(indexer.upsert)
        assert "ids" not in signature.parameters

    def test_upsert_no_ids(self, indexer: DocumentIndexer) -> None:
        """Upsert works with documents that do not have IDs.

        At the moment, the ID field in documents is optional.
        """
        documents = [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = indexer.upsert(documents)
        ids = sorted(response["succeeded"])

        # Ordering is not guaranteed, so sort both the IDs and the retrieved
        # documents by ID and then check which document got the smaller ID.
        documents = indexer.get(ids)
        sorted_documents = sorted(documents, key=lambda x: x.id)

        if sorted_documents[0].page_content == "bar":
            assert sorted_documents[0] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[1]
            )
        else:
            assert sorted_documents[0] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[1]
            )

    def test_upsert_some_ids(self, indexer: DocumentIndexer) -> None:
        """Test an upsert where some docs have ids and some don't."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = indexer.upsert(documents)
        ids = response["succeeded"]
        # Validate the response before deriving the generated ID from it, so
        # that a bad response fails on a readable assertion.
        assert response["failed"] == []
        assert foo_uuid in ids
        other_id = list(set(ids) - {foo_uuid})[0]
        # Ordering is not guaranteed, so accept either order of the results.
        documents = indexer.get(ids)
        first_doc = documents[0]
        if first_doc.id == foo_uuid:
            assert documents == [
                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
                Document(page_content="bar", metadata={"id": 2}, id=other_id),
            ]
        else:
            assert documents == [
                Document(page_content="bar", metadata={"id": 2}, id=other_id),
                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
            ]

    def test_upsert_overwrites(self, indexer: DocumentIndexer) -> None:
        """Test that upsert overwrites existing content."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}),
        ]
        response = indexer.upsert(documents)
        ids = response["succeeded"]
        assert response["failed"] == []

        assert indexer.get(ids) == [
            Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid),
        ]

        # Now let's overwrite foo
        indexer.upsert(
            [Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})]
        )
        documents = indexer.get([foo_uuid])
        assert documents == [
            Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid)
        ]

    def test_delete_missing_docs(self, indexer: DocumentIndexer) -> None:
        """Verify that we can delete docs that aren't there."""
        assert indexer.get(["1"]) == []  # Should be empty.

        delete_response = indexer.delete(["1"])
        # Every key of the delete response is optional, so only check the
        # ones the implementation chose to return.
        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 0

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # There was nothing to delete!
            assert delete_response["succeeded"] == []

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    def test_delete_semantics(self, indexer: DocumentIndexer) -> None:
        """Test deletion of content has appropriate semantics."""
        # Let's index a document first.
        foo_uuid = str(uuid.UUID(int=7))
        upsert_response = indexer.upsert(
            [Document(id=foo_uuid, page_content="foo", metadata={})]
        )
        assert upsert_response == {"succeeded": [foo_uuid], "failed": []}

        delete_response = indexer.delete(["missing_id", foo_uuid])

        # Every key of the delete response is optional, so only check the
        # ones the implementation chose to return.
        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 1

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # Only the ID that actually existed counts as deleted.
            assert delete_response["succeeded"] == [foo_uuid]

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    def test_bulk_delete(self, indexer: DocumentIndexer) -> None:
        """Test that we can delete several documents at once."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
            Document(id="3", page_content="baz", metadata={"id": 3}),
        ]

        indexer.upsert(documents)
        indexer.delete(["1", "2"])
        assert indexer.get(["1", "2", "3"]) == [
            Document(page_content="baz", metadata={"id": 3}, id="3")
        ]

    def test_delete_no_args(self, indexer: DocumentIndexer) -> None:
        """Test delete with no args raises ValueError."""
        # Previously this test had no body and therefore always passed.
        with pytest.raises(ValueError):
            indexer.delete()

    def test_delete_missing_content(self, indexer: DocumentIndexer) -> None:
        """Deleting missing content should not raise an exception."""
        indexer.delete(["1"])
        indexer.delete(["1", "2", "3"])

    def test_get_with_missing_ids(self, indexer: DocumentIndexer) -> None:
        """Test get with missing IDs."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
        ]
        upsert_response = indexer.upsert(documents)
        assert upsert_response == {
            "succeeded": ["1", "2"],
            "failed": [],
        }
        retrieved_documents = indexer.get(["1", "2", "3", "4"])
        # The ordering is not guaranteed, so sort by ID before comparing.
        assert sorted(retrieved_documents, key=lambda x: x.id) == [
            Document(page_content="foo", metadata={"id": 1}, id="1"),
            Document(page_content="bar", metadata={"id": 2}, id="2"),
        ]

    def test_get_missing(self, indexer: DocumentIndexer) -> None:
        """Test get by IDs with missing IDs."""
        # This should not raise an exception
        documents = indexer.get(["1", "2", "3"])
        assert documents == []