update

2026-01-24 05:50:18 +00:00 · 2024-07-17 17:05:09 -04:00
parent 428b2409c7
commit ab72ad9e36
5 changed files with 303 additions and 8 deletions
--- a/libs/core/langchain_core/indexing/base.py
+++ b/libs/core/langchain_core/indexing/base.py
@@ -1,7 +1,6 @@
 from __future__ import annotations

 import abc
-import time
 from abc import ABC, abstractmethod
 from typing import (
    Any,
@@ -11,6 +10,7 @@ from typing import (
    Sequence,
 )

+import time
 from typing_extensions import TypedDict

 from langchain_core._api import beta
@@ -52,22 +52,33 @@ class DeleteResponse(TypedDict, total=False):
    """

    num_deleted: int
-    """The number of items that were successfully deleted."""
-    num_failed: int
-    """The number of items that failed to be deleted."""
+    """The number of items that were successfully deleted.
+    
+    If returned, this should only include *actual* deletions.
+    
+    If the ID did not exist to begin with, 
+    it should not be included in this count.
+    """
+
    succeeded: Sequence[str]
    """The IDs that were successfully deleted.
    
-    Should not be returned when using delete_by_filter.
+    If returned, this should only include *actual* deletions.
+    
+    If the ID did not exist to begin with,
+    it should not be included in this list.
    """
+
    failed: Sequence[str]
    """The IDs that failed to be deleted.
    
-    Should not be returned when using delete_by_filter.
-    
-    Please note that deleting an ID that does not exist is **NOT** considered a failure.
+    Please note that deleting an ID that 
+    does not exist is **NOT** considered a failure.
    """

+    num_failed: int
+    """The number of items that failed to be deleted."""
+

@beta(message="Added in ___version___. The API is subject to change.")
 class DocumentIndexer(abc.ABC):
--- a/libs/core/langchain_core/indexing/in_memory.py
+++ b/libs/core/langchain_core/indexing/in_memory.py
@@ -0,0 +1,57 @@
+import uuid
+from typing import Dict, Optional, Sequence, Any, List
+
+from langchain_core.documents import Document
+from langchain_core.indexing import UpsertResponse
+from langchain_core.indexing.base import DocumentIndexer, DeleteResponse
+
+
+class InMemoryIndexer(DocumentIndexer):
+    """Synchronous document indexer backed by a dict held in memory."""
+
+    def __init__(self, *, store: Optional[Dict[str, Document]] = None) -> None:
+        """Wrap ``store`` (document ID -> document); default is a new empty dict."""
+        self.store = {} if store is None else store
+
+    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
+        """Insert or overwrite each item by ID and report the IDs written.
+
+        Items whose ``id`` is None are stored as a copy carrying a new UUID string.
+        """
+        stored_ids = []
+        for doc in items:
+            if doc.id is not None:
+                to_store = doc
+            else:
+                # No ID yet: assign a random UUID to a copy, not the caller's object.
+                to_store = doc.copy()
+                to_store.id = str(uuid.uuid4())
+            self.store[to_store.id] = to_store
+            stored_ids.append(to_store.id)
+
+        return UpsertResponse(succeeded=stored_ids, failed=[])
+
+    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> DeleteResponse:
+        """Remove the given IDs; IDs not in the store are skipped, not failed.
+
+        Raises:
+            ValueError: If ``ids`` is None.
+        """
+        if ids is None:
+            raise ValueError("IDs must be provided for deletion")
+
+        removed = []
+        for doc_id in ids:
+            if doc_id not in self.store:
+                continue
+            del self.store[doc_id]
+            removed.append(doc_id)
+
+        num_removed = len(removed)
+        return DeleteResponse(
+            succeeded=removed, num_deleted=num_removed, num_failed=0, failed=[]
+        )
+
+    def get(self, ids: Sequence[str], /, **kwargs: Any) -> List[Document]:
+        """Return the stored documents for ``ids``, silently omitting unknown IDs."""
+        return [self.store[doc_id] for doc_id in ids if doc_id in self.store]
--- a/libs/core/tests/unit_tests/indexing/test_in_memory_indexer.py
+++ b/libs/core/tests/unit_tests/indexing/test_in_memory_indexer.py
@@ -0,0 +1,17 @@
+"""Test in memory indexer"""
+
+from typing import Generator
+
+import pytest
+
+from langchain_core.indexing import DocumentIndexer
+from langchain_core.indexing.in_memory import InMemoryIndexer
+from langchain_standard_tests.integration_tests.indexer import (
+    BaseDocumentIndexerTestSuite,
+)
+
+
+class TestDocumentIndexerTestSuite(BaseDocumentIndexerTestSuite):
+    @pytest.fixture()
+    def indexer(self) -> Generator[DocumentIndexer, None, None]:
+        yield InMemoryIndexer()  # generator fixture: fresh empty indexer per test
--- a/libs/standard-tests/langchain_standard_tests/integration_tests/__init__.py
+++ b/libs/standard-tests/langchain_standard_tests/integration_tests/__init__.py
@@ -1,7 +1,14 @@
+import pytest
+
 from langchain_standard_tests.integration_tests.chat_models import (
    ChatModelIntegrationTests,
 )

+# Opt the shared indexer test suite into pytest's assertion rewriting so that
+# implementers see full failure details (pytest docs: register before import).
+# https://docs.pytest.org/en/7.1.x/how-to/writing_plugins.html#assertion-rewriting
+pytest.register_assert_rewrite("langchain_standard_tests.integration_tests.indexer")
+
 __all__ = [
     "ChatModelIntegrationTests",
 ]
--- a/libs/standard-tests/langchain_standard_tests/integration_tests/indexer.py
+++ b/libs/standard-tests/langchain_standard_tests/integration_tests/indexer.py
@@ -0,0 +1,203 @@
+"""Test suite to check indexer implementations."""
+import inspect
+import uuid
+from abc import ABC, abstractmethod
+from typing import Generator
+
+import pytest
+from langchain_core.documents import Document
+from langchain_core.indexing import DocumentIndexer
+
+# Arbitrarily chosen small embedding size (faster, easier-to-debug tests).
+# NOTE(review): nothing in this module references it — confirm it is still needed.
+EMBEDDING_SIZE = 6
+
+
+class BaseDocumentIndexerTestSuite(ABC):
+    """Test suite for checking the read-write behavior of a document indexer.
+
+    Implementers should subclass this test suite and provide an ``indexer``
+    fixture that supplies an empty indexer for each test.
+    """
+
+    @abstractmethod
+    @pytest.fixture
+    def indexer(self) -> Generator[DocumentIndexer, None, None]:
+        """Get the indexer."""
+
+    def test_upsert_documents_has_no_ids(self, indexer: DocumentIndexer) -> None:
+        """Verify that there is no parameter called ids in upsert."""
+        signature = inspect.signature(indexer.upsert)
+        assert "ids" not in signature.parameters
+
+    def test_upsert_no_ids(self, indexer: DocumentIndexer) -> None:
+        """Upsert works with documents that do not have IDs.
+
+        At the moment, the ID field in documents is optional.
+        """
+        documents = [
+            Document(page_content="foo", metadata={"id": 1}),
+            Document(page_content="bar", metadata={"id": 2}),
+        ]
+        response = indexer.upsert(documents)
+        ids = sorted(response["succeeded"])
+
+        # Ordering is not guaranteed, so sort both sides by ID and branch on content.
+        documents = indexer.get(ids)
+        sorted_documents = sorted(documents, key=lambda x: x.id)
+
+        if sorted_documents[0].page_content == "bar":
+            assert sorted_documents[0] == Document(
+                page_content="bar", metadata={"id": 2}, id=ids[0]
+            )
+            assert sorted_documents[1] == Document(
+                page_content="foo", metadata={"id": 1}, id=ids[1]
+            )
+        else:
+            assert sorted_documents[0] == Document(
+                page_content="foo", metadata={"id": 1}, id=ids[0]
+            )
+            assert sorted_documents[1] == Document(
+                page_content="bar", metadata={"id": 2}, id=ids[1]
+            )
+
+    def test_upsert_some_ids(self, indexer: DocumentIndexer) -> None:
+        """Test an upsert where some docs have ids and some don't."""
+        foo_uuid = str(uuid.UUID(int=7))
+        documents = [
+            Document(id=foo_uuid, page_content="foo", metadata={"id": 1}),
+            Document(page_content="bar", metadata={"id": 2}),
+        ]
+        response = indexer.upsert(documents)
+        ids = response["succeeded"]
+        other_id = list(set(ids) - {foo_uuid})[0]
+        assert response["failed"] == []
+        assert foo_uuid in ids
+        # Ordering is not guaranteed, so accept either order below.
+        documents = indexer.get(ids)
+        first_doc = documents[0]
+        if first_doc.id == foo_uuid:
+            assert documents == [
+                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
+                Document(page_content="bar", metadata={"id": 2}, id=other_id),
+            ]
+        else:
+            assert documents == [
+                Document(page_content="bar", metadata={"id": 2}, id=other_id),
+                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
+            ]
+
+    def test_upsert_overwrites(self, indexer: DocumentIndexer) -> None:
+        """Test that upsert overwrites existing content."""
+        foo_uuid = str(uuid.UUID(int=7))
+        documents = [
+            Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}),
+        ]
+        response = indexer.upsert(documents)
+        ids = response["succeeded"]
+        assert response["failed"] == []
+
+        assert indexer.get(ids) == [
+            Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid),
+        ]
+
+        # Now let's overwrite foo
+        indexer.upsert(
+            [Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})]
+        )
+        documents = indexer.get([foo_uuid])
+        assert documents == [
+            Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid)
+        ]
+
+    def test_delete_missing_docs(self, indexer: DocumentIndexer) -> None:
+        """Verify that we can delete docs that aren't there."""
+        assert indexer.get(["1"]) == []  # Should be empty.
+
+        delete_response = indexer.delete(["1"])
+        if "num_deleted" in delete_response:
+            assert delete_response["num_deleted"] == 0
+
+        if "num_failed" in delete_response:
+            # Deleting a missing ID is **not** a failure!!
+            assert delete_response["num_failed"] == 0
+
+        if "succeeded" in delete_response:
+            # There was nothing to delete!
+            assert delete_response["succeeded"] == []
+
+        if "failed" in delete_response:
+            # Nothing should have failed
+            assert delete_response["failed"] == []
+
+    def test_delete_semantics(self, indexer: DocumentIndexer) -> None:
+        """Test deletion of content has appropriate semantics."""
+        # Let's index a document first.
+        foo_uuid = str(uuid.UUID(int=7))
+        upsert_response = indexer.upsert(
+            [Document(id=foo_uuid, page_content="foo", metadata={})]
+        )
+        assert upsert_response == {"succeeded": [foo_uuid], "failed": []}
+
+        delete_response = indexer.delete(["missing_id", foo_uuid])
+
+        if "num_deleted" in delete_response:
+            assert delete_response["num_deleted"] == 1
+
+        if "num_failed" in delete_response:
+            # Deleting a missing ID is **not** a failure!!
+            assert delete_response["num_failed"] == 0
+
+        if "succeeded" in delete_response:
+            # Only the ID that actually existed counts as deleted.
+            assert delete_response["succeeded"] == [foo_uuid]
+
+        if "failed" in delete_response:
+            # Nothing should have failed
+            assert delete_response["failed"] == []
+
+    def test_bulk_delete(self, indexer: DocumentIndexer) -> None:
+        """Test that we can delete several documents at once."""
+        documents = [
+            Document(id="1", page_content="foo", metadata={"id": 1}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
+            Document(id="3", page_content="baz", metadata={"id": 3}),
+        ]
+
+        indexer.upsert(documents)
+        indexer.delete(["1", "2"])
+        assert indexer.get(["1", "2", "3"]) == [
+            Document(page_content="baz", metadata={"id": 3}, id="3")
+        ]
+
+    def test_delete_no_args(self, indexer: DocumentIndexer) -> None:
+        """Test delete with no args raises ValueError."""
+        # Without IDs there is nothing to target; the indexer must refuse loudly.
+        with pytest.raises(ValueError):
+            indexer.delete()
+
+    def test_delete_missing_content(self, indexer: DocumentIndexer) -> None:
+        """Deleting missing content should not raise an exception."""
+        indexer.delete(["1"])
+        indexer.delete(["1", "2", "3"])
+
+    def test_get_with_missing_ids(self, indexer: DocumentIndexer) -> None:
+        """Test get with missing IDs."""
+        documents = [
+            Document(id="1", page_content="foo", metadata={"id": 1}),
+            Document(id="2", page_content="bar", metadata={"id": 2}),
+        ]
+        upsert_response = indexer.upsert(documents)
+        assert upsert_response == {"succeeded": ["1", "2"], "failed": []}
+        retrieved_documents = indexer.get(["1", "2", "3", "4"])
+        # The ordering is not guaranteed, so sort by ID before comparing.
+        assert sorted(retrieved_documents, key=lambda x: x.id) == [
+            Document(page_content="foo", metadata={"id": 1}, id="1"),
+            Document(page_content="bar", metadata={"id": 2}, id="2"),
+        ]
+
+    def test_get_missing(self, indexer: DocumentIndexer) -> None:
+        """Test get by IDs with missing IDs."""
+        # This should not raise an exception
+        documents = indexer.get(["1", "2", "3"])
+        assert documents == []