This commit is contained in:
Eugene Yurtsev
2024-07-17 17:05:09 -04:00
parent 428b2409c7
commit ab72ad9e36
5 changed files with 303 additions and 8 deletions

View File

@@ -1,7 +1,6 @@
from __future__ import annotations
import abc
import time
from abc import ABC, abstractmethod
from typing import (
Any,
@@ -11,6 +10,7 @@ from typing import (
Sequence,
)
import time
from typing_extensions import TypedDict
from langchain_core._api import beta
@@ -52,22 +52,33 @@ class DeleteResponse(TypedDict, total=False):
"""
num_deleted: int
"""The number of items that were successfully deleted."""
num_failed: int
"""The number of items that failed to be deleted."""
"""The number of items that were successfully deleted.
If returned, this should only include *actual* deletions.
If the ID did not exist to begin with,
it should not be included in this count.
"""
succeeded: Sequence[str]
"""The IDs that were successfully deleted.
Should not be returned when using delete_by_filter.
If returned, this should only include *actual* deletions.
If the ID did not exist to begin with,
it should not be included in this list.
"""
failed: Sequence[str]
"""The IDs that failed to be deleted.
Should not be returned when using delete_by_filter.
Please note that deleting an ID that does not exist is **NOT** considered a failure.
Please note that deleting an ID that
does not exist is **NOT** considered a failure.
"""
num_failed: int
"""The number of items that failed to be deleted."""
@beta(message="Added in ___version___. The API is subject to change.")
class DocumentIndexer(abc.ABC):

View File

@@ -0,0 +1,57 @@
import uuid
from typing import Dict, Optional, Sequence, Any, List
from langchain_core.documents import Document
from langchain_core.indexing import UpsertResponse
from langchain_core.indexing.base import DocumentIndexer, DeleteResponse
class InMemoryIndexer(DocumentIndexer):
    """Synchronous document indexer backed by a plain in-process dict.

    Documents are kept in ``self.store``, keyed by their string ID.
    """

    def __init__(self, *, store: Optional[Dict[str, Document]] = None) -> None:
        """Create the indexer.

        Args:
            store: Optional mapping of document ID to document to use as the
                backing storage. A new empty dict is used when omitted.
        """
        self.store = {} if store is None else store

    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
        """Insert or overwrite documents in the store.

        A document without an ID is copied and the copy is given a freshly
        generated UUID4 string as its ID; the caller's object is left
        untouched. A document that already has an ID is stored as-is,
        replacing any existing entry under that ID.

        Args:
            items: The documents to upsert.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            An ``UpsertResponse`` whose ``succeeded`` lists the ID of every
            stored document in input order, with an empty ``failed`` list.
        """
        stored_ids = []
        for document in items:
            if document.id is not None:
                to_store = document
            else:
                generated = uuid.uuid4()
                to_store = document.copy()
                to_store.id = str(generated)
            self.store[to_store.id] = to_store
            stored_ids.append(to_store.id)
        return UpsertResponse(succeeded=stored_ids, failed=[])

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> DeleteResponse:
        """Delete documents by ID.

        IDs that are not present in the store are skipped; they are neither
        counted as deleted nor reported as failures.

        Args:
            ids: The IDs to delete.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A ``DeleteResponse`` listing the IDs that were actually removed.

        Raises:
            ValueError: If ``ids`` is None.
        """
        if ids is None:
            raise ValueError("IDs must be provided for deletion")

        removed_ids = []
        for doc_id in ids:
            if doc_id not in self.store:
                continue
            del self.store[doc_id]
            removed_ids.append(doc_id)

        return DeleteResponse(
            succeeded=removed_ids,
            num_deleted=len(removed_ids),
            num_failed=0,
            failed=[],
        )

    def get(self, ids: Sequence[str], /, **kwargs: Any) -> List[Document]:
        """Fetch documents by ID.

        Args:
            ids: The IDs to look up.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The stored documents for the IDs that exist, in the order the
            IDs were requested. Unknown IDs are silently omitted.
        """
        return [self.store[doc_id] for doc_id in ids if doc_id in self.store]

View File

@@ -0,0 +1,17 @@
"""Test in memory indexer"""
from typing import Generator
import pytest
from langchain_core.indexing import DocumentIndexer
from langchain_core.indexing.in_memory import InMemoryIndexer
from langchain_standard_tests.integration_tests.indexer import (
BaseDocumentIndexerTestSuite,
)
class TestDocumentIndexerTestSuite(BaseDocumentIndexerTestSuite):
    """Run the standard document indexer test suite against InMemoryIndexer."""

    @pytest.fixture()
    def indexer(self) -> Generator[DocumentIndexer, None, None]:
        """Provide a new, empty in-memory indexer for each test.

        Written as a yield-fixture so that the function really is a
        generator, matching its ``Generator[...]`` return annotation.
        """
        yield InMemoryIndexer()

View File

@@ -1,7 +1,14 @@
import pytest
from langchain_standard_tests.integration_tests.chat_models import (
ChatModelIntegrationTests,
)
# Ask pytest to rewrite the assert statements of the indexer test suite so
# that implementations subclassing it see the full, introspected error
# message when one of its asserts fails.
# NOTE(review): per the pytest docs linked below, registration should happen
# before the named module is first imported — keep this call ahead of any
# import of the indexer suite.
# https://docs.pytest.org/en/7.1.x/how-to/writing_plugins.html#assertion-rewriting
pytest.register_assert_rewrite("langchain_standard_tests.integration_tests.indexer")
# Names exported by this package.
__all__ = [
"ChatModelIntegrationTests",
]

View File

@@ -0,0 +1,203 @@
"""Test suite to check indexer implementations."""
import inspect
import uuid
from abc import ABC, abstractmethod
from typing import Generator
import pytest
from langchain_core.documents import Document
from langchain_core.indexing import DocumentIndexer
# Arbitrarily chosen. Using a small embedding size
# so tests are faster and easier to debug.
# NOTE(review): not referenced by any test visible in this module — confirm
# whether it is used elsewhere before relying on or removing it.
EMBEDDING_SIZE = 6
class BaseDocumentIndexerTestSuite(ABC):
    """Test suite for checking the read-write of a document indexer.

    Implementers should subclass this test suite and provide a fixture
    that returns an empty indexer for each test.
    """

    @abstractmethod
    @pytest.fixture
    def indexer(self) -> Generator[DocumentIndexer, None, None]:
        """Get the indexer.

        Subclasses override this fixture to supply the indexer under test.
        Every test below assumes the indexer starts out empty.
        """

    def test_upsert_documents_has_no_ids(self, indexer: DocumentIndexer) -> None:
        """Verify that there is no parameter called ids in upsert."""
        signature = inspect.signature(indexer.upsert)
        assert "ids" not in signature.parameters

    def test_upsert_no_ids(self, indexer: DocumentIndexer) -> None:
        """Upsert works with documents that do not have IDs.

        At the moment, the ID field in documents is optional.
        """
        documents = [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = indexer.upsert(documents)
        ids = sorted(response["succeeded"])

        # Ordering is not guaranteed and the IDs are generated by the indexer,
        # so sort both sides by ID and accept either assignment of IDs to
        # documents.
        documents = indexer.get(ids)
        sorted_documents = sorted(documents, key=lambda x: x.id)

        if sorted_documents[0].page_content == "bar":
            assert sorted_documents[0] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[1]
            )
        else:
            assert sorted_documents[0] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[1]
            )

    def test_upsert_some_ids(self, indexer: DocumentIndexer) -> None:
        """Test an upsert where some docs have ids and some don't."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = indexer.upsert(documents)
        ids = response["succeeded"]

        # Check the response before deriving the generated ID from it, so a
        # bad response fails with a readable assertion, not an IndexError.
        assert response["failed"] == []
        assert foo_uuid in ids
        other_id = list(set(ids) - {foo_uuid})[0]

        # Ordering is not guaranteed, so accept the documents in either order.
        documents = indexer.get(ids)
        first_doc = documents[0]
        if first_doc.id == foo_uuid:
            assert documents == [
                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
                Document(page_content="bar", metadata={"id": 2}, id=other_id),
            ]
        else:
            assert documents == [
                Document(page_content="bar", metadata={"id": 2}, id=other_id),
                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
            ]

    def test_upsert_overwrites(self, indexer: DocumentIndexer) -> None:
        """Test that upsert overwrites existing content."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}),
        ]
        response = indexer.upsert(documents)
        ids = response["succeeded"]
        assert response["failed"] == []

        assert indexer.get(ids) == [
            Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid),
        ]

        # Now let's overwrite foo
        indexer.upsert(
            [Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})]
        )
        documents = indexer.get([foo_uuid])
        assert documents == [
            Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid)
        ]

    def test_delete_missing_docs(self, indexer: DocumentIndexer) -> None:
        """Verify that we can delete docs that aren't there."""
        assert indexer.get(["1"]) == []  # Should be empty.

        delete_response = indexer.delete(["1"])

        # Every key of the delete response is optional, so only check the
        # ones the implementation chose to return.
        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 0

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # There was nothing to delete!
            assert delete_response["succeeded"] == []

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    def test_delete_semantics(self, indexer: DocumentIndexer) -> None:
        """Test deletion of content has appropriate semantics."""
        # Let's index a document first.
        foo_uuid = str(uuid.UUID(int=7))
        upsert_response = indexer.upsert(
            [Document(id=foo_uuid, page_content="foo", metadata={})]
        )
        assert upsert_response == {"succeeded": [foo_uuid], "failed": []}

        delete_response = indexer.delete(["missing_id", foo_uuid])

        # Every key of the delete response is optional, so only check the
        # ones the implementation chose to return.
        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 1

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # Only the document that actually existed counts as deleted.
            assert delete_response["succeeded"] == [foo_uuid]

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    def test_bulk_delete(self, indexer: DocumentIndexer) -> None:
        """Test that we can delete several documents at once."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
            Document(id="3", page_content="baz", metadata={"id": 3}),
        ]

        indexer.upsert(documents)
        indexer.delete(["1", "2"])
        assert indexer.get(["1", "2", "3"]) == [
            Document(page_content="baz", metadata={"id": 3}, id="3")
        ]

    def test_delete_no_args(self, indexer: DocumentIndexer) -> None:
        """Test delete with no args raises ValueError."""
        with pytest.raises(ValueError):
            indexer.delete()

    def test_delete_missing_content(self, indexer: DocumentIndexer) -> None:
        """Deleting missing content should not raise an exception."""
        indexer.delete(["1"])
        indexer.delete(["1", "2", "3"])

    def test_get_with_missing_ids(self, indexer: DocumentIndexer) -> None:
        """Test get with missing IDs."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
        ]
        upsert_response = indexer.upsert(documents)
        assert upsert_response == {
            "succeeded": ["1", "2"],
            "failed": [],
        }
        retrieved_documents = indexer.get(["1", "2", "3", "4"])
        # The ordering is not guaranteed, so sort by ID before comparing.
        assert sorted(retrieved_documents, key=lambda x: x.id) == [
            Document(page_content="foo", metadata={"id": 1}, id="1"),
            Document(page_content="bar", metadata={"id": 2}, id="2"),
        ]

    def test_get_missing(self, indexer: DocumentIndexer) -> None:
        """Test get by IDs with missing IDs."""
        # This should not raise an exception
        documents = indexer.get(["1", "2", "3"])
        assert documents == []