mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-05 22:53:30 +00:00
This PR adds a minimal document indexer abstraction. The goal of this abstraction is to allow developers to create custom retrievers that also have a standard indexing API and allow updating the document content in them. The abstraction comes with a test suite that can verify that the indexer implements the correct semantics. This is an iteration over a previous PR (https://github.com/langchain-ai/langchain/pull/24364). The main difference is that we're sub-classing from BaseRetriever in this iteration and have therefore consolidated the sync and async interfaces. The main problem with the current design is that run-time search configuration has to be specified at init rather than provided at run time. We will likely resolve this issue in one of two ways: (1) Define a method (`get_retriever`) that will allow creating a retriever at run time with a specific configuration. If we do this, we will likely break the subclass on BaseRetriever. (2) Generalize the base retriever so it can support structured queries. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
393 lines
15 KiB
Python
393 lines
15 KiB
Python
"""Test suite to check index implementations."""
|
|
|
|
import inspect
|
|
import uuid
|
|
from abc import ABC, abstractmethod
|
|
from typing import AsyncGenerator, Generator
|
|
|
|
import pytest
|
|
from langchain_core.documents import Document
|
|
from langchain_core.indexing.base import DocumentIndex
|
|
|
|
|
|
class DocumentIndexerTestSuite(ABC):
    """Test suite for checking the read-write of a document index.

    Implementers should subclass this test suite and provide a fixture
    that returns an empty index for each test.
    """

    @abstractmethod
    @pytest.fixture
    def index(self) -> Generator[DocumentIndex, None, None]:
        """Get the index."""

    def test_upsert_documents_has_no_ids(self, index: DocumentIndex) -> None:
        """Verify that there is no parameter called ``ids`` in upsert."""
        signature = inspect.signature(index.upsert)
        assert "ids" not in signature.parameters

    def test_upsert_no_ids(self, index: DocumentIndex) -> None:
        """Upsert works with documents that do not have IDs.

        At the moment, the ID field in documents is optional.
        """
        documents = [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = index.upsert(documents)
        ids = sorted(response["succeeded"])

        # Ordering is not guaranteed, need to test carefully: both the ids
        # and the retrieved documents are sorted by id so they line up
        # position by position.
        documents = index.get(ids)
        sorted_documents = sorted(documents, key=lambda x: x.id)  # type: ignore

        # Which content ends up under the smaller id is not known up front,
        # so accept either pairing.
        if sorted_documents[0].page_content == "bar":
            assert sorted_documents[0] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[1]
            )
        else:
            assert sorted_documents[0] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[1]
            )

    def test_upsert_some_ids(self, index: DocumentIndex) -> None:
        """Test an upsert where some docs have ids and some don't."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = index.upsert(documents)
        ids = response["succeeded"]

        # Check the response before deriving anything from it, so that a
        # failed upsert shows up as an assertion failure, not an IndexError.
        assert response["failed"] == []
        assert foo_uuid in ids
        remaining_ids = set(ids) - {foo_uuid}
        assert remaining_ids
        other_id = next(iter(remaining_ids))

        # Ordering is not guaranteed, so compare the documents keyed by id.
        retrieved = index.get(ids)
        assert len(retrieved) == 2
        assert {doc.id: doc for doc in retrieved} == {
            foo_uuid: Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
            other_id: Document(page_content="bar", metadata={"id": 2}, id=other_id),
        }

    def test_upsert_overwrites(self, index: DocumentIndex) -> None:
        """Test that upsert overwrites existing content."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}),
        ]
        response = index.upsert(documents)
        ids = response["succeeded"]
        assert response["failed"] == []

        assert index.get(ids) == [
            Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid),
        ]

        # Now let's overwrite foo: same id, different content and metadata.
        index.upsert(
            [Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})]
        )
        documents = index.get([foo_uuid])
        assert documents == [
            Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid)
        ]

    def test_delete_missing_docs(self, index: DocumentIndex) -> None:
        """Verify that we can delete docs that aren't there."""
        assert index.get(["1"]) == []  # Should be empty.

        delete_response = index.delete(["1"])

        # Each key is only checked when the response includes it.
        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 0

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # There was nothing to delete!
            assert delete_response["succeeded"] == []

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    def test_delete_semantics(self, index: DocumentIndex) -> None:
        """Test deletion of content has appropriate semantics."""
        # Let's index a document first.
        foo_uuid = str(uuid.UUID(int=7))
        upsert_response = index.upsert(
            [Document(id=foo_uuid, page_content="foo", metadata={})]
        )
        assert upsert_response == {"succeeded": [foo_uuid], "failed": []}

        # Delete one id that does not exist together with the one that does.
        delete_response = index.delete(["missing_id", foo_uuid])

        # Each key is only checked when the response includes it.
        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 1

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # Only the document that actually existed is reported as deleted.
            assert delete_response["succeeded"] == [foo_uuid]

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    def test_bulk_delete(self, index: DocumentIndex) -> None:
        """Test that we can delete several documents at once."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
            Document(id="3", page_content="baz", metadata={"id": 3}),
        ]

        index.upsert(documents)
        index.delete(["1", "2"])
        # Only the document that was not deleted should still be retrievable.
        assert index.get(["1", "2", "3"]) == [
            Document(page_content="baz", metadata={"id": 3}, id="3")
        ]

    def test_delete_no_args(self, index: DocumentIndex) -> None:
        """Test delete with no args raises ValueError."""
        with pytest.raises(ValueError):
            index.delete()

    def test_delete_missing_content(self, index: DocumentIndex) -> None:
        """Deleting missing content should not raise an exception."""
        index.delete(["1"])
        index.delete(["1", "2", "3"])

    def test_get_with_missing_ids(self, index: DocumentIndex) -> None:
        """Test get with missing IDs."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
        ]
        upsert_response = index.upsert(documents)
        assert upsert_response == {
            "succeeded": ["1", "2"],
            "failed": [],
        }
        retrieved_documents = index.get(["1", "2", "3", "4"])
        # The ordering is not guaranteed, so sort by id before comparing.
        assert sorted(retrieved_documents, key=lambda x: x.id) == [  # type: ignore
            Document(page_content="foo", metadata={"id": 1}, id="1"),
            Document(page_content="bar", metadata={"id": 2}, id="2"),
        ]

    def test_get_missing(self, index: DocumentIndex) -> None:
        """Test get by IDs with missing IDs."""
        # This should not raise an exception
        documents = index.get(["1", "2", "3"])
        assert documents == []
|
|
|
|
|
|
class AsyncDocumentIndexTestSuite(ABC):
    """Test suite for checking the read-write of a document index.

    Implementers should subclass this test suite and provide a fixture
    that returns an empty index for each test.
    """

    @abstractmethod
    @pytest.fixture
    async def index(self) -> AsyncGenerator[DocumentIndex, None]:
        """Get the index."""

    async def test_upsert_documents_has_no_ids(self, index: DocumentIndex) -> None:
        """Verify that upsert and aupsert take no parameter called ``ids``."""
        # The rest of this suite exercises the async methods, so inspect
        # ``aupsert`` in addition to its sync counterpart.
        for method in (index.upsert, index.aupsert):
            signature = inspect.signature(method)
            assert "ids" not in signature.parameters

    async def test_upsert_no_ids(self, index: DocumentIndex) -> None:
        """Upsert works with documents that do not have IDs.

        At the moment, the ID field in documents is optional.
        """
        documents = [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = await index.aupsert(documents)
        ids = sorted(response["succeeded"])

        # Ordering is not guaranteed, need to test carefully: both the ids
        # and the retrieved documents are sorted by id so they line up
        # position by position.
        documents = await index.aget(ids)
        sorted_documents = sorted(documents, key=lambda x: x.id)  # type: ignore

        # Which content ends up under the smaller id is not known up front,
        # so accept either pairing.
        if sorted_documents[0].page_content == "bar":
            assert sorted_documents[0] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[1]
            )
        else:
            assert sorted_documents[0] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[1]
            )

    async def test_upsert_some_ids(self, index: DocumentIndex) -> None:
        """Test an upsert where some docs have ids and some don't."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = await index.aupsert(documents)
        ids = response["succeeded"]

        # Check the response before deriving anything from it, so that a
        # failed upsert shows up as an assertion failure, not an IndexError.
        assert response["failed"] == []
        assert foo_uuid in ids
        remaining_ids = set(ids) - {foo_uuid}
        assert remaining_ids
        other_id = next(iter(remaining_ids))

        # Ordering is not guaranteed, so compare the documents keyed by id.
        retrieved = await index.aget(ids)
        assert len(retrieved) == 2
        assert {doc.id: doc for doc in retrieved} == {
            foo_uuid: Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
            other_id: Document(page_content="bar", metadata={"id": 2}, id=other_id),
        }

    async def test_upsert_overwrites(self, index: DocumentIndex) -> None:
        """Test that upsert overwrites existing content."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}),
        ]
        response = await index.aupsert(documents)
        ids = response["succeeded"]
        assert response["failed"] == []

        assert await index.aget(ids) == [
            Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid),
        ]

        # Now let's overwrite foo: same id, different content and metadata.
        await index.aupsert(
            [Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})]
        )
        documents = await index.aget([foo_uuid])
        assert documents == [
            Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid)
        ]

    async def test_delete_missing_docs(self, index: DocumentIndex) -> None:
        """Verify that we can delete docs that aren't there."""
        assert await index.aget(["1"]) == []  # Should be empty.

        delete_response = await index.adelete(["1"])

        # Each key is only checked when the response includes it.
        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 0

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # There was nothing to delete!
            assert delete_response["succeeded"] == []

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    async def test_delete_semantics(self, index: DocumentIndex) -> None:
        """Test deletion of content has appropriate semantics."""
        # Let's index a document first.
        foo_uuid = str(uuid.UUID(int=7))
        upsert_response = await index.aupsert(
            [Document(id=foo_uuid, page_content="foo", metadata={})]
        )
        assert upsert_response == {"succeeded": [foo_uuid], "failed": []}

        # Delete one id that does not exist together with the one that does.
        delete_response = await index.adelete(["missing_id", foo_uuid])

        # Each key is only checked when the response includes it.
        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 1

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # Only the document that actually existed is reported as deleted.
            assert delete_response["succeeded"] == [foo_uuid]

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    async def test_bulk_delete(self, index: DocumentIndex) -> None:
        """Test that we can delete several documents at once."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
            Document(id="3", page_content="baz", metadata={"id": 3}),
        ]

        await index.aupsert(documents)
        await index.adelete(["1", "2"])
        # Only the document that was not deleted should still be retrievable.
        assert await index.aget(["1", "2", "3"]) == [
            Document(page_content="baz", metadata={"id": 3}, id="3")
        ]

    async def test_delete_no_args(self, index: DocumentIndex) -> None:
        """Test delete with no args raises ValueError."""
        with pytest.raises(ValueError):
            await index.adelete()

    async def test_delete_missing_content(self, index: DocumentIndex) -> None:
        """Deleting missing content should not raise an exception."""
        await index.adelete(["1"])
        await index.adelete(["1", "2", "3"])

    async def test_get_with_missing_ids(self, index: DocumentIndex) -> None:
        """Test get with missing IDs."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
        ]
        upsert_response = await index.aupsert(documents)
        assert upsert_response == {
            "succeeded": ["1", "2"],
            "failed": [],
        }
        retrieved_documents = await index.aget(["1", "2", "3", "4"])
        # The ordering is not guaranteed, so sort by id before comparing.
        assert sorted(retrieved_documents, key=lambda x: x.id) == [  # type: ignore
            Document(page_content="foo", metadata={"id": 1}, id="1"),
            Document(page_content="bar", metadata={"id": 2}, id="2"),
        ]

    async def test_get_missing(self, index: DocumentIndex) -> None:
        """Test get by IDs with missing IDs."""
        # This should not raise an exception
        documents = await index.aget(["1", "2", "3"])
        assert documents == []
|