core[minor]: Introduce DocumentIndex abstraction (#25062)

This PR adds a minimal document indexer abstraction.

The goal of this abstraction is to allow developers to create custom
retrievers that also have a standard indexing API and allow updating the
document content in them.

The abstraction comes with a test suite that can verify that the indexer
implements the correct semantics.

This is an iteration on a previous PR
(https://github.com/langchain-ai/langchain/pull/24364). The main
difference is that we're subclassing BaseRetriever in this
iteration and, as such, have consolidated the sync and async interfaces.

The main problem with the current design is that run-time search
configuration has to be specified at init rather than provided at run
time.

We will likely resolve this issue in one of two ways:

(1) Define a method (`get_retriever`) that will allow creating a
retriever at run time with a specific configuration. If we do this, we
will likely break the subclass on BaseRetriever.
(2) Generalize base retriever so it can support structured queries

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Eugene Yurtsev
2024-08-05 14:06:33 -04:00
committed by GitHub
parent e7b95e0802
commit 41dfad5104
8 changed files with 743 additions and 5 deletions

View File

@@ -0,0 +1,50 @@
"""Test in memory indexer"""
from typing import AsyncGenerator, Generator
import pytest
from langchain_standard_tests.integration_tests.indexer import (
AsyncDocumentIndexTestSuite,
DocumentIndexerTestSuite,
)
from langchain_core.documents import Document
from langchain_core.indexing.base import DocumentIndex
from langchain_core.indexing.in_memory import (
InMemoryDocumentIndex,
)
class TestDocumentIndexerTestSuite(DocumentIndexerTestSuite):
    """Supply an InMemoryDocumentIndex to the inherited DocumentIndexerTestSuite."""

    @pytest.fixture()
    def index(self) -> Generator[DocumentIndex, None, None]:
        """Yield a newly constructed InMemoryDocumentIndex as the `index` fixture."""
        in_memory_index = InMemoryDocumentIndex()
        yield in_memory_index
class TestAsyncDocumentIndexerTestSuite(AsyncDocumentIndexTestSuite):
    """Supply an InMemoryDocumentIndex to the inherited AsyncDocumentIndexTestSuite."""

    # mypy and async pytest fixtures do not cooperate here, hence the
    # ``type: ignore`` on the fixture signature below.
    @pytest.fixture()
    async def index(self) -> AsyncGenerator[DocumentIndex, None]:  # type: ignore
        """Yield a newly constructed InMemoryDocumentIndex as the `index` fixture."""
        in_memory_index = InMemoryDocumentIndex()
        yield in_memory_index
def test_sync_retriever() -> None:
    """Check the order of documents returned by `invoke` after an `upsert`.

    Two documents are upserted; each query is expected to return both of
    them, with the document whose text contains the query word listed first.
    """
    store = InMemoryDocumentIndex()
    hello_doc = Document(id="1", page_content="hello world")
    cat_doc = Document(id="2", page_content="goodbye cat")
    corpus = [hello_doc, cat_doc]
    store.upsert(corpus)

    hello_hits = store.invoke("hello")
    assert hello_hits == [hello_doc, cat_doc]

    cat_hits = store.invoke("cat")
    assert cat_hits == [cat_doc, hello_doc]
async def test_async_retriever() -> None:
    """Check the order of documents returned by `ainvoke` after an `aupsert`.

    Two documents are upserted; each query is expected to return both of
    them, with the document whose text contains the query word listed first.
    """
    store = InMemoryDocumentIndex()
    hello_doc = Document(id="1", page_content="hello world")
    cat_doc = Document(id="2", page_content="goodbye cat")
    corpus = [hello_doc, cat_doc]
    await store.aupsert(corpus)

    hello_hits = await store.ainvoke("hello")
    assert hello_hits == [hello_doc, cat_doc]

    cat_hits = await store.ainvoke("cat")
    assert cat_hits == [cat_doc, hello_doc]

View File

@@ -4,11 +4,12 @@ from langchain_core.indexing import __all__
def test_all() -> None:
"""Use to catch obvious breaking changes."""
assert __all__ == sorted(__all__, key=str.lower)
assert __all__ == [
assert set(__all__) == {
"aindex",
"DocumentIndex",
"index",
"IndexingResult",
"InMemoryRecordManager",
"RecordManager",
"UpsertResponse",
]
}

View File

@@ -7,7 +7,7 @@ from typing_extensions import TypedDict
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.indexing.base import UpsertResponse
from langchain_core.indexing import UpsertResponse
from langchain_core.vectorstores import VectorStore