mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-09 06:53:59 +00:00
core[minor]: Introduce DocumentIndex abstraction (#25062)
This PR adds a minimal document indexer abstraction. The goal of this abstraction is to allow developers to create custom retrievers that also have a standard indexing API and allow updating the document content in them. The abstraction comes with a test suite that can verify that the indexer implements the correct semantics. This is an iteration over a previous PR (https://github.com/langchain-ai/langchain/pull/24364). The main difference is that we're sub-classing from BaseRetriever in this iteration and, as a result, have consolidated the sync and async interfaces. The main problem with the current design is that run-time search configuration has to be specified at init rather than provided at run time. We will likely resolve this issue in one of two ways: (1) Define a method (`get_retriever`) that will allow creating a retriever at run time with a specific configuration. If we do this, we will likely break the subclassing on BaseRetriever. (2) Generalize the base retriever so it can support structured queries. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -0,0 +1,50 @@
|
||||
"""Test in memory indexer"""
|
||||
|
||||
from typing import AsyncGenerator, Generator
|
||||
|
||||
import pytest
|
||||
from langchain_standard_tests.integration_tests.indexer import (
|
||||
AsyncDocumentIndexTestSuite,
|
||||
DocumentIndexerTestSuite,
|
||||
)
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.indexing.base import DocumentIndex
|
||||
from langchain_core.indexing.in_memory import (
|
||||
InMemoryDocumentIndex,
|
||||
)
|
||||
|
||||
|
||||
class TestDocumentIndexerTestSuite(DocumentIndexerTestSuite):
    """Run the standard sync document-index test suite on the in-memory index."""

    @pytest.fixture()
    def index(self) -> Generator[DocumentIndex, None, None]:
        """Yield a fresh ``InMemoryDocumentIndex`` for each test."""
        in_memory_index = InMemoryDocumentIndex()
        yield in_memory_index
|
||||
|
||||
|
||||
class TestAsyncDocumentIndexerTestSuite(AsyncDocumentIndexTestSuite):
    """Run the standard async document-index test suite on the in-memory index."""

    # Something funky is going on with mypy and async pytest fixture
    @pytest.fixture()
    async def index(self) -> AsyncGenerator[DocumentIndex, None]:  # type: ignore
        """Yield a fresh ``InMemoryDocumentIndex`` for each test."""
        in_memory_index = InMemoryDocumentIndex()
        yield in_memory_index
|
||||
|
||||
|
||||
def test_sync_retriever() -> None:
    """Check sync retrieval from the in-memory index after an upsert.

    Each query is expected to return both stored documents, with the
    document containing the query term listed first.
    """
    hello_doc = Document(id="1", page_content="hello world")
    cat_doc = Document(id="2", page_content="goodbye cat")

    index = InMemoryDocumentIndex()
    index.upsert([hello_doc, cat_doc])

    hello_hits = index.invoke("hello")
    assert hello_hits == [hello_doc, cat_doc]

    cat_hits = index.invoke("cat")
    assert cat_hits == [cat_doc, hello_doc]
|
||||
|
||||
|
||||
async def test_async_retriever() -> None:
    """Check async retrieval from the in-memory index after an async upsert.

    Each query is expected to return both stored documents, with the
    document containing the query term listed first.
    """
    hello_doc = Document(id="1", page_content="hello world")
    cat_doc = Document(id="2", page_content="goodbye cat")

    index = InMemoryDocumentIndex()
    await index.aupsert([hello_doc, cat_doc])

    hello_hits = await index.ainvoke("hello")
    assert hello_hits == [hello_doc, cat_doc]

    cat_hits = await index.ainvoke("cat")
    assert cat_hits == [cat_doc, hello_doc]
|
@@ -4,11 +4,12 @@ from langchain_core.indexing import __all__
|
||||
def test_all() -> None:
|
||||
"""Use to catch obvious breaking changes."""
|
||||
assert __all__ == sorted(__all__, key=str.lower)
|
||||
assert __all__ == [
|
||||
assert set(__all__) == {
|
||||
"aindex",
|
||||
"DocumentIndex",
|
||||
"index",
|
||||
"IndexingResult",
|
||||
"InMemoryRecordManager",
|
||||
"RecordManager",
|
||||
"UpsertResponse",
|
||||
]
|
||||
}
|
||||
|
@@ -7,7 +7,7 @@ from typing_extensions import TypedDict
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.indexing.base import UpsertResponse
|
||||
from langchain_core.indexing import UpsertResponse
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user