mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-03 20:16:52 +00:00
core[patch]: Deprecating beta upsert APIs in vectorstore (#25069)
This PR deprecates the beta upsert APIs in vectorstore. We'll introduce them in a V2 abstraction instead to keep the existing vectorstore implementations lighter weight. The main problem with the existing APIs is that it's a bit more challenging to implement the correct behavior w/ respect to IDs since ID can be present in both the function signature and as an optional attribute on the document object. But VectorStores that pass the standard tests should have implemented the semantics properly! --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -1,69 +1,50 @@
|
||||
"""Set of tests that complement the standard tests for vectorstore.
|
||||
|
||||
These tests verify that the base abstraction does appropriate delegation to
|
||||
the relevant methods.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional, Sequence, Union
|
||||
|
||||
from typing_extensions import TypedDict
|
||||
from typing import Any, Dict, Iterable, List, Optional, Sequence
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.indexing import UpsertResponse
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
|
||||
def test_custom_upsert_type() -> None:
|
||||
"""Test that we can override the signature of the upsert method
|
||||
of the VectorStore class without creating typing issues by violating
|
||||
the Liskov Substitution Principle.
|
||||
"""
|
||||
|
||||
class ByVector(TypedDict):
|
||||
document: Document
|
||||
vector: List[float]
|
||||
|
||||
class CustomVectorStore(VectorStore):
|
||||
def upsert(
|
||||
# This unit test verifies that the signature of the upsert method
|
||||
# specifically the items parameter can be overridden without
|
||||
# violating the Liskov Substitution Principle (and getting
|
||||
# typing errors).
|
||||
self,
|
||||
items: Union[Sequence[Document], Sequence[ByVector]],
|
||||
/,
|
||||
**kwargs: Any,
|
||||
) -> UpsertResponse:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class CustomSyncVectorStore(VectorStore):
|
||||
"""A vectorstore that only implements the synchronous methods."""
|
||||
class CustomAddTextsVectorstore(VectorStore):
|
||||
"""A vectorstore that only implements add texts."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.store: Dict[str, Document] = {}
|
||||
|
||||
def upsert(
|
||||
def add_texts(
|
||||
self,
|
||||
items: Sequence[Document],
|
||||
/,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
# One of the kwargs should be `ids` which is a list of ids
|
||||
# associated with the texts.
|
||||
# This is not yet enforced in the type signature for backwards compatibility
|
||||
# with existing implementations.
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> UpsertResponse:
|
||||
ids = []
|
||||
for item in items:
|
||||
if item.id is None:
|
||||
new_item = item.copy()
|
||||
id_: str = str(uuid.uuid4())
|
||||
new_item.id = id_
|
||||
else:
|
||||
id_ = item.id
|
||||
new_item = item
|
||||
) -> List[str]:
|
||||
if not isinstance(texts, list):
|
||||
texts = list(texts)
|
||||
ids_iter = iter(ids or [])
|
||||
|
||||
self.store[id_] = new_item
|
||||
ids.append(id_)
|
||||
ids_ = []
|
||||
|
||||
return {
|
||||
"succeeded": ids,
|
||||
"failed": [],
|
||||
}
|
||||
metadatas_ = metadatas or [{} for _ in texts]
|
||||
|
||||
for text, metadata in zip(texts, metadatas_ or []):
|
||||
next_id = next(ids_iter, None)
|
||||
id_ = next_id or str(uuid.uuid4())
|
||||
self.store[id_] = Document(page_content=text, metadata=metadata, id=id_)
|
||||
ids_.append(id_)
|
||||
return ids_
|
||||
|
||||
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
||||
return [self.store[id] for id in ids if id in self.store]
|
||||
@@ -74,8 +55,8 @@ class CustomSyncVectorStore(VectorStore):
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> CustomSyncVectorStore:
|
||||
vectorstore = CustomSyncVectorStore()
|
||||
) -> CustomAddTextsVectorstore:
|
||||
vectorstore = CustomAddTextsVectorstore()
|
||||
vectorstore.add_texts(texts, metadatas=metadatas, **kwargs)
|
||||
return vectorstore
|
||||
|
||||
@@ -85,30 +66,38 @@ class CustomSyncVectorStore(VectorStore):
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
def test_implement_upsert() -> None:
|
||||
def test_default_add_documents() -> None:
|
||||
"""Test that we can implement the upsert method of the CustomVectorStore
|
||||
class without violating the Liskov Substitution Principle.
|
||||
"""
|
||||
|
||||
store = CustomSyncVectorStore()
|
||||
store = CustomAddTextsVectorstore()
|
||||
|
||||
# Check upsert with id
|
||||
assert store.upsert([Document(id="1", page_content="hello")]) == {
|
||||
"succeeded": ["1"],
|
||||
"failed": [],
|
||||
}
|
||||
assert store.add_documents([Document(id="1", page_content="hello")]) == ["1"]
|
||||
|
||||
assert store.get_by_ids(["1"]) == [Document(id="1", page_content="hello")]
|
||||
|
||||
# Check upsert without id
|
||||
response = store.upsert([Document(page_content="world")])
|
||||
assert len(response["succeeded"]) == 1
|
||||
id_ = response["succeeded"][0]
|
||||
assert id_ is not None
|
||||
assert store.get_by_ids([id_]) == [Document(id=id_, page_content="world")]
|
||||
ids = store.add_documents([Document(page_content="world")])
|
||||
assert len(ids) == 1
|
||||
assert store.get_by_ids(ids) == [Document(id=ids[0], page_content="world")]
|
||||
|
||||
# Check that add_documents works
|
||||
assert store.add_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||
|
||||
# Test add documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
assert store.add_documents([original_document], ids=["6"]) == ["6"]
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
||||
|
||||
def test_default_add_texts() -> None:
|
||||
store = CustomAddTextsVectorstore()
|
||||
# Check that default implementation of add_texts works
|
||||
assert store.add_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
|
||||
|
||||
assert store.get_by_ids(["3", "4"]) == [
|
||||
Document(id="3", page_content="hello"),
|
||||
Document(id="4", page_content="world"),
|
||||
@@ -130,39 +119,37 @@ def test_implement_upsert() -> None:
|
||||
Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
|
||||
]
|
||||
|
||||
# Check that add_documents works
|
||||
assert store.add_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||
|
||||
# Test add documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
assert store.add_documents([original_document], ids=["6"]) == ["6"]
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
||||
|
||||
async def test_aupsert_delegation_to_upsert() -> None:
|
||||
"""Test delegation to the synchronous upsert method in async execution
|
||||
if async methods are not implemented.
|
||||
"""
|
||||
store = CustomSyncVectorStore()
|
||||
async def test_default_aadd_documents() -> None:
|
||||
"""Test delegation to the synchronous method."""
|
||||
store = CustomAddTextsVectorstore()
|
||||
|
||||
# Check upsert with id
|
||||
assert await store.aupsert([Document(id="1", page_content="hello")]) == {
|
||||
"succeeded": ["1"],
|
||||
"failed": [],
|
||||
}
|
||||
assert await store.aadd_documents([Document(id="1", page_content="hello")]) == ["1"]
|
||||
|
||||
assert await store.aget_by_ids(["1"]) == [Document(id="1", page_content="hello")]
|
||||
|
||||
# Check upsert without id
|
||||
response = await store.aupsert([Document(page_content="world")])
|
||||
assert len(response["succeeded"]) == 1
|
||||
id_ = response["succeeded"][0]
|
||||
assert id_ is not None
|
||||
assert await store.aget_by_ids([id_]) == [Document(id=id_, page_content="world")]
|
||||
ids = await store.aadd_documents([Document(page_content="world")])
|
||||
assert len(ids) == 1
|
||||
assert await store.aget_by_ids(ids) == [Document(id=ids[0], page_content="world")]
|
||||
|
||||
# Check that add_documents works
|
||||
assert await store.aadd_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||
|
||||
# Test add documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
assert await store.aadd_documents([original_document], ids=["6"]) == ["6"]
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
||||
|
||||
async def test_default_aadd_texts() -> None:
|
||||
"""Test delegation to the synchronous method."""
|
||||
store = CustomAddTextsVectorstore()
|
||||
# Check that default implementation of add_texts works
|
||||
assert await store.aadd_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
|
||||
|
||||
assert await store.aget_by_ids(["3", "4"]) == [
|
||||
Document(id="3", page_content="hello"),
|
||||
Document(id="4", page_content="world"),
|
||||
@@ -183,12 +170,3 @@ async def test_aupsert_delegation_to_upsert() -> None:
|
||||
Document(id=ids_2[0], page_content="foo", metadata={"foo": "bar"}),
|
||||
Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
|
||||
]
|
||||
|
||||
# Check that add_documents works
|
||||
assert await store.aadd_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||
|
||||
# Test add documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
assert await store.aadd_documents([original_document], ids=["6"]) == ["6"]
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
Reference in New Issue
Block a user