mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-02 19:47:13 +00:00
core[minor]: add upsert, streaming_upsert, aupsert, astreaming_upsert methods to the VectorStore abstraction (#23774)
This PR rolls out part of the newly proposed interface for vectorstores (https://github.com/langchain-ai/langchain/pull/23544) to existing store implementations. The PR makes the following changes: 1. Adds standard upsert, streaming_upsert, aupsert, astreaming_upsert methods to the vectorstore. 2. Updates `add_texts` and `aadd_texts` to be optional, with a default implementation that delegates to `upsert` and `aupsert` if those have been implemented. The original `add_texts` and `aadd_texts` methods are problematic as they spread object-specific information across the document and `**kwargs` (e.g., ids are not a part of the document). 3. Adds a default implementation to `add_documents` and `aadd_documents` that delegates to `upsert` and `aupsert` respectively. 4. Adds standard unit tests to verify that a given vectorstore implements a correct read/write API. A downside of this implementation is that it creates `upsert` with a very similar signature to `add_documents`. The reason for introducing `upsert` is to: * Remove any ambiguities about what information is allowed in `kwargs`. Specifically, `kwargs` should only be used for information common to all indexed data (e.g., indexing timeout). * Allow inheriting from an anticipated generalized interface for indexing that will allow indexing `BaseMedia` (i.e., allow making a vectorstore for images/audio etc.). `add_documents` can be deprecated in the future in favor of `upsert` to make sure that users have a single correct way of indexing content. --------- Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
@@ -46,15 +46,21 @@ class ReadWriteTestSuite(ABC):
|
||||
|
||||
def test_add_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Test adding documents into the vectorstore."""
|
||||
documents = [
|
||||
original_documents = [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
vectorstore.add_documents(documents)
|
||||
ids = vectorstore.add_documents(original_documents)
|
||||
documents = vectorstore.similarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
|
||||
Document(page_content="foo", metadata={"id": 1}, id=ids[0]),
|
||||
]
|
||||
# Verify that the original document object does not get mutated!
|
||||
# (e.g., an ID is added to the original document object)
|
||||
assert original_documents == [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
||||
def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None:
|
||||
@@ -71,10 +77,11 @@ class ReadWriteTestSuite(ABC):
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
vectorstore.add_documents(documents, ids=["1", "2"])
|
||||
ids = vectorstore.add_documents(documents, ids=["1", "2"])
|
||||
assert ids == ["1", "2"]
|
||||
vectorstore.delete(["1"])
|
||||
documents = vectorstore.similarity_search("foo", k=1)
|
||||
assert documents == [Document(page_content="bar", metadata={"id": 2})]
|
||||
assert documents == [Document(page_content="bar", metadata={"id": 2}, id="2")]
|
||||
|
||||
def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Test that we can delete several documents at once."""
|
||||
@@ -87,7 +94,7 @@ class ReadWriteTestSuite(ABC):
|
||||
vectorstore.add_documents(documents, ids=["1", "2", "3"])
|
||||
vectorstore.delete(["1", "2"])
|
||||
documents = vectorstore.similarity_search("foo", k=1)
|
||||
assert documents == [Document(page_content="baz", metadata={"id": 3})]
|
||||
assert documents == [Document(page_content="baz", metadata={"id": 3}, id="3")]
|
||||
|
||||
def test_delete_missing_content(self, vectorstore: VectorStore) -> None:
|
||||
"""Deleting missing content should not raise an exception."""
|
||||
@@ -106,25 +113,8 @@ class ReadWriteTestSuite(ABC):
|
||||
vectorstore.add_documents(documents, ids=["1", "2"])
|
||||
documents = vectorstore.similarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
]
|
||||
|
||||
def test_add_documents_without_ids_gets_duplicated(
|
||||
self, vectorstore: VectorStore
|
||||
) -> None:
|
||||
"""Adding documents without specifying IDs should duplicate content."""
|
||||
documents = [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
||||
vectorstore.add_documents(documents)
|
||||
vectorstore.add_documents(documents)
|
||||
documents = vectorstore.similarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}, id="2"),
|
||||
Document(page_content="foo", metadata={"id": 1}, id="1"),
|
||||
]
|
||||
|
||||
def test_add_documents_by_id_with_mutation(self, vectorstore: VectorStore) -> None:
|
||||
@@ -149,9 +139,11 @@ class ReadWriteTestSuite(ABC):
|
||||
documents = vectorstore.similarity_search("new foo", k=2)
|
||||
assert documents == [
|
||||
Document(
|
||||
page_content="new foo", metadata={"id": 1, "some_other_field": "foo"}
|
||||
id="1",
|
||||
page_content="new foo",
|
||||
metadata={"id": 1, "some_other_field": "foo"},
|
||||
),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(id="2", page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
||||
|
||||
@@ -190,15 +182,22 @@ class AsyncReadWriteTestSuite(ABC):
|
||||
|
||||
async def test_add_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Test adding documents into the vectorstore."""
|
||||
documents = [
|
||||
original_documents = [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
await vectorstore.aadd_documents(documents)
|
||||
ids = await vectorstore.aadd_documents(original_documents)
|
||||
documents = await vectorstore.asimilarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
|
||||
Document(page_content="foo", metadata={"id": 1}, id=ids[0]),
|
||||
]
|
||||
|
||||
# Verify that the original document object does not get mutated!
|
||||
# (e.g., an ID is added to the original document object)
|
||||
assert original_documents == [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
||||
async def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None:
|
||||
@@ -215,10 +214,11 @@ class AsyncReadWriteTestSuite(ABC):
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
await vectorstore.aadd_documents(documents, ids=["1", "2"])
|
||||
ids = await vectorstore.aadd_documents(documents, ids=["1", "2"])
|
||||
assert ids == ["1", "2"]
|
||||
await vectorstore.adelete(["1"])
|
||||
documents = await vectorstore.asimilarity_search("foo", k=1)
|
||||
assert documents == [Document(page_content="bar", metadata={"id": 2})]
|
||||
assert documents == [Document(page_content="bar", metadata={"id": 2}, id="2")]
|
||||
|
||||
async def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Test that we can delete several documents at once."""
|
||||
@@ -231,7 +231,7 @@ class AsyncReadWriteTestSuite(ABC):
|
||||
await vectorstore.aadd_documents(documents, ids=["1", "2", "3"])
|
||||
await vectorstore.adelete(["1", "2"])
|
||||
documents = await vectorstore.asimilarity_search("foo", k=1)
|
||||
assert documents == [Document(page_content="baz", metadata={"id": 3})]
|
||||
assert documents == [Document(page_content="baz", metadata={"id": 3}, id="3")]
|
||||
|
||||
async def test_delete_missing_content(self, vectorstore: VectorStore) -> None:
|
||||
"""Deleting missing content should not raise an exception."""
|
||||
@@ -250,25 +250,8 @@ class AsyncReadWriteTestSuite(ABC):
|
||||
await vectorstore.aadd_documents(documents, ids=["1", "2"])
|
||||
documents = await vectorstore.asimilarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
]
|
||||
|
||||
async def test_add_documents_without_ids_gets_duplicated(
|
||||
self, vectorstore: VectorStore
|
||||
) -> None:
|
||||
"""Adding documents without specifying IDs should duplicate content."""
|
||||
documents = [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
||||
await vectorstore.aadd_documents(documents)
|
||||
await vectorstore.aadd_documents(documents)
|
||||
documents = await vectorstore.asimilarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}, id="2"),
|
||||
Document(page_content="foo", metadata={"id": 1}, id="1"),
|
||||
]
|
||||
|
||||
async def test_add_documents_by_id_with_mutation(
|
||||
@@ -295,7 +278,9 @@ class AsyncReadWriteTestSuite(ABC):
|
||||
documents = await vectorstore.asimilarity_search("new foo", k=2)
|
||||
assert documents == [
|
||||
Document(
|
||||
page_content="new foo", metadata={"id": 1, "some_other_field": "foo"}
|
||||
id="1",
|
||||
page_content="new foo",
|
||||
metadata={"id": 1, "some_other_field": "foo"},
|
||||
),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(id="2", page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
Reference in New Issue
Block a user