mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 14:18:52 +00:00
core[minor]: add upsert, streaming_upsert, aupsert, astreaming_upsert methods to the VectorStore abstraction (#23774)
This PR rolls out part of the new proposed interface for vectorstores (https://github.com/langchain-ai/langchain/pull/23544) to existing store implementations. The PR makes the following changes: 1. Adds standard upsert, streaming_upsert, aupsert, astreaming_upsert methods to the vectorstore. 2. Updates `add_texts` and `aadd_texts` to be non required with a default implementation that delegates to `upsert` and `aupsert` if those have been implemented. The original `add_texts` and `aadd_texts` methods are problematic as they spread object specific information across document and **kwargs. (e.g., ids are not a part of the document) 3. Adds a default implementation to `add_documents` and `aadd_documents` that delegates to `upsert` and `aupsert` respectively. 4. Adds standard unit tests to verify that a given vectorstore implements a correct read/write API. A downside of this implementation is that it creates `upsert` with a very similar signature to `add_documents`. The reason for introducing `upsert` is to: * Remove any ambiguities about what information is allowed in `kwargs`. Specifically kwargs should only be used for information common to all indexed data. (e.g., indexing timeout). *Allow inheriting from an anticipated generalized interface for indexing that will allow indexing `BaseMedia` (i.e., allow making a vectorstore for images/audio etc.) `add_documents` can be deprecated in the future in favor of `upsert` to make sure that users have a single correct way of indexing content. --------- Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
parent
3c752238c5
commit
6f08e11d7c
@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tupl
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_core.embeddings import Embeddings
|
from langchain_core.embeddings import Embeddings
|
||||||
|
from langchain_core.indexing import UpsertResponse
|
||||||
from langchain_core.load import dumpd, load
|
from langchain_core.load import dumpd, load
|
||||||
from langchain_core.vectorstores import VectorStore
|
from langchain_core.vectorstores import VectorStore
|
||||||
|
|
||||||
@ -37,27 +38,41 @@ class InMemoryVectorStore(VectorStore):
|
|||||||
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
|
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
|
||||||
self.delete(ids)
|
self.delete(ids)
|
||||||
|
|
||||||
def add_texts(
|
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
|
||||||
self,
|
vectors = self.embedding.embed_documents([item.page_content for item in items])
|
||||||
texts: Iterable[str],
|
ids = []
|
||||||
metadatas: Optional[List[dict]] = None,
|
for item, vector in zip(items, vectors):
|
||||||
ids: Optional[Sequence[str]] = None,
|
doc_id = item.id if item.id else str(uuid.uuid4())
|
||||||
**kwargs: Any,
|
ids.append(doc_id)
|
||||||
) -> List[str]:
|
|
||||||
"""Add texts to the store."""
|
|
||||||
vectors = self.embedding.embed_documents(list(texts))
|
|
||||||
ids_ = []
|
|
||||||
|
|
||||||
for i, text in enumerate(texts):
|
|
||||||
doc_id = ids[i] if ids else str(uuid.uuid4())
|
|
||||||
ids_.append(doc_id)
|
|
||||||
self.store[doc_id] = {
|
self.store[doc_id] = {
|
||||||
"id": doc_id,
|
"id": doc_id,
|
||||||
"vector": vectors[i],
|
"vector": vector,
|
||||||
"text": text,
|
"text": item.page_content,
|
||||||
"metadata": metadatas[i] if metadatas else {},
|
"metadata": item.metadata,
|
||||||
}
|
}
|
||||||
return ids_
|
return {
|
||||||
|
"succeeded": ids,
|
||||||
|
"failed": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
||||||
|
"""Get documents by their ids."""
|
||||||
|
documents = []
|
||||||
|
|
||||||
|
for doc_id in ids:
|
||||||
|
doc = self.store.get(doc_id)
|
||||||
|
if doc:
|
||||||
|
documents.append(
|
||||||
|
Document(
|
||||||
|
id=doc["id"],
|
||||||
|
page_content=doc["text"],
|
||||||
|
metadata=doc["metadata"],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return documents
|
||||||
|
|
||||||
|
async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
||||||
|
return self.get_by_ids(ids)
|
||||||
|
|
||||||
async def aadd_texts(
|
async def aadd_texts(
|
||||||
self,
|
self,
|
||||||
@ -80,7 +95,9 @@ class InMemoryVectorStore(VectorStore):
|
|||||||
similarity = float(cosine_similarity([embedding], [vector]).item(0))
|
similarity = float(cosine_similarity([embedding], [vector]).item(0))
|
||||||
result.append(
|
result.append(
|
||||||
(
|
(
|
||||||
Document(page_content=doc["text"], metadata=doc["metadata"]),
|
Document(
|
||||||
|
id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
|
||||||
|
),
|
||||||
similarity,
|
similarity,
|
||||||
vector,
|
vector,
|
||||||
)
|
)
|
||||||
|
@ -1053,7 +1053,7 @@ class Milvus(VectorStore):
|
|||||||
pks = [item.get(self._primary_field) for item in query_result]
|
pks = [item.get(self._primary_field) for item in query_result]
|
||||||
return pks
|
return pks
|
||||||
|
|
||||||
def upsert(
|
def upsert( # type: ignore[override]
|
||||||
self,
|
self,
|
||||||
ids: Optional[List[str]] = None,
|
ids: Optional[List[str]] = None,
|
||||||
documents: List[Document] | None = None,
|
documents: List[Document] | None = None,
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
@ -13,6 +14,11 @@ from tests.integration_tests.vectorstores.fake_embeddings import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class AnyStr(str):
|
||||||
|
def __eq__(self, other: Any) -> bool:
|
||||||
|
return isinstance(other, str)
|
||||||
|
|
||||||
|
|
||||||
class TestInMemoryReadWriteTestSuite(ReadWriteTestSuite):
|
class TestInMemoryReadWriteTestSuite(ReadWriteTestSuite):
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def vectorstore(self) -> InMemoryVectorStore:
|
def vectorstore(self) -> InMemoryVectorStore:
|
||||||
@ -31,10 +37,13 @@ async def test_inmemory() -> None:
|
|||||||
["foo", "bar", "baz"], ConsistentFakeEmbeddings()
|
["foo", "bar", "baz"], ConsistentFakeEmbeddings()
|
||||||
)
|
)
|
||||||
output = await store.asimilarity_search("foo", k=1)
|
output = await store.asimilarity_search("foo", k=1)
|
||||||
assert output == [Document(page_content="foo")]
|
assert output == [Document(page_content="foo", id=AnyStr())]
|
||||||
|
|
||||||
output = await store.asimilarity_search("bar", k=2)
|
output = await store.asimilarity_search("bar", k=2)
|
||||||
assert output == [Document(page_content="bar"), Document(page_content="baz")]
|
assert output == [
|
||||||
|
Document(page_content="bar", id=AnyStr()),
|
||||||
|
Document(page_content="baz", id=AnyStr()),
|
||||||
|
]
|
||||||
|
|
||||||
output2 = await store.asimilarity_search_with_score("bar", k=2)
|
output2 = await store.asimilarity_search_with_score("bar", k=2)
|
||||||
assert output2[0][1] > output2[1][1]
|
assert output2[0][1] > output2[1][1]
|
||||||
@ -61,8 +70,8 @@ async def test_inmemory_mmr() -> None:
|
|||||||
"foo", k=10, lambda_mult=0.1
|
"foo", k=10, lambda_mult=0.1
|
||||||
)
|
)
|
||||||
assert len(output) == len(texts)
|
assert len(output) == len(texts)
|
||||||
assert output[0] == Document(page_content="foo")
|
assert output[0] == Document(page_content="foo", id=AnyStr())
|
||||||
assert output[1] == Document(page_content="foy")
|
assert output[1] == Document(page_content="foy", id=AnyStr())
|
||||||
|
|
||||||
|
|
||||||
async def test_inmemory_dump_load(tmp_path: Path) -> None:
|
async def test_inmemory_dump_load(tmp_path: Path) -> None:
|
||||||
@ -90,4 +99,4 @@ async def test_inmemory_filter() -> None:
|
|||||||
output = await store.asimilarity_search(
|
output = await store.asimilarity_search(
|
||||||
"baz", filter=lambda doc: doc.metadata["id"] == 1
|
"baz", filter=lambda doc: doc.metadata["id"] == 1
|
||||||
)
|
)
|
||||||
assert output == [Document(page_content="foo", metadata={"id": 1})]
|
assert output == [Document(page_content="foo", metadata={"id": 1}, id=AnyStr())]
|
||||||
|
@ -6,7 +6,11 @@ if it's unchanged.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from langchain_core.indexing.api import IndexingResult, aindex, index
|
from langchain_core.indexing.api import IndexingResult, aindex, index
|
||||||
from langchain_core.indexing.base import InMemoryRecordManager, RecordManager
|
from langchain_core.indexing.base import (
|
||||||
|
InMemoryRecordManager,
|
||||||
|
RecordManager,
|
||||||
|
UpsertResponse,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"aindex",
|
"aindex",
|
||||||
@ -14,4 +18,5 @@ __all__ = [
|
|||||||
"IndexingResult",
|
"IndexingResult",
|
||||||
"InMemoryRecordManager",
|
"InMemoryRecordManager",
|
||||||
"RecordManager",
|
"RecordManager",
|
||||||
|
"UpsertResponse",
|
||||||
]
|
]
|
||||||
|
@ -421,3 +421,16 @@ class InMemoryRecordManager(RecordManager):
|
|||||||
keys: A list of keys to delete.
|
keys: A list of keys to delete.
|
||||||
"""
|
"""
|
||||||
self.delete_keys(keys)
|
self.delete_keys(keys)
|
||||||
|
|
||||||
|
|
||||||
|
class UpsertResponse(TypedDict):
|
||||||
|
"""A generic response for upsert operations.
|
||||||
|
|
||||||
|
The upsert response will be used by abstractions that implement an upsert
|
||||||
|
operation for content that can be upserted by ID.
|
||||||
|
"""
|
||||||
|
|
||||||
|
succeeded: List[str]
|
||||||
|
"""The IDs that were successfully indexed."""
|
||||||
|
failed: List[str]
|
||||||
|
"""The IDs that failed to index."""
|
||||||
|
@ -5,6 +5,7 @@ These functions do not depend on any other LangChain module.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from langchain_core.utils import image
|
from langchain_core.utils import image
|
||||||
|
from langchain_core.utils.aiter import abatch_iterate
|
||||||
from langchain_core.utils.env import get_from_dict_or_env, get_from_env
|
from langchain_core.utils.env import get_from_dict_or_env, get_from_env
|
||||||
from langchain_core.utils.formatting import StrictFormatter, formatter
|
from langchain_core.utils.formatting import StrictFormatter, formatter
|
||||||
from langchain_core.utils.input import (
|
from langchain_core.utils.input import (
|
||||||
@ -13,6 +14,7 @@ from langchain_core.utils.input import (
|
|||||||
get_colored_text,
|
get_colored_text,
|
||||||
print_text,
|
print_text,
|
||||||
)
|
)
|
||||||
|
from langchain_core.utils.iter import batch_iterate
|
||||||
from langchain_core.utils.loading import try_load_from_hub
|
from langchain_core.utils.loading import try_load_from_hub
|
||||||
from langchain_core.utils.strings import comma_list, stringify_dict, stringify_value
|
from langchain_core.utils.strings import comma_list, stringify_dict, stringify_value
|
||||||
from langchain_core.utils.utils import (
|
from langchain_core.utils.utils import (
|
||||||
@ -48,4 +50,6 @@ __all__ = [
|
|||||||
"stringify_dict",
|
"stringify_dict",
|
||||||
"comma_list",
|
"comma_list",
|
||||||
"stringify_value",
|
"stringify_value",
|
||||||
|
"batch_iterate",
|
||||||
|
"abatch_iterate",
|
||||||
]
|
]
|
||||||
|
@ -11,6 +11,7 @@ from typing import (
|
|||||||
Any,
|
Any,
|
||||||
AsyncContextManager,
|
AsyncContextManager,
|
||||||
AsyncGenerator,
|
AsyncGenerator,
|
||||||
|
AsyncIterable,
|
||||||
AsyncIterator,
|
AsyncIterator,
|
||||||
Awaitable,
|
Awaitable,
|
||||||
Callable,
|
Callable,
|
||||||
@ -245,3 +246,28 @@ class aclosing(AbstractAsyncContextManager):
|
|||||||
) -> None:
|
) -> None:
|
||||||
if hasattr(self.thing, "aclose"):
|
if hasattr(self.thing, "aclose"):
|
||||||
await self.thing.aclose()
|
await self.thing.aclose()
|
||||||
|
|
||||||
|
|
||||||
|
async def abatch_iterate(
|
||||||
|
size: int, iterable: AsyncIterable[T]
|
||||||
|
) -> AsyncIterator[List[T]]:
|
||||||
|
"""Utility batching function for async iterables.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
size: The size of the batch.
|
||||||
|
iterable: The async iterable to batch.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
An async iterator over the batches
|
||||||
|
"""
|
||||||
|
batch: List[T] = []
|
||||||
|
async for element in iterable:
|
||||||
|
if len(batch) < size:
|
||||||
|
batch.append(element)
|
||||||
|
|
||||||
|
if len(batch) >= size:
|
||||||
|
yield batch
|
||||||
|
batch = []
|
||||||
|
|
||||||
|
if batch:
|
||||||
|
yield batch
|
||||||
|
@ -25,26 +25,34 @@ import logging
|
|||||||
import math
|
import math
|
||||||
import warnings
|
import warnings
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
from itertools import cycle
|
||||||
from typing import (
|
from typing import (
|
||||||
TYPE_CHECKING,
|
TYPE_CHECKING,
|
||||||
Any,
|
Any,
|
||||||
|
AsyncIterable,
|
||||||
|
AsyncIterator,
|
||||||
Callable,
|
Callable,
|
||||||
ClassVar,
|
ClassVar,
|
||||||
Collection,
|
Collection,
|
||||||
Dict,
|
Dict,
|
||||||
Iterable,
|
Iterable,
|
||||||
|
Iterator,
|
||||||
List,
|
List,
|
||||||
Optional,
|
Optional,
|
||||||
Sequence,
|
Sequence,
|
||||||
Tuple,
|
Tuple,
|
||||||
Type,
|
Type,
|
||||||
TypeVar,
|
TypeVar,
|
||||||
|
Union,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from langchain_core._api import beta
|
||||||
from langchain_core.embeddings import Embeddings
|
from langchain_core.embeddings import Embeddings
|
||||||
from langchain_core.pydantic_v1 import Field, root_validator
|
from langchain_core.pydantic_v1 import Field, root_validator
|
||||||
from langchain_core.retrievers import BaseRetriever
|
from langchain_core.retrievers import BaseRetriever
|
||||||
from langchain_core.runnables.config import run_in_executor
|
from langchain_core.runnables.config import run_in_executor
|
||||||
|
from langchain_core.utils.aiter import abatch_iterate
|
||||||
|
from langchain_core.utils.iter import batch_iterate
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from langchain_core.callbacks.manager import (
|
from langchain_core.callbacks.manager import (
|
||||||
@ -52,6 +60,7 @@ if TYPE_CHECKING:
|
|||||||
CallbackManagerForRetrieverRun,
|
CallbackManagerForRetrieverRun,
|
||||||
)
|
)
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.indexing.base import UpsertResponse
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -61,11 +70,14 @@ VST = TypeVar("VST", bound="VectorStore")
|
|||||||
class VectorStore(ABC):
|
class VectorStore(ABC):
|
||||||
"""Interface for vector store."""
|
"""Interface for vector store."""
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def add_texts(
|
def add_texts(
|
||||||
self,
|
self,
|
||||||
texts: Iterable[str],
|
texts: Iterable[str],
|
||||||
metadatas: Optional[List[dict]] = None,
|
metadatas: Optional[List[dict]] = None,
|
||||||
|
# One of the kwargs should be `ids` which is a list of ids
|
||||||
|
# associated with the texts.
|
||||||
|
# This is not yet enforced in the type signature for backwards compatibility
|
||||||
|
# with existing implementations.
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""Run more texts through the embeddings and add to the vectorstore.
|
"""Run more texts through the embeddings and add to the vectorstore.
|
||||||
@ -74,16 +86,205 @@ class VectorStore(ABC):
|
|||||||
texts: Iterable of strings to add to the vectorstore.
|
texts: Iterable of strings to add to the vectorstore.
|
||||||
metadatas: Optional list of metadatas associated with the texts.
|
metadatas: Optional list of metadatas associated with the texts.
|
||||||
**kwargs: vectorstore specific parameters.
|
**kwargs: vectorstore specific parameters.
|
||||||
|
One of the kwargs should be `ids` which is a list of ids
|
||||||
|
associated with the texts.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of ids from adding the texts into the vectorstore.
|
List of ids from adding the texts into the vectorstore.
|
||||||
"""
|
"""
|
||||||
|
if type(self).upsert != VectorStore.upsert:
|
||||||
|
# Import document in local scope to avoid circular imports
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
# This condition is triggered if the subclass has provided
|
||||||
|
# an implementation of the upsert method.
|
||||||
|
# The existing add_texts
|
||||||
|
texts_: Sequence[str] = (
|
||||||
|
texts if isinstance(texts, (list, tuple)) else list(texts)
|
||||||
|
)
|
||||||
|
if metadatas and len(metadatas) != len(texts_):
|
||||||
|
raise ValueError(
|
||||||
|
"The number of metadatas must match the number of texts."
|
||||||
|
"Got {len(metadatas)} metadatas and {len(texts_)} texts."
|
||||||
|
)
|
||||||
|
|
||||||
|
if "ids" in kwargs:
|
||||||
|
ids = kwargs.pop("ids")
|
||||||
|
if ids and len(ids) != len(texts_):
|
||||||
|
raise ValueError(
|
||||||
|
"The number of ids must match the number of texts."
|
||||||
|
"Got {len(ids)} ids and {len(texts_)} texts."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
ids = None
|
||||||
|
|
||||||
|
metadatas_ = iter(metadatas) if metadatas else cycle([{}])
|
||||||
|
ids_: Iterable[Union[str, None]] = ids if ids is not None else cycle([None])
|
||||||
|
docs = [
|
||||||
|
Document(page_content=text, metadata=metadata_, id=id_)
|
||||||
|
for text, metadata_, id_ in zip(texts, metadatas_, ids_)
|
||||||
|
]
|
||||||
|
upsert_response = self.upsert(docs, **kwargs)
|
||||||
|
return upsert_response["succeeded"]
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"`add_texts` has not been implemented for {self.__class__.__name__} "
|
||||||
|
)
|
||||||
|
|
||||||
|
# Developer guidelines:
|
||||||
|
# Do not override streaming_upsert!
|
||||||
|
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||||
|
def streaming_upsert(
|
||||||
|
self, items: Iterable[Document], /, batch_size: int, **kwargs: Any
|
||||||
|
) -> Iterator[UpsertResponse]:
|
||||||
|
"""Upsert documents in a streaming fashion.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
items: Iterable of Documents to add to the vectorstore.
|
||||||
|
batch_size: The size of each batch to upsert.
|
||||||
|
**kwargs: Additional keyword arguments.
|
||||||
|
kwargs should only include parameters that are common to all
|
||||||
|
documents. (e.g., timeout for indexing, retry policy, etc.)
|
||||||
|
kwargs should not include ids to avoid ambiguous semantics.
|
||||||
|
Instead the ID should be provided as part of the Document object.
|
||||||
|
|
||||||
|
.. versionadded:: 0.2.11
|
||||||
|
"""
|
||||||
|
# The default implementation of this method breaks the input into
|
||||||
|
# batches of size `batch_size` and calls the `upsert` method on each batch.
|
||||||
|
# Subclasses can override this method to provide a more efficient
|
||||||
|
# implementation.
|
||||||
|
for item_batch in batch_iterate(batch_size, items):
|
||||||
|
yield self.upsert(item_batch, **kwargs)
|
||||||
|
|
||||||
|
# Please note that we've added a new method `upsert` instead of re-using the
|
||||||
|
# existing `add_documents` method.
|
||||||
|
# This was done to resolve potential ambiguities around the behavior of **kwargs
|
||||||
|
# in existing add_documents / add_texts methods which could include per document
|
||||||
|
# information (e.g., the `ids` parameter).
|
||||||
|
# Over time the `add_documents` could be denoted as legacy and deprecated
|
||||||
|
# in favor of the `upsert` method.
|
||||||
|
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||||
|
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
|
||||||
|
"""Add or update documents in the vectorstore.
|
||||||
|
|
||||||
|
The upsert functionality should utilize the ID field of the Document object
|
||||||
|
if it is provided. If the ID is not provided, the upsert method is free
|
||||||
|
to generate an ID for the document.
|
||||||
|
|
||||||
|
When an ID is specified and the document already exists in the vectorstore,
|
||||||
|
the upsert method should update the document with the new data. If the document
|
||||||
|
does not exist, the upsert method should add the document to the vectorstore.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
items: Sequence of Documents to add to the vectorstore.
|
||||||
|
**kwargs: Additional keyword arguments.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
UpsertResponse: A response object that contains the list of IDs that were
|
||||||
|
successfully added or updated in the vectorstore and the list of IDs that
|
||||||
|
failed to be added or updated.
|
||||||
|
|
||||||
|
.. versionadded:: 0.2.11
|
||||||
|
"""
|
||||||
|
# Developer guidelines:
|
||||||
|
#
|
||||||
|
# Vectorstores implementations are free to extend `upsert` implementation
|
||||||
|
# to take in additional data per document.
|
||||||
|
#
|
||||||
|
# This data **SHOULD NOT** be part of the **kwargs** parameter, instead
|
||||||
|
# sub-classes can use a Union type on `documents` to include additional
|
||||||
|
# supported formats for the input data stream.
|
||||||
|
#
|
||||||
|
# For example,
|
||||||
|
#
|
||||||
|
# .. code-block:: python
|
||||||
|
# from typing import TypedDict
|
||||||
|
#
|
||||||
|
# class DocumentWithVector(TypedDict):
|
||||||
|
# document: Document
|
||||||
|
# vector: List[float]
|
||||||
|
#
|
||||||
|
# def upsert(
|
||||||
|
# self,
|
||||||
|
# documents: Union[Iterable[Document], Iterable[DocumentWithVector]],
|
||||||
|
# /,
|
||||||
|
# **kwargs
|
||||||
|
# ) -> UpsertResponse:
|
||||||
|
# \"\"\"Add or update documents in the vectorstore.\"\"\"
|
||||||
|
# # Implementation should check if documents is an
|
||||||
|
# # iterable of DocumentWithVector or Document
|
||||||
|
# pass
|
||||||
|
#
|
||||||
|
# Implementations that override upsert should include a new doc-string
|
||||||
|
# that explains the semantics of upsert and includes in code
|
||||||
|
# examples of how to insert using the alternate data formats.
|
||||||
|
|
||||||
|
# The implementation does not delegate to the `add_texts` method or
|
||||||
|
# the `add_documents` method by default since those implementations
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"upsert has not been implemented for {self.__class__.__name__}"
|
||||||
|
)
|
||||||
|
|
||||||
|
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||||
|
async def astreaming_upsert(
|
||||||
|
self,
|
||||||
|
items: AsyncIterable[Document],
|
||||||
|
/,
|
||||||
|
batch_size: int,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> AsyncIterator[UpsertResponse]:
|
||||||
|
"""Upsert documents in a streaming fashion. Async version of streaming_upsert.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
items: Iterable of Documents to add to the vectorstore.
|
||||||
|
batch_size: The size of each batch to upsert.
|
||||||
|
**kwargs: Additional keyword arguments.
|
||||||
|
kwargs should only include parameters that are common to all
|
||||||
|
documents. (e.g., timeout for indexing, retry policy, etc.)
|
||||||
|
kwargs should not include ids to avoid ambiguous semantics.
|
||||||
|
Instead the ID should be provided as part of the Document object.
|
||||||
|
|
||||||
|
.. versionadded:: 0.2.11
|
||||||
|
"""
|
||||||
|
async for batch in abatch_iterate(batch_size, items):
|
||||||
|
yield await self.aupsert(batch, **kwargs)
|
||||||
|
|
||||||
|
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||||
|
async def aupsert(
|
||||||
|
self, items: Sequence[Document], /, **kwargs: Any
|
||||||
|
) -> UpsertResponse:
|
||||||
|
"""Add or update documents in the vectorstore. Async version of upsert.
|
||||||
|
|
||||||
|
The upsert functionality should utilize the ID field of the Document object
|
||||||
|
if it is provided. If the ID is not provided, the upsert method is free
|
||||||
|
to generate an ID for the document.
|
||||||
|
|
||||||
|
When an ID is specified and the document already exists in the vectorstore,
|
||||||
|
the upsert method should update the document with the new data. If the document
|
||||||
|
does not exist, the upsert method should add the document to the vectorstore.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
items: Sequence of Documents to add to the vectorstore.
|
||||||
|
**kwargs: Additional keyword arguments.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
UpsertResponse: A response object that contains the list of IDs that were
|
||||||
|
successfully added or updated in the vectorstore and the list of IDs that
|
||||||
|
failed to be added or updated.
|
||||||
|
|
||||||
|
.. versionadded:: 0.2.11
|
||||||
|
"""
|
||||||
|
# Developer guidelines: See guidelines for the `upsert` method.
|
||||||
|
# The implementation does not delegate to the `add_texts` method or
|
||||||
|
# the `add_documents` method by default since those implementations
|
||||||
|
return await run_in_executor(None, self.upsert, items, **kwargs)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def embeddings(self) -> Optional[Embeddings]:
|
def embeddings(self) -> Optional[Embeddings]:
|
||||||
"""Access the query embedding object if available."""
|
"""Access the query embedding object if available."""
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"{Embeddings.__name__} is not implemented for {self.__class__.__name__}"
|
f"The embeddings property has not been "
|
||||||
|
f"implemented for {self.__class__.__name__}"
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -187,17 +388,81 @@ class VectorStore(ABC):
|
|||||||
Returns:
|
Returns:
|
||||||
List of ids from adding the texts into the vectorstore.
|
List of ids from adding the texts into the vectorstore.
|
||||||
"""
|
"""
|
||||||
|
if type(self).aupsert != VectorStore.aupsert:
|
||||||
|
# Import document in local scope to avoid circular imports
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
# This condition is triggered if the subclass has provided
|
||||||
|
# an implementation of the upsert method.
|
||||||
|
# The existing add_texts
|
||||||
|
texts_: Sequence[str] = (
|
||||||
|
texts if isinstance(texts, (list, tuple)) else list(texts)
|
||||||
|
)
|
||||||
|
if metadatas and len(metadatas) != len(texts_):
|
||||||
|
raise ValueError(
|
||||||
|
"The number of metadatas must match the number of texts."
|
||||||
|
"Got {len(metadatas)} metadatas and {len(texts_)} texts."
|
||||||
|
)
|
||||||
|
|
||||||
|
if "ids" in kwargs:
|
||||||
|
ids = kwargs.pop("ids")
|
||||||
|
if ids and len(ids) != len(texts_):
|
||||||
|
raise ValueError(
|
||||||
|
"The number of ids must match the number of texts."
|
||||||
|
"Got {len(ids)} ids and {len(texts_)} texts."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
ids = None
|
||||||
|
|
||||||
|
metadatas_ = iter(metadatas) if metadatas else cycle([{}])
|
||||||
|
ids_: Iterable[Union[str, None]] = ids if ids is not None else cycle([None])
|
||||||
|
docs = [
|
||||||
|
Document(page_content=text, metadata=metadata_, id=id_)
|
||||||
|
for text, metadata_, id_ in zip(texts, metadatas_, ids_)
|
||||||
|
]
|
||||||
|
upsert_response = await self.aupsert(docs, **kwargs)
|
||||||
|
return upsert_response["succeeded"]
|
||||||
return await run_in_executor(None, self.add_texts, texts, metadatas, **kwargs)
|
return await run_in_executor(None, self.add_texts, texts, metadatas, **kwargs)
|
||||||
|
|
||||||
def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
|
def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
|
||||||
"""Run more documents through the embeddings and add to the vectorstore.
|
"""Add or update documents in the vectorstore.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
documents: Documents to add to the vectorstore.
|
documents: Documents to add to the vectorstore.
|
||||||
|
kwargs: Additional keyword arguments.
|
||||||
|
if kwargs contains ids and documents contain ids,
|
||||||
|
the ids in the kwargs will receive precedence.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of IDs of the added texts.
|
List of IDs of the added texts.
|
||||||
"""
|
"""
|
||||||
|
if type(self).upsert != VectorStore.upsert:
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
if "ids" in kwargs:
|
||||||
|
ids = kwargs.pop("ids")
|
||||||
|
if ids and len(ids) != len(documents):
|
||||||
|
raise ValueError(
|
||||||
|
"The number of ids must match the number of documents. "
|
||||||
|
"Got {len(ids)} ids and {len(documents)} documents."
|
||||||
|
)
|
||||||
|
|
||||||
|
documents_ = []
|
||||||
|
|
||||||
|
for id_, document in zip(ids, documents):
|
||||||
|
doc_with_id = Document(
|
||||||
|
page_content=document.page_content,
|
||||||
|
metadata=document.metadata,
|
||||||
|
id=id_,
|
||||||
|
)
|
||||||
|
documents_.append(doc_with_id)
|
||||||
|
else:
|
||||||
|
documents_ = documents
|
||||||
|
|
||||||
|
# If upsert has been implemented, we can use it to add documents
|
||||||
|
return self.upsert(documents_, **kwargs)["succeeded"]
|
||||||
|
|
||||||
|
# Code path that delegates to add_text for backwards compatibility
|
||||||
# TODO: Handle the case where the user doesn't provide ids on the Collection
|
# TODO: Handle the case where the user doesn't provide ids on the Collection
|
||||||
texts = [doc.page_content for doc in documents]
|
texts = [doc.page_content for doc in documents]
|
||||||
metadatas = [doc.metadata for doc in documents]
|
metadatas = [doc.metadata for doc in documents]
|
||||||
@ -214,6 +479,38 @@ class VectorStore(ABC):
|
|||||||
Returns:
|
Returns:
|
||||||
List of IDs of the added texts.
|
List of IDs of the added texts.
|
||||||
"""
|
"""
|
||||||
|
# If either upsert or aupsert has been implemented, we delegate to them!
|
||||||
|
if (
|
||||||
|
type(self).aupsert != VectorStore.aupsert
|
||||||
|
or type(self).upsert != VectorStore.upsert
|
||||||
|
):
|
||||||
|
# If aupsert has been implemented, we can use it to add documents
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
if "ids" in kwargs:
|
||||||
|
ids = kwargs.pop("ids")
|
||||||
|
if ids and len(ids) != len(documents):
|
||||||
|
raise ValueError(
|
||||||
|
"The number of ids must match the number of documents."
|
||||||
|
"Got {len(ids)} ids and {len(documents)} documents."
|
||||||
|
)
|
||||||
|
|
||||||
|
documents_ = []
|
||||||
|
|
||||||
|
for id_, document in zip(ids, documents):
|
||||||
|
doc_with_id = Document(
|
||||||
|
page_content=document.page_content,
|
||||||
|
metadata=document.metadata,
|
||||||
|
id=id_,
|
||||||
|
)
|
||||||
|
documents_.append(doc_with_id)
|
||||||
|
else:
|
||||||
|
documents_ = documents
|
||||||
|
|
||||||
|
# If upsert has been implemented, we can use it to add documents
|
||||||
|
upsert_response = await self.aupsert(documents_, **kwargs)
|
||||||
|
return upsert_response["succeeded"]
|
||||||
|
|
||||||
texts = [doc.page_content for doc in documents]
|
texts = [doc.page_content for doc in documents]
|
||||||
metadatas = [doc.metadata for doc in documents]
|
metadatas = [doc.metadata for doc in documents]
|
||||||
return await self.aadd_texts(texts, metadatas, **kwargs)
|
return await self.aadd_texts(texts, metadatas, **kwargs)
|
||||||
|
@ -10,4 +10,5 @@ def test_all() -> None:
|
|||||||
"IndexingResult",
|
"IndexingResult",
|
||||||
"InMemoryRecordManager",
|
"InMemoryRecordManager",
|
||||||
"RecordManager",
|
"RecordManager",
|
||||||
|
"UpsertResponse",
|
||||||
]
|
]
|
||||||
|
31
libs/core/tests/unit_tests/utils/test_aiter.py
Normal file
31
libs/core/tests/unit_tests/utils/test_aiter.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
from typing import AsyncIterator, List
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain_core.utils.aiter import abatch_iterate
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"input_size, input_iterable, expected_output",
|
||||||
|
[
|
||||||
|
(2, [1, 2, 3, 4, 5], [[1, 2], [3, 4], [5]]),
|
||||||
|
(3, [10, 20, 30, 40, 50], [[10, 20, 30], [40, 50]]),
|
||||||
|
(1, [100, 200, 300], [[100], [200], [300]]),
|
||||||
|
(4, [], []),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
async def test_abatch_iterate(
|
||||||
|
input_size: int, input_iterable: List[str], expected_output: List[str]
|
||||||
|
) -> None:
|
||||||
|
"""Test batching function."""
|
||||||
|
|
||||||
|
async def _to_async_iterable(iterable: List[str]) -> AsyncIterator[str]:
|
||||||
|
for item in iterable:
|
||||||
|
yield item
|
||||||
|
|
||||||
|
iterator_ = abatch_iterate(input_size, _to_async_iterable(input_iterable))
|
||||||
|
|
||||||
|
assert isinstance(iterator_, AsyncIterator)
|
||||||
|
|
||||||
|
output = [el async for el in iterator_]
|
||||||
|
assert output == expected_output
|
@ -6,6 +6,8 @@ EXPECTED_ALL = [
|
|||||||
"convert_to_secret_str",
|
"convert_to_secret_str",
|
||||||
"formatter",
|
"formatter",
|
||||||
"get_bolded_text",
|
"get_bolded_text",
|
||||||
|
"abatch_iterate",
|
||||||
|
"batch_iterate",
|
||||||
"get_color_mapping",
|
"get_color_mapping",
|
||||||
"get_colored_text",
|
"get_colored_text",
|
||||||
"get_pydantic_field_names",
|
"get_pydantic_field_names",
|
||||||
|
0
libs/core/tests/unit_tests/vectorstores/__init__.py
Normal file
0
libs/core/tests/unit_tests/vectorstores/__init__.py
Normal file
194
libs/core/tests/unit_tests/vectorstores/test_vectorstore.py
Normal file
194
libs/core/tests/unit_tests/vectorstores/test_vectorstore.py
Normal file
@ -0,0 +1,194 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from typing import Any, Dict, List, Optional, Sequence, Union
|
||||||
|
|
||||||
|
from typing_extensions import TypedDict
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.embeddings import Embeddings
|
||||||
|
from langchain_core.indexing.base import UpsertResponse
|
||||||
|
from langchain_core.vectorstores import VectorStore
|
||||||
|
|
||||||
|
|
||||||
|
def test_custom_upsert_type() -> None:
|
||||||
|
"""Test that we can override the signature of the upsert method
|
||||||
|
of the VectorStore class without creating typing issues by violating
|
||||||
|
the Liskov Substitution Principle.
|
||||||
|
"""
|
||||||
|
|
||||||
|
class ByVector(TypedDict):
|
||||||
|
document: Document
|
||||||
|
vector: List[float]
|
||||||
|
|
||||||
|
class CustomVectorStore(VectorStore):
|
||||||
|
def upsert(
|
||||||
|
# This unit test verifies that the signature of the upsert method
|
||||||
|
# specifically the items parameter can be overridden without
|
||||||
|
# violating the Liskov Substitution Principle (and getting
|
||||||
|
# typing errors).
|
||||||
|
self,
|
||||||
|
items: Union[Sequence[Document], Sequence[ByVector]],
|
||||||
|
/,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> UpsertResponse:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
class CustomSyncVectorStore(VectorStore):
|
||||||
|
"""A vectorstore that only implements the synchronous methods."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.store: Dict[str, Document] = {}
|
||||||
|
|
||||||
|
def upsert(
|
||||||
|
self,
|
||||||
|
items: Sequence[Document],
|
||||||
|
/,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> UpsertResponse:
|
||||||
|
ids = []
|
||||||
|
for item in items:
|
||||||
|
if item.id is None:
|
||||||
|
new_item = item.copy()
|
||||||
|
id_: str = str(uuid.uuid4())
|
||||||
|
new_item.id = id_
|
||||||
|
else:
|
||||||
|
id_ = item.id
|
||||||
|
new_item = item
|
||||||
|
|
||||||
|
self.store[id_] = new_item
|
||||||
|
ids.append(id_)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"succeeded": ids,
|
||||||
|
"failed": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
||||||
|
return [self.store[id] for id in ids if id in self.store]
|
||||||
|
|
||||||
|
def from_texts( # type: ignore
|
||||||
|
cls,
|
||||||
|
texts: List[str],
|
||||||
|
embedding: Embeddings,
|
||||||
|
metadatas: Optional[List[dict]] = None,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> CustomSyncVectorStore:
|
||||||
|
vectorstore = CustomSyncVectorStore()
|
||||||
|
vectorstore.add_texts(texts, metadatas=metadatas, **kwargs)
|
||||||
|
return vectorstore
|
||||||
|
|
||||||
|
def similarity_search(
|
||||||
|
self, query: str, k: int = 4, **kwargs: Any
|
||||||
|
) -> List[Document]:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
def test_implement_upsert() -> None:
|
||||||
|
"""Test that we can implement the upsert method of the CustomVectorStore
|
||||||
|
class without violating the Liskov Substitution Principle.
|
||||||
|
"""
|
||||||
|
|
||||||
|
store = CustomSyncVectorStore()
|
||||||
|
|
||||||
|
# Check upsert with id
|
||||||
|
assert store.upsert([Document(id="1", page_content="hello")]) == {
|
||||||
|
"succeeded": ["1"],
|
||||||
|
"failed": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
assert store.get_by_ids(["1"]) == [Document(id="1", page_content="hello")]
|
||||||
|
|
||||||
|
# Check upsert without id
|
||||||
|
response = store.upsert([Document(page_content="world")])
|
||||||
|
assert len(response["succeeded"]) == 1
|
||||||
|
id_ = response["succeeded"][0]
|
||||||
|
assert id_ is not None
|
||||||
|
assert store.get_by_ids([id_]) == [Document(id=id_, page_content="world")]
|
||||||
|
|
||||||
|
# Check that default implementation of add_texts works
|
||||||
|
assert store.add_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
|
||||||
|
assert store.get_by_ids(["3", "4"]) == [
|
||||||
|
Document(id="3", page_content="hello"),
|
||||||
|
Document(id="4", page_content="world"),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add texts without ids
|
||||||
|
ids_ = store.add_texts(["foo", "bar"])
|
||||||
|
assert len(ids_) == 2
|
||||||
|
assert store.get_by_ids(ids_) == [
|
||||||
|
Document(id=ids_[0], page_content="foo"),
|
||||||
|
Document(id=ids_[1], page_content="bar"),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add texts with metadatas
|
||||||
|
ids_2 = store.add_texts(["foo", "bar"], metadatas=[{"foo": "bar"}] * 2)
|
||||||
|
assert len(ids_2) == 2
|
||||||
|
assert store.get_by_ids(ids_2) == [
|
||||||
|
Document(id=ids_2[0], page_content="foo", metadata={"foo": "bar"}),
|
||||||
|
Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Check that add_documents works
|
||||||
|
assert store.add_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||||
|
|
||||||
|
# Test add documents with id specified in both document and ids
|
||||||
|
original_document = Document(id="7", page_content="baz")
|
||||||
|
assert store.add_documents([original_document], ids=["6"]) == ["6"]
|
||||||
|
assert original_document.id == "7" # original document should not be modified
|
||||||
|
assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||||
|
|
||||||
|
|
||||||
|
async def test_aupsert_delegation_to_upsert() -> None:
|
||||||
|
"""Test delegation to the synchronous upsert method in async execution
|
||||||
|
if async methods are not implemented.
|
||||||
|
"""
|
||||||
|
store = CustomSyncVectorStore()
|
||||||
|
|
||||||
|
# Check upsert with id
|
||||||
|
assert await store.aupsert([Document(id="1", page_content="hello")]) == {
|
||||||
|
"succeeded": ["1"],
|
||||||
|
"failed": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
assert await store.aget_by_ids(["1"]) == [Document(id="1", page_content="hello")]
|
||||||
|
|
||||||
|
# Check upsert without id
|
||||||
|
response = await store.aupsert([Document(page_content="world")])
|
||||||
|
assert len(response["succeeded"]) == 1
|
||||||
|
id_ = response["succeeded"][0]
|
||||||
|
assert id_ is not None
|
||||||
|
assert await store.aget_by_ids([id_]) == [Document(id=id_, page_content="world")]
|
||||||
|
|
||||||
|
# Check that default implementation of add_texts works
|
||||||
|
assert await store.aadd_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
|
||||||
|
assert await store.aget_by_ids(["3", "4"]) == [
|
||||||
|
Document(id="3", page_content="hello"),
|
||||||
|
Document(id="4", page_content="world"),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add texts without ids
|
||||||
|
ids_ = await store.aadd_texts(["foo", "bar"])
|
||||||
|
assert len(ids_) == 2
|
||||||
|
assert await store.aget_by_ids(ids_) == [
|
||||||
|
Document(id=ids_[0], page_content="foo"),
|
||||||
|
Document(id=ids_[1], page_content="bar"),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add texts with metadatas
|
||||||
|
ids_2 = await store.aadd_texts(["foo", "bar"], metadatas=[{"foo": "bar"}] * 2)
|
||||||
|
assert len(ids_2) == 2
|
||||||
|
assert await store.aget_by_ids(ids_2) == [
|
||||||
|
Document(id=ids_2[0], page_content="foo", metadata={"foo": "bar"}),
|
||||||
|
Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Check that add_documents works
|
||||||
|
assert await store.aadd_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||||
|
|
||||||
|
# Test add documents with id specified in both document and ids
|
||||||
|
original_document = Document(id="7", page_content="baz")
|
||||||
|
assert await store.aadd_documents([original_document], ids=["6"]) == ["6"]
|
||||||
|
assert original_document.id == "7" # original document should not be modified
|
||||||
|
assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
@ -46,15 +46,21 @@ class ReadWriteTestSuite(ABC):
|
|||||||
|
|
||||||
def test_add_documents(self, vectorstore: VectorStore) -> None:
|
def test_add_documents(self, vectorstore: VectorStore) -> None:
|
||||||
"""Test adding documents into the vectorstore."""
|
"""Test adding documents into the vectorstore."""
|
||||||
documents = [
|
original_documents = [
|
||||||
Document(page_content="foo", metadata={"id": 1}),
|
Document(page_content="foo", metadata={"id": 1}),
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
Document(page_content="bar", metadata={"id": 2}),
|
||||||
]
|
]
|
||||||
vectorstore.add_documents(documents)
|
ids = vectorstore.add_documents(original_documents)
|
||||||
documents = vectorstore.similarity_search("bar", k=2)
|
documents = vectorstore.similarity_search("bar", k=2)
|
||||||
assert documents == [
|
assert documents == [
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
|
||||||
|
Document(page_content="foo", metadata={"id": 1}, id=ids[0]),
|
||||||
|
]
|
||||||
|
# Verify that the original document object does not get mutated!
|
||||||
|
# (e.g., an ID is added to the original document object)
|
||||||
|
assert original_documents == [
|
||||||
Document(page_content="foo", metadata={"id": 1}),
|
Document(page_content="foo", metadata={"id": 1}),
|
||||||
|
Document(page_content="bar", metadata={"id": 2}),
|
||||||
]
|
]
|
||||||
|
|
||||||
def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None:
|
def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None:
|
||||||
@ -71,10 +77,11 @@ class ReadWriteTestSuite(ABC):
|
|||||||
Document(page_content="foo", metadata={"id": 1}),
|
Document(page_content="foo", metadata={"id": 1}),
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
Document(page_content="bar", metadata={"id": 2}),
|
||||||
]
|
]
|
||||||
vectorstore.add_documents(documents, ids=["1", "2"])
|
ids = vectorstore.add_documents(documents, ids=["1", "2"])
|
||||||
|
assert ids == ["1", "2"]
|
||||||
vectorstore.delete(["1"])
|
vectorstore.delete(["1"])
|
||||||
documents = vectorstore.similarity_search("foo", k=1)
|
documents = vectorstore.similarity_search("foo", k=1)
|
||||||
assert documents == [Document(page_content="bar", metadata={"id": 2})]
|
assert documents == [Document(page_content="bar", metadata={"id": 2}, id="2")]
|
||||||
|
|
||||||
def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None:
|
def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None:
|
||||||
"""Test that we can delete several documents at once."""
|
"""Test that we can delete several documents at once."""
|
||||||
@ -87,7 +94,7 @@ class ReadWriteTestSuite(ABC):
|
|||||||
vectorstore.add_documents(documents, ids=["1", "2", "3"])
|
vectorstore.add_documents(documents, ids=["1", "2", "3"])
|
||||||
vectorstore.delete(["1", "2"])
|
vectorstore.delete(["1", "2"])
|
||||||
documents = vectorstore.similarity_search("foo", k=1)
|
documents = vectorstore.similarity_search("foo", k=1)
|
||||||
assert documents == [Document(page_content="baz", metadata={"id": 3})]
|
assert documents == [Document(page_content="baz", metadata={"id": 3}, id="3")]
|
||||||
|
|
||||||
def test_delete_missing_content(self, vectorstore: VectorStore) -> None:
|
def test_delete_missing_content(self, vectorstore: VectorStore) -> None:
|
||||||
"""Deleting missing content should not raise an exception."""
|
"""Deleting missing content should not raise an exception."""
|
||||||
@ -106,25 +113,8 @@ class ReadWriteTestSuite(ABC):
|
|||||||
vectorstore.add_documents(documents, ids=["1", "2"])
|
vectorstore.add_documents(documents, ids=["1", "2"])
|
||||||
documents = vectorstore.similarity_search("bar", k=2)
|
documents = vectorstore.similarity_search("bar", k=2)
|
||||||
assert documents == [
|
assert documents == [
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
Document(page_content="bar", metadata={"id": 2}, id="2"),
|
||||||
Document(page_content="foo", metadata={"id": 1}),
|
Document(page_content="foo", metadata={"id": 1}, id="1"),
|
||||||
]
|
|
||||||
|
|
||||||
def test_add_documents_without_ids_gets_duplicated(
|
|
||||||
self, vectorstore: VectorStore
|
|
||||||
) -> None:
|
|
||||||
"""Adding documents without specifying IDs should duplicate content."""
|
|
||||||
documents = [
|
|
||||||
Document(page_content="foo", metadata={"id": 1}),
|
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
|
||||||
]
|
|
||||||
|
|
||||||
vectorstore.add_documents(documents)
|
|
||||||
vectorstore.add_documents(documents)
|
|
||||||
documents = vectorstore.similarity_search("bar", k=2)
|
|
||||||
assert documents == [
|
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
def test_add_documents_by_id_with_mutation(self, vectorstore: VectorStore) -> None:
|
def test_add_documents_by_id_with_mutation(self, vectorstore: VectorStore) -> None:
|
||||||
@ -149,9 +139,11 @@ class ReadWriteTestSuite(ABC):
|
|||||||
documents = vectorstore.similarity_search("new foo", k=2)
|
documents = vectorstore.similarity_search("new foo", k=2)
|
||||||
assert documents == [
|
assert documents == [
|
||||||
Document(
|
Document(
|
||||||
page_content="new foo", metadata={"id": 1, "some_other_field": "foo"}
|
id="1",
|
||||||
|
page_content="new foo",
|
||||||
|
metadata={"id": 1, "some_other_field": "foo"},
|
||||||
),
|
),
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
Document(id="2", page_content="bar", metadata={"id": 2}),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@ -190,15 +182,22 @@ class AsyncReadWriteTestSuite(ABC):
|
|||||||
|
|
||||||
async def test_add_documents(self, vectorstore: VectorStore) -> None:
|
async def test_add_documents(self, vectorstore: VectorStore) -> None:
|
||||||
"""Test adding documents into the vectorstore."""
|
"""Test adding documents into the vectorstore."""
|
||||||
documents = [
|
original_documents = [
|
||||||
Document(page_content="foo", metadata={"id": 1}),
|
Document(page_content="foo", metadata={"id": 1}),
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
Document(page_content="bar", metadata={"id": 2}),
|
||||||
]
|
]
|
||||||
await vectorstore.aadd_documents(documents)
|
ids = await vectorstore.aadd_documents(original_documents)
|
||||||
documents = await vectorstore.asimilarity_search("bar", k=2)
|
documents = await vectorstore.asimilarity_search("bar", k=2)
|
||||||
assert documents == [
|
assert documents == [
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
|
||||||
|
Document(page_content="foo", metadata={"id": 1}, id=ids[0]),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Verify that the original document object does not get mutated!
|
||||||
|
# (e.g., an ID is added to the original document object)
|
||||||
|
assert original_documents == [
|
||||||
Document(page_content="foo", metadata={"id": 1}),
|
Document(page_content="foo", metadata={"id": 1}),
|
||||||
|
Document(page_content="bar", metadata={"id": 2}),
|
||||||
]
|
]
|
||||||
|
|
||||||
async def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None:
|
async def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None:
|
||||||
@ -215,10 +214,11 @@ class AsyncReadWriteTestSuite(ABC):
|
|||||||
Document(page_content="foo", metadata={"id": 1}),
|
Document(page_content="foo", metadata={"id": 1}),
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
Document(page_content="bar", metadata={"id": 2}),
|
||||||
]
|
]
|
||||||
await vectorstore.aadd_documents(documents, ids=["1", "2"])
|
ids = await vectorstore.aadd_documents(documents, ids=["1", "2"])
|
||||||
|
assert ids == ["1", "2"]
|
||||||
await vectorstore.adelete(["1"])
|
await vectorstore.adelete(["1"])
|
||||||
documents = await vectorstore.asimilarity_search("foo", k=1)
|
documents = await vectorstore.asimilarity_search("foo", k=1)
|
||||||
assert documents == [Document(page_content="bar", metadata={"id": 2})]
|
assert documents == [Document(page_content="bar", metadata={"id": 2}, id="2")]
|
||||||
|
|
||||||
async def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None:
|
async def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None:
|
||||||
"""Test that we can delete several documents at once."""
|
"""Test that we can delete several documents at once."""
|
||||||
@ -231,7 +231,7 @@ class AsyncReadWriteTestSuite(ABC):
|
|||||||
await vectorstore.aadd_documents(documents, ids=["1", "2", "3"])
|
await vectorstore.aadd_documents(documents, ids=["1", "2", "3"])
|
||||||
await vectorstore.adelete(["1", "2"])
|
await vectorstore.adelete(["1", "2"])
|
||||||
documents = await vectorstore.asimilarity_search("foo", k=1)
|
documents = await vectorstore.asimilarity_search("foo", k=1)
|
||||||
assert documents == [Document(page_content="baz", metadata={"id": 3})]
|
assert documents == [Document(page_content="baz", metadata={"id": 3}, id="3")]
|
||||||
|
|
||||||
async def test_delete_missing_content(self, vectorstore: VectorStore) -> None:
|
async def test_delete_missing_content(self, vectorstore: VectorStore) -> None:
|
||||||
"""Deleting missing content should not raise an exception."""
|
"""Deleting missing content should not raise an exception."""
|
||||||
@ -250,25 +250,8 @@ class AsyncReadWriteTestSuite(ABC):
|
|||||||
await vectorstore.aadd_documents(documents, ids=["1", "2"])
|
await vectorstore.aadd_documents(documents, ids=["1", "2"])
|
||||||
documents = await vectorstore.asimilarity_search("bar", k=2)
|
documents = await vectorstore.asimilarity_search("bar", k=2)
|
||||||
assert documents == [
|
assert documents == [
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
Document(page_content="bar", metadata={"id": 2}, id="2"),
|
||||||
Document(page_content="foo", metadata={"id": 1}),
|
Document(page_content="foo", metadata={"id": 1}, id="1"),
|
||||||
]
|
|
||||||
|
|
||||||
async def test_add_documents_without_ids_gets_duplicated(
|
|
||||||
self, vectorstore: VectorStore
|
|
||||||
) -> None:
|
|
||||||
"""Adding documents without specifying IDs should duplicate content."""
|
|
||||||
documents = [
|
|
||||||
Document(page_content="foo", metadata={"id": 1}),
|
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
|
||||||
]
|
|
||||||
|
|
||||||
await vectorstore.aadd_documents(documents)
|
|
||||||
await vectorstore.aadd_documents(documents)
|
|
||||||
documents = await vectorstore.asimilarity_search("bar", k=2)
|
|
||||||
assert documents == [
|
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
async def test_add_documents_by_id_with_mutation(
|
async def test_add_documents_by_id_with_mutation(
|
||||||
@ -295,7 +278,9 @@ class AsyncReadWriteTestSuite(ABC):
|
|||||||
documents = await vectorstore.asimilarity_search("new foo", k=2)
|
documents = await vectorstore.asimilarity_search("new foo", k=2)
|
||||||
assert documents == [
|
assert documents == [
|
||||||
Document(
|
Document(
|
||||||
page_content="new foo", metadata={"id": 1, "some_other_field": "foo"}
|
id="1",
|
||||||
|
page_content="new foo",
|
||||||
|
metadata={"id": 1, "some_other_field": "foo"},
|
||||||
),
|
),
|
||||||
Document(page_content="bar", metadata={"id": 2}),
|
Document(id="2", page_content="bar", metadata={"id": 2}),
|
||||||
]
|
]
|
||||||
|
Loading…
Reference in New Issue
Block a user