core[minor]: add upsert, streaming_upsert, aupsert, astreaming_upsert methods to the VectorStore abstraction (#23774)

This PR rolls out part of the new proposed interface for vectorstores
(https://github.com/langchain-ai/langchain/pull/23544) to existing store
implementations.

The PR makes the following changes:

1. Adds standard upsert, streaming_upsert, aupsert, astreaming_upsert
methods to the vectorstore.
2. Updates `add_texts` and `aadd_texts` to be non required with a
default implementation that delegates to `upsert` and `aupsert` if those
have been implemented. The original `add_texts` and `aadd_texts` methods
are problematic as they spread object specific information across
document and **kwargs. (e.g., ids are not a part of the document)
3. Adds a default implementation to `add_documents` and `aadd_documents`
that delegates to `upsert` and `aupsert` respectively.
4. Adds standard unit tests to verify that a given vectorstore
implements a correct read/write API.

A downside of this implementation is that it creates `upsert` with a
very similar signature to `add_documents`.
The reason for introducing `upsert` is to:
* Remove any ambiguities about what information is allowed in `kwargs`.
Specifically kwargs should only be used for information common to all
indexed data. (e.g., indexing timeout).
*Allow inheriting from an anticipated generalized interface for indexing
that will allow indexing `BaseMedia` (i.e., allow making a vectorstore
for images/audio etc.)
 
`add_documents` can be deprecated in the future in favor of `upsert` to
make sure that users have a single correct way of indexing content.

---------

Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
Eugene Yurtsev
2024-07-05 12:21:40 -04:00
committed by GitHub
parent 3c752238c5
commit 6f08e11d7c
14 changed files with 667 additions and 83 deletions

View File

@@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tupl
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.indexing import UpsertResponse
from langchain_core.load import dumpd, load
from langchain_core.vectorstores import VectorStore
@@ -37,27 +38,41 @@ class InMemoryVectorStore(VectorStore):
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
self.delete(ids)
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[Sequence[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Add texts to the store."""
vectors = self.embedding.embed_documents(list(texts))
ids_ = []
for i, text in enumerate(texts):
doc_id = ids[i] if ids else str(uuid.uuid4())
ids_.append(doc_id)
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
vectors = self.embedding.embed_documents([item.page_content for item in items])
ids = []
for item, vector in zip(items, vectors):
doc_id = item.id if item.id else str(uuid.uuid4())
ids.append(doc_id)
self.store[doc_id] = {
"id": doc_id,
"vector": vectors[i],
"text": text,
"metadata": metadatas[i] if metadatas else {},
"vector": vector,
"text": item.page_content,
"metadata": item.metadata,
}
return ids_
return {
"succeeded": ids,
"failed": [],
}
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
"""Get documents by their ids."""
documents = []
for doc_id in ids:
doc = self.store.get(doc_id)
if doc:
documents.append(
Document(
id=doc["id"],
page_content=doc["text"],
metadata=doc["metadata"],
)
)
return documents
async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
return self.get_by_ids(ids)
async def aadd_texts(
self,
@@ -80,7 +95,9 @@ class InMemoryVectorStore(VectorStore):
similarity = float(cosine_similarity([embedding], [vector]).item(0))
result.append(
(
Document(page_content=doc["text"], metadata=doc["metadata"]),
Document(
id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
),
similarity,
vector,
)

View File

@@ -1053,7 +1053,7 @@ class Milvus(VectorStore):
pks = [item.get(self._primary_field) for item in query_result]
return pks
def upsert(
def upsert( # type: ignore[override]
self,
ids: Optional[List[str]] = None,
documents: List[Document] | None = None,