mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 00:23:25 +00:00
Add delete and ensure add_texts performs upsert (w/ ID optional) (#6126)
## Goal We want to ensure consistency across vectordbs: 1/ add `delete` by ID method to the base vectorstore class 2/ ensure `add_texts` performs `upsert` with ID optionally passed ## Testing - [x] Pinecone: notebook test w/ `langchain_test` vectorstore. - [x] Chroma: Review by @jeffchuber, notebook test w/ in-memory vectorstore. - [x] Supabase: Review by @copple, notebook test w/ `langchain_test` table. - [x] Weaviate: Notebook test w/ `langchain_test` index. - [x] Elastic: Reviewed by @vestal. Notebook test w/ `langchain_test` table. - [ ] Redis: Asked for review from owner of recent `delete` method https://github.com/hwchase17/langchain/pull/6222
This commit is contained in:
parent
393f469eb3
commit
be02572d58
@ -48,6 +48,21 @@ class VectorStore(ABC):
|
|||||||
List of ids from adding the texts into the vectorstore.
|
List of ids from adding the texts into the vectorstore.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def delete(self, ids: List[str]) -> Optional[bool]:
|
||||||
|
"""Delete by vector ID.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ids: List of ids to delete.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Optional[bool]: True if deletion is successful,
|
||||||
|
False otherwise, None if not implemented.
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError(
|
||||||
|
"delete_by_id method must be implemented by subclass."
|
||||||
|
)
|
||||||
|
|
||||||
async def aadd_texts(
|
async def aadd_texts(
|
||||||
self,
|
self,
|
||||||
texts: Iterable[str],
|
texts: Iterable[str],
|
||||||
|
@ -146,7 +146,7 @@ class Chroma(VectorStore):
|
|||||||
embeddings = None
|
embeddings = None
|
||||||
if self._embedding_function is not None:
|
if self._embedding_function is not None:
|
||||||
embeddings = self._embedding_function.embed_documents(list(texts))
|
embeddings = self._embedding_function.embed_documents(list(texts))
|
||||||
self._collection.add(
|
self._collection.upsert(
|
||||||
metadatas=metadatas, embeddings=embeddings, documents=texts, ids=ids
|
metadatas=metadatas, embeddings=embeddings, documents=texts, ids=ids
|
||||||
)
|
)
|
||||||
return ids
|
return ids
|
||||||
@ -442,3 +442,11 @@ class Chroma(VectorStore):
|
|||||||
client_settings=client_settings,
|
client_settings=client_settings,
|
||||||
client=client,
|
client=client,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def delete(self, ids: List[str]) -> None:
|
||||||
|
"""Delete by vector IDs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ids: List of ids to delete.
|
||||||
|
"""
|
||||||
|
self._collection.delete(ids=ids)
|
||||||
|
@ -158,6 +158,7 @@ class ElasticVectorSearch(VectorStore, ABC):
|
|||||||
texts: Iterable[str],
|
texts: Iterable[str],
|
||||||
metadatas: Optional[List[dict]] = None,
|
metadatas: Optional[List[dict]] = None,
|
||||||
refresh_indices: bool = True,
|
refresh_indices: bool = True,
|
||||||
|
ids: Optional[List[str]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""Run more texts through the embeddings and add to the vectorstore.
|
"""Run more texts through the embeddings and add to the vectorstore.
|
||||||
@ -179,7 +180,7 @@ class ElasticVectorSearch(VectorStore, ABC):
|
|||||||
"Please install it with `pip install elasticsearch`."
|
"Please install it with `pip install elasticsearch`."
|
||||||
)
|
)
|
||||||
requests = []
|
requests = []
|
||||||
ids = []
|
ids = ids or [str(uuid.uuid4()) for _ in texts]
|
||||||
embeddings = self.embedding.embed_documents(list(texts))
|
embeddings = self.embedding.embed_documents(list(texts))
|
||||||
dim = len(embeddings[0])
|
dim = len(embeddings[0])
|
||||||
mapping = _default_text_mapping(dim)
|
mapping = _default_text_mapping(dim)
|
||||||
@ -194,16 +195,14 @@ class ElasticVectorSearch(VectorStore, ABC):
|
|||||||
|
|
||||||
for i, text in enumerate(texts):
|
for i, text in enumerate(texts):
|
||||||
metadata = metadatas[i] if metadatas else {}
|
metadata = metadatas[i] if metadatas else {}
|
||||||
_id = str(uuid.uuid4())
|
|
||||||
request = {
|
request = {
|
||||||
"_op_type": "index",
|
"_op_type": "index",
|
||||||
"_index": self.index_name,
|
"_index": self.index_name,
|
||||||
"vector": embeddings[i],
|
"vector": embeddings[i],
|
||||||
"text": text,
|
"text": text,
|
||||||
"metadata": metadata,
|
"metadata": metadata,
|
||||||
"_id": _id,
|
"_id": ids[i],
|
||||||
}
|
}
|
||||||
ids.append(_id)
|
|
||||||
requests.append(request)
|
requests.append(request)
|
||||||
bulk(self.client, requests)
|
bulk(self.client, requests)
|
||||||
|
|
||||||
@ -318,6 +317,17 @@ class ElasticVectorSearch(VectorStore, ABC):
|
|||||||
)
|
)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
def delete(self, ids: List[str]) -> None:
|
||||||
|
"""Delete by vector IDs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ids: List of ids to delete.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# TODO: Check if this can be done in bulk
|
||||||
|
for id in ids:
|
||||||
|
self.client.delete(index=self.index_name, id=id)
|
||||||
|
|
||||||
|
|
||||||
class ElasticKnnSearch(ElasticVectorSearch):
|
class ElasticKnnSearch(ElasticVectorSearch):
|
||||||
"""
|
"""
|
||||||
|
@ -353,3 +353,16 @@ class Pinecone(VectorStore):
|
|||||||
return cls(
|
return cls(
|
||||||
pinecone.Index(index_name), embedding.embed_query, text_key, namespace
|
pinecone.Index(index_name), embedding.embed_query, text_key, namespace
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def delete(self, ids: List[str]) -> None:
|
||||||
|
"""Delete by vector IDs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ids: List of ids to delete.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# This is the maximum number of IDs that can be deleted
|
||||||
|
chunk_size = 1000
|
||||||
|
for i in range(0, len(ids), chunk_size):
|
||||||
|
chunk = ids[i : i + chunk_size]
|
||||||
|
self._index.delete(ids=chunk)
|
||||||
|
@ -187,7 +187,6 @@ class Redis(VectorStore):
|
|||||||
texts: Iterable[str],
|
texts: Iterable[str],
|
||||||
metadatas: Optional[List[dict]] = None,
|
metadatas: Optional[List[dict]] = None,
|
||||||
embeddings: Optional[List[List[float]]] = None,
|
embeddings: Optional[List[List[float]]] = None,
|
||||||
keys: Optional[List[str]] = None,
|
|
||||||
batch_size: int = 1000,
|
batch_size: int = 1000,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
@ -199,7 +198,7 @@ class Redis(VectorStore):
|
|||||||
Defaults to None.
|
Defaults to None.
|
||||||
embeddings (Optional[List[List[float]]], optional): Optional pre-generated
|
embeddings (Optional[List[List[float]]], optional): Optional pre-generated
|
||||||
embeddings. Defaults to None.
|
embeddings. Defaults to None.
|
||||||
keys (Optional[List[str]], optional): Optional key values to use as ids.
|
keys (List[str]) or ids (List[str]): Identifiers of entries.
|
||||||
Defaults to None.
|
Defaults to None.
|
||||||
batch_size (int, optional): Batch size to use for writes. Defaults to 1000.
|
batch_size (int, optional): Batch size to use for writes. Defaults to 1000.
|
||||||
|
|
||||||
@ -209,11 +208,15 @@ class Redis(VectorStore):
|
|||||||
ids = []
|
ids = []
|
||||||
prefix = _redis_prefix(self.index_name)
|
prefix = _redis_prefix(self.index_name)
|
||||||
|
|
||||||
|
# Get keys or ids from kwargs
|
||||||
|
# Other vectorstores use ids
|
||||||
|
keys_or_ids = kwargs.get("keys", kwargs.get("ids"))
|
||||||
|
|
||||||
# Write data to redis
|
# Write data to redis
|
||||||
pipeline = self.client.pipeline(transaction=False)
|
pipeline = self.client.pipeline(transaction=False)
|
||||||
for i, text in enumerate(texts):
|
for i, text in enumerate(texts):
|
||||||
# Use provided values by default or fallback
|
# Use provided values by default or fallback
|
||||||
key = keys[i] if keys else _redis_key(prefix)
|
key = keys_or_ids[i] if keys_or_ids else _redis_key(prefix)
|
||||||
metadata = metadatas[i] if metadatas else {}
|
metadata = metadatas[i] if metadatas else {}
|
||||||
embedding = embeddings[i] if embeddings else self.embedding_function(text)
|
embedding = embeddings[i] if embeddings else self.embedding_function(text)
|
||||||
pipeline.hset(
|
pipeline.hset(
|
||||||
@ -461,19 +464,23 @@ class Redis(VectorStore):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def delete(
|
def delete(
|
||||||
keys: List[str],
|
ids: List[str],
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Delete a Redis entry.
|
Delete a Redis entry.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
keys (List[str]): Keys of entries to delete.
|
ids: List of ids (keys) to delete.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
bool: Whether or not the deletions were successful.
|
bool: Whether or not the deletions were successful.
|
||||||
"""
|
"""
|
||||||
redis_url = get_from_dict_or_env(kwargs, "redis_url", "REDIS_URL")
|
redis_url = get_from_dict_or_env(kwargs, "redis_url", "REDIS_URL")
|
||||||
|
|
||||||
|
if ids is None:
|
||||||
|
raise ValueError("'ids' (keys)() were not provided.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import redis
|
import redis
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -491,11 +498,11 @@ class Redis(VectorStore):
|
|||||||
raise ValueError(f"Your redis connected error: {e}")
|
raise ValueError(f"Your redis connected error: {e}")
|
||||||
# Check if index exists
|
# Check if index exists
|
||||||
try:
|
try:
|
||||||
client.delete(*keys)
|
client.delete(*ids)
|
||||||
logger.info("Entries deleted")
|
logger.info("Entries deleted")
|
||||||
return True
|
return True
|
||||||
except: # noqa: E722
|
except: # noqa: E722
|
||||||
# Keys not exist
|
# ids does not exist
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
from itertools import repeat
|
from itertools import repeat
|
||||||
from typing import (
|
from typing import (
|
||||||
TYPE_CHECKING,
|
TYPE_CHECKING,
|
||||||
@ -70,12 +71,14 @@ class SupabaseVectorStore(VectorStore):
|
|||||||
self,
|
self,
|
||||||
texts: Iterable[str],
|
texts: Iterable[str],
|
||||||
metadatas: Optional[List[dict[Any, Any]]] = None,
|
metadatas: Optional[List[dict[Any, Any]]] = None,
|
||||||
|
ids: Optional[List[str]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
|
ids = ids or [str(uuid.uuid4()) for _ in texts]
|
||||||
docs = self._texts_to_documents(texts, metadatas)
|
docs = self._texts_to_documents(texts, metadatas)
|
||||||
|
|
||||||
vectors = self._embedding.embed_documents(list(texts))
|
vectors = self._embedding.embed_documents(list(texts))
|
||||||
return self.add_vectors(vectors, docs)
|
return self.add_vectors(vectors, docs, ids)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_texts(
|
def from_texts(
|
||||||
@ -86,6 +89,7 @@ class SupabaseVectorStore(VectorStore):
|
|||||||
client: Optional[supabase.client.Client] = None,
|
client: Optional[supabase.client.Client] = None,
|
||||||
table_name: Optional[str] = "documents",
|
table_name: Optional[str] = "documents",
|
||||||
query_name: Union[str, None] = "match_documents",
|
query_name: Union[str, None] = "match_documents",
|
||||||
|
ids: Optional[List[str]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> "SupabaseVectorStore":
|
) -> "SupabaseVectorStore":
|
||||||
"""Return VectorStore initialized from texts and embeddings."""
|
"""Return VectorStore initialized from texts and embeddings."""
|
||||||
@ -97,8 +101,9 @@ class SupabaseVectorStore(VectorStore):
|
|||||||
raise ValueError("Supabase document table_name is required.")
|
raise ValueError("Supabase document table_name is required.")
|
||||||
|
|
||||||
embeddings = embedding.embed_documents(texts)
|
embeddings = embedding.embed_documents(texts)
|
||||||
|
ids = [str(uuid.uuid4()) for _ in texts]
|
||||||
docs = cls._texts_to_documents(texts, metadatas)
|
docs = cls._texts_to_documents(texts, metadatas)
|
||||||
_ids = cls._add_vectors(client, table_name, embeddings, docs)
|
_ids = cls._add_vectors(client, table_name, embeddings, docs, ids)
|
||||||
|
|
||||||
return cls(
|
return cls(
|
||||||
client=client,
|
client=client,
|
||||||
@ -108,9 +113,12 @@ class SupabaseVectorStore(VectorStore):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def add_vectors(
|
def add_vectors(
|
||||||
self, vectors: List[List[float]], documents: List[Document]
|
self,
|
||||||
|
vectors: List[List[float]],
|
||||||
|
documents: List[Document],
|
||||||
|
ids: List[str],
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
return self._add_vectors(self._client, self.table_name, vectors, documents)
|
return self._add_vectors(self._client, self.table_name, vectors, documents, ids)
|
||||||
|
|
||||||
def similarity_search(
|
def similarity_search(
|
||||||
self, query: str, k: int = 4, **kwargs: Any
|
self, query: str, k: int = 4, **kwargs: Any
|
||||||
@ -200,11 +208,13 @@ class SupabaseVectorStore(VectorStore):
|
|||||||
table_name: str,
|
table_name: str,
|
||||||
vectors: List[List[float]],
|
vectors: List[List[float]],
|
||||||
documents: List[Document],
|
documents: List[Document],
|
||||||
|
ids: List[str],
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""Add vectors to Supabase table."""
|
"""Add vectors to Supabase table."""
|
||||||
|
|
||||||
rows: List[dict[str, Any]] = [
|
rows: List[dict[str, Any]] = [
|
||||||
{
|
{
|
||||||
|
"id": ids[idx],
|
||||||
"content": documents[idx].page_content,
|
"content": documents[idx].page_content,
|
||||||
"embedding": embedding,
|
"embedding": embedding,
|
||||||
"metadata": documents[idx].metadata, # type: ignore
|
"metadata": documents[idx].metadata, # type: ignore
|
||||||
@ -219,7 +229,7 @@ class SupabaseVectorStore(VectorStore):
|
|||||||
for i in range(0, len(rows), chunk_size):
|
for i in range(0, len(rows), chunk_size):
|
||||||
chunk = rows[i : i + chunk_size]
|
chunk = rows[i : i + chunk_size]
|
||||||
|
|
||||||
result = client.from_(table_name).insert(chunk).execute() # type: ignore
|
result = client.from_(table_name).upsert(chunk).execute() # type: ignore
|
||||||
|
|
||||||
if len(result.data) == 0:
|
if len(result.data) == 0:
|
||||||
raise Exception("Error inserting: No rows added")
|
raise Exception("Error inserting: No rows added")
|
||||||
@ -335,3 +345,20 @@ class SupabaseVectorStore(VectorStore):
|
|||||||
embedding[0], k, fetch_k, lambda_mult=lambda_mult
|
embedding[0], k, fetch_k, lambda_mult=lambda_mult
|
||||||
)
|
)
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
def delete(self, ids: List[str]) -> None:
|
||||||
|
"""Delete by vector IDs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ids: List of ids to delete.
|
||||||
|
"""
|
||||||
|
rows: List[dict[str, Any]] = [
|
||||||
|
{
|
||||||
|
"id": id,
|
||||||
|
}
|
||||||
|
for id in ids
|
||||||
|
]
|
||||||
|
|
||||||
|
# TODO: Check if this can be done in bulk
|
||||||
|
for row in rows:
|
||||||
|
self._client.from_(self.table_name).delete().eq("id", row["id"]).execute()
|
||||||
|
@ -135,11 +135,15 @@ class Weaviate(VectorStore):
|
|||||||
for key, val in metadatas[i].items():
|
for key, val in metadatas[i].items():
|
||||||
data_properties[key] = _json_serializable(val)
|
data_properties[key] = _json_serializable(val)
|
||||||
|
|
||||||
|
# Allow for ids (consistent w/ other methods)
|
||||||
|
# Or uuids (backwards compatible w/ existing arg)
|
||||||
# If the UUID of one of the objects already exists
|
# If the UUID of one of the objects already exists
|
||||||
# then the existing object will be replaced by the new object.
|
# then the existing object will be replaced by the new object.
|
||||||
_id = (
|
_id = get_valid_uuid(uuid4())
|
||||||
kwargs["uuids"][i] if "uuids" in kwargs else get_valid_uuid(uuid4())
|
if "uuids" in kwargs:
|
||||||
)
|
_id = kwargs["uuids"][i]
|
||||||
|
elif "ids" in kwargs:
|
||||||
|
_id = kwargs["ids"][i]
|
||||||
|
|
||||||
if self._embedding is not None:
|
if self._embedding is not None:
|
||||||
vector = self._embedding.embed_documents([text])[0]
|
vector = self._embedding.embed_documents([text])[0]
|
||||||
@ -465,3 +469,14 @@ class Weaviate(VectorStore):
|
|||||||
relevance_score_fn=relevance_score_fn,
|
relevance_score_fn=relevance_score_fn,
|
||||||
by_text=by_text,
|
by_text=by_text,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def delete(self, ids: List[str]) -> None:
|
||||||
|
"""Delete by vector IDs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ids: List of ids to delete.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# TODO: Check if this can be done in bulk
|
||||||
|
for id in ids:
|
||||||
|
self._client.data_object.delete(uuid=id)
|
||||||
|
Loading…
Reference in New Issue
Block a user