diff --git a/langchain/vectorstores/base.py b/langchain/vectorstores/base.py index 6bc5daa03ab..614eccb4ece 100644 --- a/langchain/vectorstores/base.py +++ b/langchain/vectorstores/base.py @@ -48,6 +48,21 @@ class VectorStore(ABC): List of ids from adding the texts into the vectorstore. """ + def delete(self, ids: List[str]) -> Optional[bool]: + """Delete by vector ID. + + Args: + ids: List of ids to delete. + + Returns: + Optional[bool]: True if deletion is successful, + False otherwise, None if not implemented. + """ + + raise NotImplementedError( + "delete method must be implemented by subclass." + ) + async def aadd_texts( self, texts: Iterable[str], diff --git a/langchain/vectorstores/chroma.py b/langchain/vectorstores/chroma.py index 59801f2b5ab..a3679c09f3c 100644 --- a/langchain/vectorstores/chroma.py +++ b/langchain/vectorstores/chroma.py @@ -146,7 +146,7 @@ class Chroma(VectorStore): embeddings = None if self._embedding_function is not None: embeddings = self._embedding_function.embed_documents(list(texts)) - self._collection.add( + self._collection.upsert( metadatas=metadatas, embeddings=embeddings, documents=texts, ids=ids ) return ids @@ -442,3 +442,11 @@ class Chroma(VectorStore): client_settings=client_settings, client=client, ) + + def delete(self, ids: List[str]) -> None: + """Delete by vector IDs. + + Args: + ids: List of ids to delete. + """ + self._collection.delete(ids=ids) diff --git a/langchain/vectorstores/elastic_vector_search.py b/langchain/vectorstores/elastic_vector_search.py index e9cdd7c592a..8d453538f8a 100644 --- a/langchain/vectorstores/elastic_vector_search.py +++ b/langchain/vectorstores/elastic_vector_search.py @@ -158,6 +158,7 @@ class ElasticVectorSearch(VectorStore, ABC): texts: Iterable[str], metadatas: Optional[List[dict]] = None, refresh_indices: bool = True, + ids: Optional[List[str]] = None, **kwargs: Any, ) -> List[str]: """Run more texts through the embeddings and add to the vectorstore. 
@@ -179,7 +180,7 @@ class ElasticVectorSearch(VectorStore, ABC): "Please install it with `pip install elasticsearch`." ) requests = [] - ids = [] + ids = ids or [str(uuid.uuid4()) for _ in texts] embeddings = self.embedding.embed_documents(list(texts)) dim = len(embeddings[0]) mapping = _default_text_mapping(dim) @@ -194,16 +195,14 @@ class ElasticVectorSearch(VectorStore, ABC): for i, text in enumerate(texts): metadata = metadatas[i] if metadatas else {} - _id = str(uuid.uuid4()) request = { "_op_type": "index", "_index": self.index_name, "vector": embeddings[i], "text": text, "metadata": metadata, - "_id": _id, + "_id": ids[i], } - ids.append(_id) requests.append(request) bulk(self.client, requests) @@ -318,6 +317,17 @@ class ElasticVectorSearch(VectorStore, ABC): ) return response + def delete(self, ids: List[str]) -> None: + """Delete by vector IDs. + + Args: + ids: List of ids to delete. + """ + + # TODO: Check if this can be done in bulk + for id in ids: + self.client.delete(index=self.index_name, id=id) + class ElasticKnnSearch(ElasticVectorSearch): """ diff --git a/langchain/vectorstores/pinecone.py b/langchain/vectorstores/pinecone.py index d34264dd44a..84dfa02bd17 100644 --- a/langchain/vectorstores/pinecone.py +++ b/langchain/vectorstores/pinecone.py @@ -353,3 +353,16 @@ class Pinecone(VectorStore): return cls( pinecone.Index(index_name), embedding.embed_query, text_key, namespace ) + + def delete(self, ids: List[str]) -> None: + """Delete by vector IDs. + + Args: + ids: List of ids to delete. 
+ """ + + # This is the maximum number of IDs that can be deleted + chunk_size = 1000 + for i in range(0, len(ids), chunk_size): + chunk = ids[i : i + chunk_size] + self._index.delete(ids=chunk) diff --git a/langchain/vectorstores/redis.py b/langchain/vectorstores/redis.py index af23592ca98..f6bb3d85556 100644 --- a/langchain/vectorstores/redis.py +++ b/langchain/vectorstores/redis.py @@ -187,7 +187,6 @@ class Redis(VectorStore): texts: Iterable[str], metadatas: Optional[List[dict]] = None, embeddings: Optional[List[List[float]]] = None, - keys: Optional[List[str]] = None, batch_size: int = 1000, **kwargs: Any, ) -> List[str]: @@ -199,7 +198,7 @@ class Redis(VectorStore): Defaults to None. embeddings (Optional[List[List[float]]], optional): Optional pre-generated embeddings. Defaults to None. - keys (Optional[List[str]], optional): Optional key values to use as ids. + keys (List[str]) or ids (List[str]): Identifiers of entries. Defaults to None. batch_size (int, optional): Batch size to use for writes. Defaults to 1000. @@ -209,11 +208,15 @@ class Redis(VectorStore): ids = [] prefix = _redis_prefix(self.index_name) + # Get keys or ids from kwargs + # Other vectorstores use ids + keys_or_ids = kwargs.get("keys", kwargs.get("ids")) + # Write data to redis pipeline = self.client.pipeline(transaction=False) for i, text in enumerate(texts): # Use provided values by default or fallback - key = keys[i] if keys else _redis_key(prefix) + key = keys_or_ids[i] if keys_or_ids else _redis_key(prefix) metadata = metadatas[i] if metadatas else {} embedding = embeddings[i] if embeddings else self.embedding_function(text) pipeline.hset( @@ -461,19 +464,23 @@ class Redis(VectorStore): @staticmethod def delete( - keys: List[str], + ids: List[str], **kwargs: Any, ) -> bool: """ Delete a Redis entry. Args: - keys (List[str]): Keys of entries to delete. + ids: List of ids (keys) to delete. Returns: bool: Whether or not the deletions were successful. 
""" redis_url = get_from_dict_or_env(kwargs, "redis_url", "REDIS_URL") + + if ids is None: + raise ValueError("'ids' (keys)() were not provided.") + try: import redis except ImportError: @@ -491,11 +498,11 @@ class Redis(VectorStore): raise ValueError(f"Your redis connected error: {e}") # Check if index exists try: - client.delete(*keys) + client.delete(*ids) logger.info("Entries deleted") return True except: # noqa: E722 - # Keys not exist + # ids does not exist return False @staticmethod diff --git a/langchain/vectorstores/supabase.py b/langchain/vectorstores/supabase.py index 72a062de60c..94a2c199c9d 100644 --- a/langchain/vectorstores/supabase.py +++ b/langchain/vectorstores/supabase.py @@ -1,5 +1,6 @@ from __future__ import annotations +import uuid from itertools import repeat from typing import ( TYPE_CHECKING, @@ -70,12 +71,14 @@ class SupabaseVectorStore(VectorStore): self, texts: Iterable[str], metadatas: Optional[List[dict[Any, Any]]] = None, + ids: Optional[List[str]] = None, **kwargs: Any, ) -> List[str]: + ids = ids or [str(uuid.uuid4()) for _ in texts] docs = self._texts_to_documents(texts, metadatas) vectors = self._embedding.embed_documents(list(texts)) - return self.add_vectors(vectors, docs) + return self.add_vectors(vectors, docs, ids) @classmethod def from_texts( @@ -86,6 +89,7 @@ class SupabaseVectorStore(VectorStore): client: Optional[supabase.client.Client] = None, table_name: Optional[str] = "documents", query_name: Union[str, None] = "match_documents", + ids: Optional[List[str]] = None, **kwargs: Any, ) -> "SupabaseVectorStore": """Return VectorStore initialized from texts and embeddings.""" @@ -97,8 +101,9 @@ class SupabaseVectorStore(VectorStore): raise ValueError("Supabase document table_name is required.") embeddings = embedding.embed_documents(texts) + ids = [str(uuid.uuid4()) for _ in texts] docs = cls._texts_to_documents(texts, metadatas) - _ids = cls._add_vectors(client, table_name, embeddings, docs) + _ids = 
cls._add_vectors(client, table_name, embeddings, docs, ids) return cls( client=client, @@ -108,9 +113,12 @@ class SupabaseVectorStore(VectorStore): ) def add_vectors( - self, vectors: List[List[float]], documents: List[Document] + self, + vectors: List[List[float]], + documents: List[Document], + ids: List[str], ) -> List[str]: - return self._add_vectors(self._client, self.table_name, vectors, documents) + return self._add_vectors(self._client, self.table_name, vectors, documents, ids) def similarity_search( self, query: str, k: int = 4, **kwargs: Any @@ -200,11 +208,13 @@ class SupabaseVectorStore(VectorStore): table_name: str, vectors: List[List[float]], documents: List[Document], + ids: List[str], ) -> List[str]: """Add vectors to Supabase table.""" rows: List[dict[str, Any]] = [ { + "id": ids[idx], "content": documents[idx].page_content, "embedding": embedding, "metadata": documents[idx].metadata, # type: ignore @@ -219,7 +229,7 @@ class SupabaseVectorStore(VectorStore): for i in range(0, len(rows), chunk_size): chunk = rows[i : i + chunk_size] - result = client.from_(table_name).insert(chunk).execute() # type: ignore + result = client.from_(table_name).upsert(chunk).execute() # type: ignore if len(result.data) == 0: raise Exception("Error inserting: No rows added") @@ -335,3 +345,20 @@ class SupabaseVectorStore(VectorStore): embedding[0], k, fetch_k, lambda_mult=lambda_mult ) return docs + + def delete(self, ids: List[str]) -> None: + """Delete by vector IDs. + + Args: + ids: List of ids to delete. 
+ """ + rows: List[dict[str, Any]] = [ + { + "id": id, + } + for id in ids + ] + + # TODO: Check if this can be done in bulk + for row in rows: + self._client.from_(self.table_name).delete().eq("id", row["id"]).execute() diff --git a/langchain/vectorstores/weaviate.py b/langchain/vectorstores/weaviate.py index 43501452053..d77ff298a62 100644 --- a/langchain/vectorstores/weaviate.py +++ b/langchain/vectorstores/weaviate.py @@ -135,11 +135,15 @@ class Weaviate(VectorStore): for key, val in metadatas[i].items(): data_properties[key] = _json_serializable(val) + # Allow for ids (consistent w/ other methods) + # # Or uuids (backwards compatble w/ existing arg) # If the UUID of one of the objects already exists # then the existing object will be replaced by the new object. - _id = ( - kwargs["uuids"][i] if "uuids" in kwargs else get_valid_uuid(uuid4()) - ) + _id = get_valid_uuid(uuid4()) + if "uuids" in kwargs: + _id = kwargs["uuids"][i] + elif "ids" in kwargs: + _id = kwargs["ids"][i] if self._embedding is not None: vector = self._embedding.embed_documents([text])[0] @@ -465,3 +469,14 @@ class Weaviate(VectorStore): relevance_score_fn=relevance_score_fn, by_text=by_text, ) + + def delete(self, ids: List[str]) -> None: + """Delete by vector IDs. + + Args: + ids: List of ids to delete. + """ + + # TODO: Check if this can be done in bulk + for id in ids: + self._client.data_object.delete(uuid=id)