From a1aa3a657c17e8ba72d238c578bd1f1f4dd21202 Mon Sep 17 00:00:00 2001 From: Jael Gu Date: Tue, 30 Jan 2024 03:19:50 +0800 Subject: [PATCH] community[patch]: Milvus supports add & delete texts by ids (#16256) # Description To support [langchain indexing](https://python.langchain.com/docs/modules/data_connection/indexing) as requested by users, vectorstore Milvus needs to support: - document addition by id (`add_documents` method with `ids` argument) - delete by id (`delete` method with `ids` argument) Example usage: ```python from langchain.indexes import SQLRecordManager, index from langchain.schema import Document from langchain_community.vectorstores import Milvus from langchain_openai import OpenAIEmbeddings collection_name = "test_index" embedding = OpenAIEmbeddings() vectorstore = Milvus(embedding_function=embedding, collection_name=collection_name) namespace = f"milvus/{collection_name}" record_manager = SQLRecordManager( namespace, db_url="sqlite:///record_manager_cache.sql" ) record_manager.create_schema() doc1 = Document(page_content="kitty", metadata={"source": "kitty.txt"}) doc2 = Document(page_content="doggy", metadata={"source": "doggy.txt"}) index( [doc1, doc1, doc2], record_manager, vectorstore, cleanup="incremental", # None, "incremental", or "full" source_id_key="source", ) ``` # Fix issues Fix https://github.com/milvus-io/milvus/issues/30112 --------- Signed-off-by: Jael Gu Co-authored-by: Bagatur --- .../modules/data_connection/indexing.ipynb | 2 +- .../vectorstores/milvus.py | 90 ++++++++++++++++--- .../vectorstores/zilliz.py | 13 ++- .../vectorstores/test_milvus.py | 30 ++++++- .../vectorstores/test_indexing_docs.py | 2 + 5 files changed, 124 insertions(+), 13 deletions(-) diff --git a/docs/docs/modules/data_connection/indexing.ipynb b/docs/docs/modules/data_connection/indexing.ipynb index fe0a9a0a263..b888a77958b 100644 --- a/docs/docs/modules/data_connection/indexing.ipynb +++ b/docs/docs/modules/data_connection/indexing.ipynb @@ -60,7 +60,7 
@@ " * document addition by id (`add_documents` method with `ids` argument)\n", " * delete by id (`delete` method with `ids` argument)\n", "\n", - "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `MyScale`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n", + "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n", " \n", "## Caution\n", "\n", diff --git a/libs/community/langchain_community/vectorstores/milvus.py b/libs/community/langchain_community/vectorstores/milvus.py index ed751271e0f..e72a82f2bd2 100644 --- a/libs/community/langchain_community/vectorstores/milvus.py +++ b/libs/community/langchain_community/vectorstores/milvus.py @@ -56,6 +56,9 @@ class Milvus(VectorStore): default of index. drop_old (Optional[bool]): Whether to drop the current collection. Defaults to False. + auto_id (bool): Whether to enable auto id for primary key. Defaults to False. + If False, you need to provide text ids (strings of at most 65535 bytes). + If True, Milvus will generate unique integers as primary keys. primary_field (str): Name of the primary key field. Defaults to "pk". text_field (str): Name of the text field. Defaults to "text". vector_field (str): Name of the vector field. Defaults to "vector". 
@@ -102,6 +105,7 @@ class Milvus(VectorStore): embedding_function = Embeddings, collection_name = "LangChainCollection", drop_old = True, + auto_id = True ) Raises: @@ -119,6 +123,7 @@ class Milvus(VectorStore): index_params: Optional[dict] = None, search_params: Optional[dict] = None, drop_old: Optional[bool] = False, + auto_id: bool = False, *, primary_field: str = "pk", text_field: str = "text", @@ -159,8 +164,9 @@ class Milvus(VectorStore): self.index_params = index_params self.search_params = search_params self.consistency_level = consistency_level + self.auto_id = auto_id - # In order for a collection to be compatible, pk needs to be auto'id and int + # In order for a collection to be compatible, pk needs to be varchar self._primary_field = primary_field # In order for compatibility, the text field will need to be called "text" self._text_field = text_field @@ -327,11 +333,22 @@ class Milvus(VectorStore): FieldSchema(self._text_field, DataType.VARCHAR, max_length=65_535) ) # Create the primary key field - fields.append( - FieldSchema( - self._primary_field, DataType.INT64, is_primary=True, auto_id=True + if self.auto_id: + fields.append( + FieldSchema( + self._primary_field, DataType.INT64, is_primary=True, auto_id=True + ) + ) + else: + fields.append( + FieldSchema( + self._primary_field, + DataType.VARCHAR, + is_primary=True, + auto_id=False, + max_length=65_535, + ) ) - ) # Create the vector field, supports binary or float vectors fields.append( FieldSchema(self._vector_field, infer_dtype_bydata(embeddings[0]), dim=dim) @@ -369,8 +386,6 @@ class Milvus(VectorStore): schema = self.col.schema for x in schema.fields: self.fields.append(x.name) - # Since primary field is auto-id, no need to track it - self.fields.remove(self._primary_field) def _get_index(self) -> Optional[dict[str, Any]]: """Return the vector index information if it exists""" @@ -467,6 +482,8 @@ class Milvus(VectorStore): metadatas: Optional[List[dict]] = None, timeout: Optional[int] = None, 
batch_size: int = 1000, + *, + ids: Optional[List[str]] = None, **kwargs: Any, ) -> List[str]: """Insert text data into Milvus. @@ -483,10 +500,12 @@ class Milvus(VectorStore): that they all fit in memory. metadatas (Optional[List[dict]]): Metadata dicts attached to each of the texts. Defaults to None. timeout (Optional[int]): Timeout for each batch insert. Defaults to None. batch_size (int, optional): Batch size to use for insertion. Defaults to 1000. + ids (Optional[List[str]]): List of text ids. The length of each item + should be less than 65535 bytes. Required and only used when auto_id is False. Raises: MilvusException: Failure to add texts @@ -497,6 +516,16 @@ class Milvus(VectorStore): from pymilvus import Collection, MilvusException texts = list(texts) + if not self.auto_id: + assert isinstance( + ids, list + ), "A list of valid ids are required when auto_id is False." + assert len(set(ids)) == len( + texts + ), "Different lengths of texts and unique ids are provided." + assert all( + len(x.encode()) <= 65_535 for x in ids + ), "Each id should be a string less than 65535 bytes." try: embeddings = self.embedding_func.embed_documents(texts) @@ -524,6 +553,9 @@ class Milvus(VectorStore): self._vector_field: embeddings, } + if not self.auto_id: + insert_dict[self._primary_field] = ids + if self._metadata_field is not None: for d in metadatas: insert_dict.setdefault(self._metadata_field, []).append(d) @@ -532,7 +564,12 @@ class Milvus(VectorStore): if metadatas is not None: for d in metadatas: for key, value in d.items(): - if key in self.fields: + keys = ( + [x for x in self.fields if x != self._primary_field] + if self.auto_id + else [x for x in self.fields] + ) + for key in keys: insert_dict.setdefault(key, []).append(value) # Total insert count @@ -700,7 +737,7 @@ class Milvus(VectorStore): param = self.search_params # Determine result metadata fields. 
- output_fields = self.fields[:] + output_fields = [x for x in self.fields if x != self._primary_field] output_fields.remove(self._vector_field) # Perform the search. @@ -864,6 +901,30 @@ class Milvus(VectorStore): ret.append(documents[x]) return ret + def delete( + self, ids: Optional[List[str]] = None, expr: Optional[str] = None, **kwargs: str + ): + """Delete by vector ID or boolean expression. + Refer to [Milvus documentation](https://milvus.io/docs/delete_data.md) + for notes and examples of expressions. + + Args: + ids: List of ids to delete. + expr: Boolean expression that specifies the entities to delete. + kwargs: Other parameters in Milvus delete api. + """ + if isinstance(ids, list) and len(ids) > 0: + expr = f"{self._primary_field} in {ids}" + if expr is not None: + logger.warning( + "Both ids and expr are provided. " "Ignore expr and delete by ids." + ) + else: + assert isinstance( + expr, str + ), "Either ids list or expr string must be provided." + return self.col.delete(expr=expr, **kwargs) + @classmethod def from_texts( cls, @@ -876,6 +937,8 @@ class Milvus(VectorStore): index_params: Optional[dict] = None, search_params: Optional[dict] = None, drop_old: bool = False, + *, + ids: Optional[List[str]] = None, **kwargs: Any, ) -> Milvus: """Create a Milvus collection, indexes it with HNSW, and insert data. @@ -897,10 +960,16 @@ class Milvus(VectorStore): Defaults to None. drop_old (Optional[bool], optional): Whether to drop the collection with that name if it exists. Defaults to False. + ids (Optional[List[str]]): List of text ids. Defaults to None. 
Returns: Milvus: Milvus Vector Store """ + if isinstance(ids, list) and len(ids) > 0: + auto_id = False + else: + auto_id = True + vector_db = cls( embedding_function=embedding, collection_name=collection_name, @@ -909,9 +978,10 @@ class Milvus(VectorStore): index_params=index_params, search_params=search_params, drop_old=drop_old, + auto_id=auto_id, **kwargs, ) - vector_db.add_texts(texts=texts, metadatas=metadatas) + vector_db.add_texts(texts=texts, metadatas=metadatas, ids=ids) return vector_db def _parse_document(self, data: dict) -> Document: diff --git a/libs/community/langchain_community/vectorstores/zilliz.py b/libs/community/langchain_community/vectorstores/zilliz.py index c66b9294e80..c6da0e86697 100644 --- a/libs/community/langchain_community/vectorstores/zilliz.py +++ b/libs/community/langchain_community/vectorstores/zilliz.py @@ -36,6 +36,9 @@ class Zilliz(Milvus): default of index. drop_old (Optional[bool]): Whether to drop the current collection. Defaults to False. + auto_id (bool): Whether to enable auto id for primary key. Defaults to False. + If False, you need to provide text ids (strings of at most 65535 bytes). + If True, Milvus will generate unique integers as primary keys. The connection args used for this class comes in the form of a dict, here are a few of the options: @@ -146,6 +149,9 @@ class Zilliz(Milvus): index_params: Optional[dict] = None, search_params: Optional[dict] = None, drop_old: bool = False, + *, + ids: Optional[List[str]] = None, + auto_id: bool = False, **kwargs: Any, ) -> Zilliz: """Create a Zilliz collection, indexes it with HNSW, and insert data. @@ -167,6 +173,10 @@ class Zilliz(Milvus): Defaults to None. drop_old (Optional[bool], optional): Whether to drop the collection with that name if it exists. Defaults to False. + ids (Optional[List[str]]): List of text ids. + auto_id (bool): Whether to enable auto id for primary key. Defaults to + False. If False, you need to provide text ids (strings of at most 65535 + bytes). 
If True, Milvus will generate unique integers as primary keys. Returns: Zilliz: Zilliz Vector Store @@ -179,7 +189,8 @@ class Zilliz(Milvus): index_params=index_params, search_params=search_params, drop_old=drop_old, + auto_id=auto_id, **kwargs, ) - vector_db.add_texts(texts=texts, metadatas=metadatas) + vector_db.add_texts(texts=texts, metadatas=metadatas, ids=ids) return vector_db diff --git a/libs/community/tests/integration_tests/vectorstores/test_milvus.py b/libs/community/tests/integration_tests/vectorstores/test_milvus.py index d30972595ef..807edcdb6e4 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_milvus.py +++ b/libs/community/tests/integration_tests/vectorstores/test_milvus.py @@ -11,12 +11,15 @@ from tests.integration_tests.vectorstores.fake_embeddings import ( def _milvus_from_texts( - metadatas: Optional[List[dict]] = None, drop: bool = True + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + drop: bool = True, ) -> Milvus: return Milvus.from_texts( fake_texts, FakeEmbeddings(), metadatas=metadatas, + ids=ids, connection_args={"host": "127.0.0.1", "port": "19530"}, drop_old=drop, ) @@ -29,6 +32,30 @@ def test_milvus() -> None: assert output == [Document(page_content="foo")] +def test_milvus_with_metadata() -> None: + """Test with metadata""" + docsearch = _milvus_from_texts(metadatas=[{"label": "test"}] * len(fake_texts)) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"label": "test"})] + + +def test_milvus_with_id() -> None: + """Test with ids""" + ids = ["id_" + str(i) for i in range(len(fake_texts))] + docsearch = _milvus_from_texts(ids=ids) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + output = docsearch.delete(ids=ids) + assert output.delete_count == len(fake_texts) + + try: + ids = ["dup_id" for _ in fake_texts] + _milvus_from_texts(ids=ids) + except Exception as e: + assert 
isinstance(e, AssertionError) + + def test_milvus_with_score() -> None: """Test end to end construction and search with scores and IDs.""" texts = ["foo", "bar", "baz"] @@ -84,6 +111,7 @@ def test_milvus_no_drop() -> None: # if __name__ == "__main__": # test_milvus() +# test_milvus_with_metadata() # test_milvus_with_score() # test_milvus_max_marginal_relevance_search() # test_milvus_add_extra() diff --git a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py index 9a1e438d2f0..1232d6bb9a6 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py +++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py @@ -61,6 +61,7 @@ def test_compatible_vectorstore_documentation() -> None: "ElasticsearchStore", "FAISS", "HanaDB", + "Milvus", "MomentoVectorIndex", "MyScale", "PGVector", @@ -78,6 +79,7 @@ def test_compatible_vectorstore_documentation() -> None: "VespaStore", "Weaviate", "ZepVectorStore", + "Zilliz", "Lantern", } assert compatible == documented