From a1aa3a657c17e8ba72d238c578bd1f1f4dd21202 Mon Sep 17 00:00:00 2001
From: Jael Gu <mjaelgu@hotmail.com>
Date: Tue, 30 Jan 2024 03:19:50 +0800
Subject: [PATCH] community[patch]: Milvus supports add & delete texts by ids
 (#16256)

# Description

To support [langchain
indexing](https://python.langchain.com/docs/modules/data_connection/indexing)
as requested by users, vectorstore Milvus needs to support:
- document addition by id (`add_documents` method with `ids` argument)
- delete by id (`delete` method with `ids` argument)

Example usage:

```python
from langchain.indexes import SQLRecordManager, index
from langchain.schema import Document
from langchain_community.vectorstores import Milvus
from langchain_openai import OpenAIEmbeddings

collection_name = "test_index"
embedding = OpenAIEmbeddings()
vectorstore = Milvus(embedding_function=embedding, collection_name=collection_name)

namespace = f"milvus/{collection_name}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)
record_manager.create_schema()

doc1 = Document(page_content="kitty", metadata={"source": "kitty.txt"})
doc2 = Document(page_content="doggy", metadata={"source": "doggy.txt"})

index(
    [doc1, doc1, doc2],
    record_manager,
    vectorstore,
    cleanup="incremental",  # None, "incremental", or "full"
    source_id_key="source",
)
```

# Fix issues

Fix https://github.com/milvus-io/milvus/issues/30112

---------

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
---
 .../modules/data_connection/indexing.ipynb    |  2 +-
 .../vectorstores/milvus.py                    | 90 ++++++++++++++++---
 .../vectorstores/zilliz.py                    | 13 ++-
 .../vectorstores/test_milvus.py               | 30 ++++++-
 .../vectorstores/test_indexing_docs.py        |  2 +
 5 files changed, 124 insertions(+), 13 deletions(-)

diff --git a/docs/docs/modules/data_connection/indexing.ipynb b/docs/docs/modules/data_connection/indexing.ipynb
index fe0a9a0a263..b888a77958b 100644
--- a/docs/docs/modules/data_connection/indexing.ipynb
+++ b/docs/docs/modules/data_connection/indexing.ipynb
@@ -60,7 +60,7 @@
     "   * document addition by id (`add_documents` method with `ids` argument)\n",
     "   * delete by id (`delete` method with `ids` argument)\n",
     "\n",
-    "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `MyScale`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n",
+    "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n",
     "  \n",
     "## Caution\n",
     "\n",
diff --git a/libs/community/langchain_community/vectorstores/milvus.py b/libs/community/langchain_community/vectorstores/milvus.py
index ed751271e0f..e72a82f2bd2 100644
--- a/libs/community/langchain_community/vectorstores/milvus.py
+++ b/libs/community/langchain_community/vectorstores/milvus.py
@@ -56,6 +56,9 @@ class Milvus(VectorStore):
             default of index.
         drop_old (Optional[bool]): Whether to drop the current collection. Defaults
             to False.
+        auto_id (bool): Whether to enable auto id for primary key. Defaults to False.
+            If False, you needs to provide text ids (string less than 65535 bytes).
+            If True, Milvus will generate unique integers as primary keys.
         primary_field (str): Name of the primary key field. Defaults to "pk".
         text_field (str): Name of the text field. Defaults to "text".
         vector_field (str): Name of the vector field. Defaults to "vector".
@@ -102,6 +105,7 @@ class Milvus(VectorStore):
             embedding_function = Embeddings,
             collection_name = "LangChainCollection",
             drop_old = True,
+            auto_id = True
         )
 
     Raises:
@@ -119,6 +123,7 @@ class Milvus(VectorStore):
         index_params: Optional[dict] = None,
         search_params: Optional[dict] = None,
         drop_old: Optional[bool] = False,
+        auto_id: bool = False,
         *,
         primary_field: str = "pk",
         text_field: str = "text",
@@ -159,8 +164,9 @@ class Milvus(VectorStore):
         self.index_params = index_params
         self.search_params = search_params
         self.consistency_level = consistency_level
+        self.auto_id = auto_id
 
-        # In order for a collection to be compatible, pk needs to be auto'id and int
+        # In order for a collection to be compatible, pk needs to be varchar
         self._primary_field = primary_field
         # In order for compatibility, the text field will need to be called "text"
         self._text_field = text_field
@@ -327,11 +333,22 @@ class Milvus(VectorStore):
             FieldSchema(self._text_field, DataType.VARCHAR, max_length=65_535)
         )
         # Create the primary key field
-        fields.append(
-            FieldSchema(
-                self._primary_field, DataType.INT64, is_primary=True, auto_id=True
+        if self.auto_id:
+            fields.append(
+                FieldSchema(
+                    self._primary_field, DataType.INT64, is_primary=True, auto_id=True
+                )
+            )
+        else:
+            fields.append(
+                FieldSchema(
+                    self._primary_field,
+                    DataType.VARCHAR,
+                    is_primary=True,
+                    auto_id=False,
+                    max_length=65_535,
+                )
             )
-        )
         # Create the vector field, supports binary or float vectors
         fields.append(
             FieldSchema(self._vector_field, infer_dtype_bydata(embeddings[0]), dim=dim)
@@ -369,8 +386,6 @@ class Milvus(VectorStore):
             schema = self.col.schema
             for x in schema.fields:
                 self.fields.append(x.name)
-            # Since primary field is auto-id, no need to track it
-            self.fields.remove(self._primary_field)
 
     def _get_index(self) -> Optional[dict[str, Any]]:
         """Return the vector index information if it exists"""
@@ -467,6 +482,8 @@ class Milvus(VectorStore):
         metadatas: Optional[List[dict]] = None,
         timeout: Optional[int] = None,
         batch_size: int = 1000,
+        *,
+        ids: Optional[List[str]] = None,
         **kwargs: Any,
     ) -> List[str]:
         """Insert text data into Milvus.
@@ -483,10 +500,12 @@ class Milvus(VectorStore):
                 that they all fit in memory.
             metadatas (Optional[List[dict]]): Metadata dicts attached to each of
                 the texts. Defaults to None.
+            should be less than 65535 bytes. Required and work when auto_id is False.
             timeout (Optional[int]): Timeout for each batch insert. Defaults
                 to None.
             batch_size (int, optional): Batch size to use for insertion.
                 Defaults to 1000.
+            ids (Optional[List[str]]): List of text ids. The length of each item
 
         Raises:
             MilvusException: Failure to add texts
@@ -497,6 +516,16 @@ class Milvus(VectorStore):
         from pymilvus import Collection, MilvusException
 
         texts = list(texts)
+        if not self.auto_id:
+            assert isinstance(
+                ids, list
+            ), "A list of valid ids are required when auto_id is False."
+            assert len(set(ids)) == len(
+                texts
+            ), "Different lengths of texts and unique ids are provided."
+            assert all(
+                len(x.encode()) <= 65_535 for x in ids
+            ), "Each id should be a string less than 65535 bytes."
 
         try:
             embeddings = self.embedding_func.embed_documents(texts)
@@ -524,6 +553,9 @@ class Milvus(VectorStore):
             self._vector_field: embeddings,
         }
 
+        if not self.auto_id:
+            insert_dict[self._primary_field] = ids
+
         if self._metadata_field is not None:
             for d in metadatas:
                 insert_dict.setdefault(self._metadata_field, []).append(d)
@@ -532,7 +564,12 @@ class Milvus(VectorStore):
             if metadatas is not None:
                 for d in metadatas:
                     for key, value in d.items():
-                        if key in self.fields:
+                        keys = (
+                            [x for x in self.fields if x != self._primary_field]
+                            if self.auto_id
+                            else [x for x in self.fields]
+                        )
+                        for key in keys:
                             insert_dict.setdefault(key, []).append(value)
 
         # Total insert count
@@ -700,7 +737,7 @@ class Milvus(VectorStore):
             param = self.search_params
 
         # Determine result metadata fields.
-        output_fields = self.fields[:]
+        output_fields = [x for x in self.fields if x != self._primary_field]
         output_fields.remove(self._vector_field)
 
         # Perform the search.
@@ -864,6 +901,30 @@ class Milvus(VectorStore):
                 ret.append(documents[x])
         return ret
 
+    def delete(
+        self, ids: Optional[List[str]] = None, expr: Optional[str] = None, **kwargs: str
+    ):
+        """Delete by vector ID or boolean expression.
+        Refer to [Milvus documentation](https://milvus.io/docs/delete_data.md)
+        for notes and examples of expressions.
+
+        Args:
+            ids: List of ids to delete.
+            expr: Boolean expression that specifies the entities to delete.
+            kwargs: Other parameters in Milvus delete api.
+        """
+        if isinstance(ids, list) and len(ids) > 0:
+            expr = f"{self._primary_field} in {ids}"
+            if expr is not None:
+                logger.warning(
+                    "Both ids and expr are provided. " "Ignore expr and delete by ids."
+                )
+        else:
+            assert isinstance(
+                expr, str
+            ), "Either ids list or expr string must be provided."
+        return self.col.delete(expr=expr, **kwargs)
+
     @classmethod
     def from_texts(
         cls,
@@ -876,6 +937,8 @@ class Milvus(VectorStore):
         index_params: Optional[dict] = None,
         search_params: Optional[dict] = None,
         drop_old: bool = False,
+        *,
+        ids: Optional[List[str]] = None,
         **kwargs: Any,
     ) -> Milvus:
         """Create a Milvus collection, indexes it with HNSW, and insert data.
@@ -897,10 +960,16 @@ class Milvus(VectorStore):
                 Defaults to None.
             drop_old (Optional[bool], optional): Whether to drop the collection with
                 that name if it exists. Defaults to False.
+            ids (Optional[List[str]]): List of text ids. Defaults to None.
 
         Returns:
             Milvus: Milvus Vector Store
         """
+        if isinstance(ids, list) and len(ids) > 0:
+            auto_id = False
+        else:
+            auto_id = True
+
         vector_db = cls(
             embedding_function=embedding,
             collection_name=collection_name,
@@ -909,9 +978,10 @@ class Milvus(VectorStore):
             index_params=index_params,
             search_params=search_params,
             drop_old=drop_old,
+            auto_id=auto_id,
             **kwargs,
         )
-        vector_db.add_texts(texts=texts, metadatas=metadatas)
+        vector_db.add_texts(texts=texts, metadatas=metadatas, ids=ids)
         return vector_db
 
     def _parse_document(self, data: dict) -> Document:
diff --git a/libs/community/langchain_community/vectorstores/zilliz.py b/libs/community/langchain_community/vectorstores/zilliz.py
index c66b9294e80..c6da0e86697 100644
--- a/libs/community/langchain_community/vectorstores/zilliz.py
+++ b/libs/community/langchain_community/vectorstores/zilliz.py
@@ -36,6 +36,9 @@ class Zilliz(Milvus):
             default of index.
         drop_old (Optional[bool]): Whether to drop the current collection. Defaults
             to False.
+        auto_id (bool): Whether to enable auto id for primary key. Defaults to False.
+            If False, you needs to provide text ids (string less than 65535 bytes).
+            If True, Milvus will generate unique integers as primary keys.
 
     The connection args used for this class comes in the form of a dict,
     here are a few of the options:
@@ -146,6 +149,9 @@ class Zilliz(Milvus):
         index_params: Optional[dict] = None,
         search_params: Optional[dict] = None,
         drop_old: bool = False,
+        *,
+        ids: Optional[List[str]] = None,
+        auto_id: bool = False,
         **kwargs: Any,
     ) -> Zilliz:
         """Create a Zilliz collection, indexes it with HNSW, and insert data.
@@ -167,6 +173,10 @@ class Zilliz(Milvus):
                 Defaults to None.
             drop_old (Optional[bool], optional): Whether to drop the collection with
                 that name if it exists. Defaults to False.
+            ids (Optional[List[str]]): List of text ids.
+            auto_id (bool): Whether to enable auto id for primary key. Defaults to
+                False. If False, you needs to provide text ids (string less than 65535
+                bytes). If True, Milvus will generate unique integers as primary keys.
 
         Returns:
             Zilliz: Zilliz Vector Store
@@ -179,7 +189,8 @@ class Zilliz(Milvus):
             index_params=index_params,
             search_params=search_params,
             drop_old=drop_old,
+            auto_id=auto_id,
             **kwargs,
         )
-        vector_db.add_texts(texts=texts, metadatas=metadatas)
+        vector_db.add_texts(texts=texts, metadatas=metadatas, ids=ids)
         return vector_db
diff --git a/libs/community/tests/integration_tests/vectorstores/test_milvus.py b/libs/community/tests/integration_tests/vectorstores/test_milvus.py
index d30972595ef..807edcdb6e4 100644
--- a/libs/community/tests/integration_tests/vectorstores/test_milvus.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_milvus.py
@@ -11,12 +11,15 @@ from tests.integration_tests.vectorstores.fake_embeddings import (
 
 
 def _milvus_from_texts(
-    metadatas: Optional[List[dict]] = None, drop: bool = True
+    metadatas: Optional[List[dict]] = None,
+    ids: Optional[List[str]] = None,
+    drop: bool = True,
 ) -> Milvus:
     return Milvus.from_texts(
         fake_texts,
         FakeEmbeddings(),
         metadatas=metadatas,
+        ids=ids,
         connection_args={"host": "127.0.0.1", "port": "19530"},
         drop_old=drop,
     )
@@ -29,6 +32,30 @@ def test_milvus() -> None:
     assert output == [Document(page_content="foo")]
 
 
+def test_milvus_with_metadata() -> None:
+    """Test with metadata"""
+    docsearch = _milvus_from_texts(metadatas=[{"label": "test"}] * len(fake_texts))
+    output = docsearch.similarity_search("foo", k=1)
+    assert output == [Document(page_content="foo", metadata={"label": "test"})]
+
+
+def test_milvus_with_id() -> None:
+    """Test with ids"""
+    ids = ["id_" + str(i) for i in range(len(fake_texts))]
+    docsearch = _milvus_from_texts(ids=ids)
+    output = docsearch.similarity_search("foo", k=1)
+    assert output == [Document(page_content="foo")]
+
+    output = docsearch.delete(ids=ids)
+    assert output.delete_count == len(fake_texts)
+
+    try:
+        ids = ["dup_id" for _ in fake_texts]
+        _milvus_from_texts(ids=ids)
+    except Exception as e:
+        assert isinstance(e, AssertionError)
+
+
 def test_milvus_with_score() -> None:
     """Test end to end construction and search with scores and IDs."""
     texts = ["foo", "bar", "baz"]
@@ -84,6 +111,7 @@ def test_milvus_no_drop() -> None:
 
 # if __name__ == "__main__":
 #     test_milvus()
+#     test_milvus_with_metadata()
 #     test_milvus_with_score()
 #     test_milvus_max_marginal_relevance_search()
 #     test_milvus_add_extra()
diff --git a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py
index 9a1e438d2f0..1232d6bb9a6 100644
--- a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py
+++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py
@@ -61,6 +61,7 @@ def test_compatible_vectorstore_documentation() -> None:
         "ElasticsearchStore",
         "FAISS",
         "HanaDB",
+        "Milvus",
         "MomentoVectorIndex",
         "MyScale",
         "PGVector",
@@ -78,6 +79,7 @@ def test_compatible_vectorstore_documentation() -> None:
         "VespaStore",
         "Weaviate",
         "ZepVectorStore",
+        "Zilliz",
         "Lantern",
     }
     assert compatible == documented