community[patch]: Milvus supports add & delete texts by ids (#16256)

# Description To support [langchain indexing](https://python.langchain.com/docs/modules/data_connection/indexing) as requested by users, vectorstore Milvus needs to support: - document addition by id (`add_documents` method with `ids` argument) - delete by id (`delete` method with `ids` argument) Example usage: ```python from langchain.indexes import SQLRecordManager, index from langchain.schema import Document from langchain_community.vectorstores import Milvus from langchain_openai import OpenAIEmbeddings collection_name = "test_index" embedding = OpenAIEmbeddings() vectorstore = Milvus(embedding_function=embedding, collection_name=collection_name) namespace = f"milvus/{collection_name}" record_manager = SQLRecordManager( namespace, db_url="sqlite:///record_manager_cache.sql" ) record_manager.create_schema() doc1 = Document(page_content="kitty", metadata={"source": "kitty.txt"}) doc2 = Document(page_content="doggy", metadata={"source": "doggy.txt"}) index( [doc1, doc1, doc2], record_manager, vectorstore, cleanup="incremental", # None, "incremental", or "full" source_id_key="source", ) ``` # Fix issues Fix https://github.com/milvus-io/milvus/issues/30112 --------- Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
2025-06-22 14:49:29 +00:00 · 2024-01-30 03:19:50 +08:00 · 2024-01-30 03:19:50 +08:00 · a1aa3a657c
commit a1aa3a657c
parent e9d3527b79
5 changed files with 124 additions and 13 deletions
--- a/docs/docs/modules/data_connection/indexing.ipynb
+++ b/docs/docs/modules/data_connection/indexing.ipynb
@ -60,7 +60,7 @@
    "   * document addition by id (`add_documents` method with `ids` argument)\n",
    "   * delete by id (`delete` method with `ids` argument)\n",
    "\n",
-    "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `MyScale`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n",
+    "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n",
    "  \n",
    "## Caution\n",
    "\n",
--- a/libs/community/langchain_community/vectorstores/milvus.py
+++ b/libs/community/langchain_community/vectorstores/milvus.py
@ -56,6 +56,9 @@ class Milvus(VectorStore):
            default of index.
        drop_old (Optional[bool]): Whether to drop the current collection. Defaults
            to False.
        auto_id (bool): Whether to enable auto id for primary key. Defaults to False.
            If False, you need to provide text ids (string less than 65535 bytes).
            If True, Milvus will generate unique integers as primary keys.
        primary_field (str): Name of the primary key field. Defaults to "pk".
        text_field (str): Name of the text field. Defaults to "text".
        vector_field (str): Name of the vector field. Defaults to "vector".
@ -102,6 +105,7 @@ class Milvus(VectorStore):
            embedding_function = Embeddings,
            collection_name = "LangChainCollection",
            drop_old = True,
            auto_id = True
        )
    Raises:
@ -119,6 +123,7 @@ class Milvus(VectorStore):
        index_params: Optional[dict] = None,
        search_params: Optional[dict] = None,
        drop_old: Optional[bool] = False,
        auto_id: bool = False,
        *,
        primary_field: str = "pk",
        text_field: str = "text",
@ -159,8 +164,9 @@ class Milvus(VectorStore):
        self.index_params = index_params
        self.search_params = search_params
        self.consistency_level = consistency_level
        self.auto_id = auto_id
-        # In order for a collection to be compatible, pk needs to be auto'id and int
+        # In order for a collection to be compatible, pk needs to be varchar
        self._primary_field = primary_field
        # In order for compatibility, the text field will need to be called "text"
        self._text_field = text_field
@ -327,11 +333,22 @@ class Milvus(VectorStore):
            FieldSchema(self._text_field, DataType.VARCHAR, max_length=65_535)
        )
        # Create the primary key field
-        fields.append(
+        if self.auto_id:
-            FieldSchema(
+            fields.append(
-                self._primary_field, DataType.INT64, is_primary=True, auto_id=True
+                FieldSchema(
                    self._primary_field, DataType.INT64, is_primary=True, auto_id=True
                )
            )
        else:
            fields.append(
                FieldSchema(
                    self._primary_field,
                    DataType.VARCHAR,
                    is_primary=True,
                    auto_id=False,
                    max_length=65_535,
                )
            )
        )
        # Create the vector field, supports binary or float vectors
        fields.append(
            FieldSchema(self._vector_field, infer_dtype_bydata(embeddings[0]), dim=dim)
@ -369,8 +386,6 @@ class Milvus(VectorStore):
            schema = self.col.schema
            for x in schema.fields:
                self.fields.append(x.name)
            # Since primary field is auto-id, no need to track it
            self.fields.remove(self._primary_field)
    def _get_index(self) -> Optional[dict[str, Any]]:
        """Return the vector index information if it exists"""
@ -467,6 +482,8 @@ class Milvus(VectorStore):
        metadatas: Optional[List[dict]] = None,
        timeout: Optional[int] = None,
        batch_size: int = 1000,
        *,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Insert text data into Milvus.
@ -483,10 +500,12 @@ class Milvus(VectorStore):
                that they all fit in memory.
            metadatas (Optional[List[dict]]): Metadata dicts attached to each of
                the texts. Defaults to None.
            timeout (Optional[int]): Timeout for each batch insert. Defaults
                to None.
            batch_size (int, optional): Batch size to use for insertion.
                Defaults to 1000.
            ids (Optional[List[str]]): List of text ids. The length of each item
                should be less than 65535 bytes. Required and only used when
                auto_id is False.
        Raises:
            MilvusException: Failure to add texts
@ -497,6 +516,16 @@ class Milvus(VectorStore):
        from pymilvus import Collection, MilvusException
        texts = list(texts)
        if not self.auto_id:
            assert isinstance(
                ids, list
            ), "A list of valid ids are required when auto_id is False."
            assert len(set(ids)) == len(
                texts
            ), "Different lengths of texts and unique ids are provided."
            assert all(
                len(x.encode()) <= 65_535 for x in ids
            ), "Each id should be a string less than 65535 bytes."
        try:
            embeddings = self.embedding_func.embed_documents(texts)
@ -524,6 +553,9 @@ class Milvus(VectorStore):
            self._vector_field: embeddings,
        }
        if not self.auto_id:
            insert_dict[self._primary_field] = ids
        if self._metadata_field is not None:
            for d in metadatas:
                insert_dict.setdefault(self._metadata_field, []).append(d)
@ -532,7 +564,12 @@ class Milvus(VectorStore):
            if metadatas is not None:
                for d in metadatas:
                    for key, value in d.items():
-                        if key in self.fields:
+                        keys = (
                            [x for x in self.fields if x != self._primary_field]
                            if self.auto_id
                            else [x for x in self.fields]
                        )
                        for key in keys:
                            insert_dict.setdefault(key, []).append(value)
        # Total insert count
@ -700,7 +737,7 @@ class Milvus(VectorStore):
            param = self.search_params
        # Determine result metadata fields.
-        output_fields = self.fields[:]
+        output_fields = [x for x in self.fields if x != self._primary_field]
        output_fields.remove(self._vector_field)
        # Perform the search.
@ -864,6 +901,30 @@ class Milvus(VectorStore):
                ret.append(documents[x])
        return ret
    def delete(
        self, ids: Optional[List[str]] = None, expr: Optional[str] = None, **kwargs: str
    ):
        """Delete by vector ID or boolean expression.

        Refer to [Milvus documentation](https://milvus.io/docs/delete_data.md)
        for notes and examples of expressions.

        Args:
            ids: List of ids to delete. A non-empty list takes precedence over
                ``expr``.
            expr: Boolean expression that specifies the entities to delete.
                Only used when ``ids`` is missing or empty.
            kwargs: Other parameters in Milvus delete api.

        Returns:
            The value returned by ``self.col.delete``.

        Raises:
            AssertionError: If no non-empty ``ids`` list is given and ``expr``
                is not a string.
        """
        if isinstance(ids, list) and len(ids) > 0:
            # Inspect the caller-supplied expr *before* replacing it; checking
            # after the assignment would make the warning fire on every
            # ids-based delete, even when no expr was passed.
            if expr is not None:
                logger.warning(
                    "Both ids and expr are provided. " "Ignore expr and delete by ids."
                )
            expr = f"{self._primary_field} in {ids}"
        else:
            assert isinstance(
                expr, str
            ), "Either ids list or expr string must be provided."
        return self.col.delete(expr=expr, **kwargs)
    @classmethod
    def from_texts(
        cls,
@ -876,6 +937,8 @@ class Milvus(VectorStore):
        index_params: Optional[dict] = None,
        search_params: Optional[dict] = None,
        drop_old: bool = False,
        *,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Milvus:
        """Create a Milvus collection, indexes it with HNSW, and insert data.
@ -897,10 +960,16 @@ class Milvus(VectorStore):
                Defaults to None.
            drop_old (Optional[bool], optional): Whether to drop the collection with
                that name if it exists. Defaults to False.
            ids (Optional[List[str]]): List of text ids. Defaults to None.
        Returns:
            Milvus: Milvus Vector Store
        """
        if isinstance(ids, list) and len(ids) > 0:
            auto_id = False
        else:
            auto_id = True
        vector_db = cls(
            embedding_function=embedding,
            collection_name=collection_name,
@ -909,9 +978,10 @@ class Milvus(VectorStore):
            index_params=index_params,
            search_params=search_params,
            drop_old=drop_old,
            auto_id=auto_id,
            **kwargs,
        )
-        vector_db.add_texts(texts=texts, metadatas=metadatas)
+        vector_db.add_texts(texts=texts, metadatas=metadatas, ids=ids)
        return vector_db
    def _parse_document(self, data: dict) -> Document:
--- a/libs/community/langchain_community/vectorstores/zilliz.py
+++ b/libs/community/langchain_community/vectorstores/zilliz.py
@ -36,6 +36,9 @@ class Zilliz(Milvus):
            default of index.
        drop_old (Optional[bool]): Whether to drop the current collection. Defaults
            to False.
        auto_id (bool): Whether to enable auto id for primary key. Defaults to False.
            If False, you need to provide text ids (string less than 65535 bytes).
            If True, Milvus will generate unique integers as primary keys.
    The connection args used for this class comes in the form of a dict,
    here are a few of the options:
@ -146,6 +149,9 @@ class Zilliz(Milvus):
        index_params: Optional[dict] = None,
        search_params: Optional[dict] = None,
        drop_old: bool = False,
        *,
        ids: Optional[List[str]] = None,
        auto_id: bool = False,
        **kwargs: Any,
    ) -> Zilliz:
        """Create a Zilliz collection, indexes it with HNSW, and insert data.
@ -167,6 +173,10 @@ class Zilliz(Milvus):
                Defaults to None.
            drop_old (Optional[bool], optional): Whether to drop the collection with
                that name if it exists. Defaults to False.
            ids (Optional[List[str]]): List of text ids.
            auto_id (bool): Whether to enable auto id for primary key. Defaults to
                False. If False, you need to provide text ids (string less than 65535
                bytes). If True, Milvus will generate unique integers as primary keys.
        Returns:
            Zilliz: Zilliz Vector Store
@ -179,7 +189,8 @@ class Zilliz(Milvus):
            index_params=index_params,
            search_params=search_params,
            drop_old=drop_old,
            auto_id=auto_id,
            **kwargs,
        )
-        vector_db.add_texts(texts=texts, metadatas=metadatas)
+        vector_db.add_texts(texts=texts, metadatas=metadatas, ids=ids)
        return vector_db
--- a/libs/community/tests/integration_tests/vectorstores/test_milvus.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_milvus.py
@ -11,12 +11,15 @@ from tests.integration_tests.vectorstores.fake_embeddings import (
 def _milvus_from_texts(
-    metadatas: Optional[List[dict]] = None, drop: bool = True
+    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    drop: bool = True,
 ) -> Milvus:
    return Milvus.from_texts(
        fake_texts,
        FakeEmbeddings(),
        metadatas=metadatas,
        ids=ids,
        connection_args={"host": "127.0.0.1", "port": "19530"},
        drop_old=drop,
    )
@ -29,6 +32,30 @@ def test_milvus() -> None:
    assert output == [Document(page_content="foo")]
 def test_milvus_with_metadata() -> None:
    """Metadata supplied at insert time is returned on the search hit."""
    # One identical metadata dict per fake text.
    label_metadata = [{"label": "test"}] * len(fake_texts)
    store = _milvus_from_texts(metadatas=label_metadata)
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo", metadata={"label": "test"})]
    assert hits == expected
 def test_milvus_with_id() -> None:
    """Test insertion, search and deletion with caller-provided ids.

    Also checks that building a store with duplicated ids is rejected with
    an AssertionError.
    """
    ids = ["id_" + str(i) for i in range(len(fake_texts))]
    docsearch = _milvus_from_texts(ids=ids)
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

    # Deleting by the same ids should remove every inserted text.
    output = docsearch.delete(ids=ids)
    assert output.delete_count == len(fake_texts)

    duplicate_ids = ["dup_id" for _ in fake_texts]
    try:
        _milvus_from_texts(ids=duplicate_ids)
    except Exception as e:
        assert isinstance(e, AssertionError)
    else:
        # Without this branch the test would pass silently when duplicated
        # ids are accepted and no exception is raised at all.
        raise AssertionError("Duplicated ids should raise an AssertionError.")
 def test_milvus_with_score() -> None:
    """Test end to end construction and search with scores and IDs."""
    texts = ["foo", "bar", "baz"]
@ -84,6 +111,7 @@ def test_milvus_no_drop() -> None:
 # if __name__ == "__main__":
 #     test_milvus()
 #     test_milvus_with_metadata()
 #     test_milvus_with_score()
 #     test_milvus_max_marginal_relevance_search()
 #     test_milvus_add_extra()
--- a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py
+++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py
@ -61,6 +61,7 @@ def test_compatible_vectorstore_documentation() -> None:
        "ElasticsearchStore",
        "FAISS",
        "HanaDB",
        "Milvus",
        "MomentoVectorIndex",
        "MyScale",
        "PGVector",
@ -78,6 +79,7 @@ def test_compatible_vectorstore_documentation() -> None:
        "VespaStore",
        "Weaviate",
        "ZepVectorStore",
        "Zilliz",
        "Lantern",
    }
    assert compatible == documented