mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-22 14:49:29 +00:00
community[patch]: Milvus supports add & delete texts by ids (#16256)
# Description To support [langchain indexing](https://python.langchain.com/docs/modules/data_connection/indexing) as requested by users, vectorstore Milvus needs to support: - document addition by id (`add_documents` method with `ids` argument) - delete by id (`delete` method with `ids` argument) Example usage: ```python from langchain.indexes import SQLRecordManager, index from langchain.schema import Document from langchain_community.vectorstores import Milvus from langchain_openai import OpenAIEmbeddings collection_name = "test_index" embedding = OpenAIEmbeddings() vectorstore = Milvus(embedding_function=embedding, collection_name=collection_name) namespace = f"milvus/{collection_name}" record_manager = SQLRecordManager( namespace, db_url="sqlite:///record_manager_cache.sql" ) record_manager.create_schema() doc1 = Document(page_content="kitty", metadata={"source": "kitty.txt"}) doc2 = Document(page_content="doggy", metadata={"source": "doggy.txt"}) index( [doc1, doc1, doc2], record_manager, vectorstore, cleanup="incremental", # None, "incremental", or "full" source_id_key="source", ) ``` # Fix issues Fix https://github.com/milvus-io/milvus/issues/30112 --------- Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
e9d3527b79
commit
a1aa3a657c
@ -60,7 +60,7 @@
|
||||
" * document addition by id (`add_documents` method with `ids` argument)\n",
|
||||
" * delete by id (`delete` method with `ids` argument)\n",
|
||||
"\n",
|
||||
"Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `MyScale`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n",
|
||||
"Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n",
|
||||
" \n",
|
||||
"## Caution\n",
|
||||
"\n",
|
||||
|
@ -56,6 +56,9 @@ class Milvus(VectorStore):
|
||||
default of index.
|
||||
drop_old (Optional[bool]): Whether to drop the current collection. Defaults
|
||||
to False.
|
||||
auto_id (bool): Whether to enable auto id for primary key. Defaults to False.
|
||||
If False, you needs to provide text ids (string less than 65535 bytes).
|
||||
If True, Milvus will generate unique integers as primary keys.
|
||||
primary_field (str): Name of the primary key field. Defaults to "pk".
|
||||
text_field (str): Name of the text field. Defaults to "text".
|
||||
vector_field (str): Name of the vector field. Defaults to "vector".
|
||||
@ -102,6 +105,7 @@ class Milvus(VectorStore):
|
||||
embedding_function = Embeddings,
|
||||
collection_name = "LangChainCollection",
|
||||
drop_old = True,
|
||||
auto_id = True
|
||||
)
|
||||
|
||||
Raises:
|
||||
@ -119,6 +123,7 @@ class Milvus(VectorStore):
|
||||
index_params: Optional[dict] = None,
|
||||
search_params: Optional[dict] = None,
|
||||
drop_old: Optional[bool] = False,
|
||||
auto_id: bool = False,
|
||||
*,
|
||||
primary_field: str = "pk",
|
||||
text_field: str = "text",
|
||||
@ -159,8 +164,9 @@ class Milvus(VectorStore):
|
||||
self.index_params = index_params
|
||||
self.search_params = search_params
|
||||
self.consistency_level = consistency_level
|
||||
self.auto_id = auto_id
|
||||
|
||||
# In order for a collection to be compatible, pk needs to be auto'id and int
|
||||
# In order for a collection to be compatible, pk needs to be varchar
|
||||
self._primary_field = primary_field
|
||||
# In order for compatibility, the text field will need to be called "text"
|
||||
self._text_field = text_field
|
||||
@ -327,11 +333,22 @@ class Milvus(VectorStore):
|
||||
FieldSchema(self._text_field, DataType.VARCHAR, max_length=65_535)
|
||||
)
|
||||
# Create the primary key field
|
||||
fields.append(
|
||||
FieldSchema(
|
||||
self._primary_field, DataType.INT64, is_primary=True, auto_id=True
|
||||
if self.auto_id:
|
||||
fields.append(
|
||||
FieldSchema(
|
||||
self._primary_field, DataType.INT64, is_primary=True, auto_id=True
|
||||
)
|
||||
)
|
||||
else:
|
||||
fields.append(
|
||||
FieldSchema(
|
||||
self._primary_field,
|
||||
DataType.VARCHAR,
|
||||
is_primary=True,
|
||||
auto_id=False,
|
||||
max_length=65_535,
|
||||
)
|
||||
)
|
||||
)
|
||||
# Create the vector field, supports binary or float vectors
|
||||
fields.append(
|
||||
FieldSchema(self._vector_field, infer_dtype_bydata(embeddings[0]), dim=dim)
|
||||
@ -369,8 +386,6 @@ class Milvus(VectorStore):
|
||||
schema = self.col.schema
|
||||
for x in schema.fields:
|
||||
self.fields.append(x.name)
|
||||
# Since primary field is auto-id, no need to track it
|
||||
self.fields.remove(self._primary_field)
|
||||
|
||||
def _get_index(self) -> Optional[dict[str, Any]]:
|
||||
"""Return the vector index information if it exists"""
|
||||
@ -467,6 +482,8 @@ class Milvus(VectorStore):
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
timeout: Optional[int] = None,
|
||||
batch_size: int = 1000,
|
||||
*,
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""Insert text data into Milvus.
|
||||
@ -483,10 +500,12 @@ class Milvus(VectorStore):
|
||||
that they all fit in memory.
|
||||
metadatas (Optional[List[dict]]): Metadata dicts attached to each of
|
||||
the texts. Defaults to None.
|
||||
should be less than 65535 bytes. Required and work when auto_id is False.
|
||||
timeout (Optional[int]): Timeout for each batch insert. Defaults
|
||||
to None.
|
||||
batch_size (int, optional): Batch size to use for insertion.
|
||||
Defaults to 1000.
|
||||
ids (Optional[List[str]]): List of text ids. The length of each item
|
||||
|
||||
Raises:
|
||||
MilvusException: Failure to add texts
|
||||
@ -497,6 +516,16 @@ class Milvus(VectorStore):
|
||||
from pymilvus import Collection, MilvusException
|
||||
|
||||
texts = list(texts)
|
||||
if not self.auto_id:
|
||||
assert isinstance(
|
||||
ids, list
|
||||
), "A list of valid ids are required when auto_id is False."
|
||||
assert len(set(ids)) == len(
|
||||
texts
|
||||
), "Different lengths of texts and unique ids are provided."
|
||||
assert all(
|
||||
len(x.encode()) <= 65_535 for x in ids
|
||||
), "Each id should be a string less than 65535 bytes."
|
||||
|
||||
try:
|
||||
embeddings = self.embedding_func.embed_documents(texts)
|
||||
@ -524,6 +553,9 @@ class Milvus(VectorStore):
|
||||
self._vector_field: embeddings,
|
||||
}
|
||||
|
||||
if not self.auto_id:
|
||||
insert_dict[self._primary_field] = ids
|
||||
|
||||
if self._metadata_field is not None:
|
||||
for d in metadatas:
|
||||
insert_dict.setdefault(self._metadata_field, []).append(d)
|
||||
@ -532,7 +564,12 @@ class Milvus(VectorStore):
|
||||
if metadatas is not None:
|
||||
for d in metadatas:
|
||||
for key, value in d.items():
|
||||
if key in self.fields:
|
||||
keys = (
|
||||
[x for x in self.fields if x != self._primary_field]
|
||||
if self.auto_id
|
||||
else [x for x in self.fields]
|
||||
)
|
||||
for key in keys:
|
||||
insert_dict.setdefault(key, []).append(value)
|
||||
|
||||
# Total insert count
|
||||
@ -700,7 +737,7 @@ class Milvus(VectorStore):
|
||||
param = self.search_params
|
||||
|
||||
# Determine result metadata fields.
|
||||
output_fields = self.fields[:]
|
||||
output_fields = [x for x in self.fields if x != self._primary_field]
|
||||
output_fields.remove(self._vector_field)
|
||||
|
||||
# Perform the search.
|
||||
@ -864,6 +901,30 @@ class Milvus(VectorStore):
|
||||
ret.append(documents[x])
|
||||
return ret
|
||||
|
||||
def delete(
|
||||
self, ids: Optional[List[str]] = None, expr: Optional[str] = None, **kwargs: str
|
||||
):
|
||||
"""Delete by vector ID or boolean expression.
|
||||
Refer to [Milvus documentation](https://milvus.io/docs/delete_data.md)
|
||||
for notes and examples of expressions.
|
||||
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
expr: Boolean expression that specifies the entities to delete.
|
||||
kwargs: Other parameters in Milvus delete api.
|
||||
"""
|
||||
if isinstance(ids, list) and len(ids) > 0:
|
||||
expr = f"{self._primary_field} in {ids}"
|
||||
if expr is not None:
|
||||
logger.warning(
|
||||
"Both ids and expr are provided. " "Ignore expr and delete by ids."
|
||||
)
|
||||
else:
|
||||
assert isinstance(
|
||||
expr, str
|
||||
), "Either ids list or expr string must be provided."
|
||||
return self.col.delete(expr=expr, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
cls,
|
||||
@ -876,6 +937,8 @@ class Milvus(VectorStore):
|
||||
index_params: Optional[dict] = None,
|
||||
search_params: Optional[dict] = None,
|
||||
drop_old: bool = False,
|
||||
*,
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Milvus:
|
||||
"""Create a Milvus collection, indexes it with HNSW, and insert data.
|
||||
@ -897,10 +960,16 @@ class Milvus(VectorStore):
|
||||
Defaults to None.
|
||||
drop_old (Optional[bool], optional): Whether to drop the collection with
|
||||
that name if it exists. Defaults to False.
|
||||
ids (Optional[List[str]]): List of text ids. Defaults to None.
|
||||
|
||||
Returns:
|
||||
Milvus: Milvus Vector Store
|
||||
"""
|
||||
if isinstance(ids, list) and len(ids) > 0:
|
||||
auto_id = False
|
||||
else:
|
||||
auto_id = True
|
||||
|
||||
vector_db = cls(
|
||||
embedding_function=embedding,
|
||||
collection_name=collection_name,
|
||||
@ -909,9 +978,10 @@ class Milvus(VectorStore):
|
||||
index_params=index_params,
|
||||
search_params=search_params,
|
||||
drop_old=drop_old,
|
||||
auto_id=auto_id,
|
||||
**kwargs,
|
||||
)
|
||||
vector_db.add_texts(texts=texts, metadatas=metadatas)
|
||||
vector_db.add_texts(texts=texts, metadatas=metadatas, ids=ids)
|
||||
return vector_db
|
||||
|
||||
def _parse_document(self, data: dict) -> Document:
|
||||
|
@ -36,6 +36,9 @@ class Zilliz(Milvus):
|
||||
default of index.
|
||||
drop_old (Optional[bool]): Whether to drop the current collection. Defaults
|
||||
to False.
|
||||
auto_id (bool): Whether to enable auto id for primary key. Defaults to False.
|
||||
If False, you needs to provide text ids (string less than 65535 bytes).
|
||||
If True, Milvus will generate unique integers as primary keys.
|
||||
|
||||
The connection args used for this class comes in the form of a dict,
|
||||
here are a few of the options:
|
||||
@ -146,6 +149,9 @@ class Zilliz(Milvus):
|
||||
index_params: Optional[dict] = None,
|
||||
search_params: Optional[dict] = None,
|
||||
drop_old: bool = False,
|
||||
*,
|
||||
ids: Optional[List[str]] = None,
|
||||
auto_id: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> Zilliz:
|
||||
"""Create a Zilliz collection, indexes it with HNSW, and insert data.
|
||||
@ -167,6 +173,10 @@ class Zilliz(Milvus):
|
||||
Defaults to None.
|
||||
drop_old (Optional[bool], optional): Whether to drop the collection with
|
||||
that name if it exists. Defaults to False.
|
||||
ids (Optional[List[str]]): List of text ids.
|
||||
auto_id (bool): Whether to enable auto id for primary key. Defaults to
|
||||
False. If False, you needs to provide text ids (string less than 65535
|
||||
bytes). If True, Milvus will generate unique integers as primary keys.
|
||||
|
||||
Returns:
|
||||
Zilliz: Zilliz Vector Store
|
||||
@ -179,7 +189,8 @@ class Zilliz(Milvus):
|
||||
index_params=index_params,
|
||||
search_params=search_params,
|
||||
drop_old=drop_old,
|
||||
auto_id=auto_id,
|
||||
**kwargs,
|
||||
)
|
||||
vector_db.add_texts(texts=texts, metadatas=metadatas)
|
||||
vector_db.add_texts(texts=texts, metadatas=metadatas, ids=ids)
|
||||
return vector_db
|
||||
|
@ -11,12 +11,15 @@ from tests.integration_tests.vectorstores.fake_embeddings import (
|
||||
|
||||
|
||||
def _milvus_from_texts(
|
||||
metadatas: Optional[List[dict]] = None, drop: bool = True
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
drop: bool = True,
|
||||
) -> Milvus:
|
||||
return Milvus.from_texts(
|
||||
fake_texts,
|
||||
FakeEmbeddings(),
|
||||
metadatas=metadatas,
|
||||
ids=ids,
|
||||
connection_args={"host": "127.0.0.1", "port": "19530"},
|
||||
drop_old=drop,
|
||||
)
|
||||
@ -29,6 +32,30 @@ def test_milvus() -> None:
|
||||
assert output == [Document(page_content="foo")]
|
||||
|
||||
|
||||
def test_milvus_with_metadata() -> None:
|
||||
"""Test with metadata"""
|
||||
docsearch = _milvus_from_texts(metadatas=[{"label": "test"}] * len(fake_texts))
|
||||
output = docsearch.similarity_search("foo", k=1)
|
||||
assert output == [Document(page_content="foo", metadata={"label": "test"})]
|
||||
|
||||
|
||||
def test_milvus_with_id() -> None:
|
||||
"""Test with ids"""
|
||||
ids = ["id_" + str(i) for i in range(len(fake_texts))]
|
||||
docsearch = _milvus_from_texts(ids=ids)
|
||||
output = docsearch.similarity_search("foo", k=1)
|
||||
assert output == [Document(page_content="foo")]
|
||||
|
||||
output = docsearch.delete(ids=ids)
|
||||
assert output.delete_count == len(fake_texts)
|
||||
|
||||
try:
|
||||
ids = ["dup_id" for _ in fake_texts]
|
||||
_milvus_from_texts(ids=ids)
|
||||
except Exception as e:
|
||||
assert isinstance(e, AssertionError)
|
||||
|
||||
|
||||
def test_milvus_with_score() -> None:
|
||||
"""Test end to end construction and search with scores and IDs."""
|
||||
texts = ["foo", "bar", "baz"]
|
||||
@ -84,6 +111,7 @@ def test_milvus_no_drop() -> None:
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# test_milvus()
|
||||
# test_milvus_with_metadata()
|
||||
# test_milvus_with_score()
|
||||
# test_milvus_max_marginal_relevance_search()
|
||||
# test_milvus_add_extra()
|
||||
|
@ -61,6 +61,7 @@ def test_compatible_vectorstore_documentation() -> None:
|
||||
"ElasticsearchStore",
|
||||
"FAISS",
|
||||
"HanaDB",
|
||||
"Milvus",
|
||||
"MomentoVectorIndex",
|
||||
"MyScale",
|
||||
"PGVector",
|
||||
@ -78,6 +79,7 @@ def test_compatible_vectorstore_documentation() -> None:
|
||||
"VespaStore",
|
||||
"Weaviate",
|
||||
"ZepVectorStore",
|
||||
"Zilliz",
|
||||
"Lantern",
|
||||
}
|
||||
assert compatible == documented
|
||||
|
Loading…
Reference in New Issue
Block a user