mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-19 03:01:29 +00:00
partners[milvus]: add dynamic field (#24544)
add dynamic field feature to langchain_milvus more unittest, more robustic plan to deprecate the `metadata_field` in the future, because it's function is the same as `enable_dynamic_field`, but the latter one is a more advanced concept in milvus Signed-off-by: ChengZi <chen.zhang@zilliz.com> Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
20fe4deea0
commit
29a3b3a711
@ -130,24 +130,46 @@ class Milvus(VectorStore):
|
|||||||
drop_old (Optional[bool]): Whether to drop the current collection. Defaults
|
drop_old (Optional[bool]): Whether to drop the current collection. Defaults
|
||||||
to False.
|
to False.
|
||||||
auto_id (bool): Whether to enable auto id for primary key. Defaults to False.
|
auto_id (bool): Whether to enable auto id for primary key. Defaults to False.
|
||||||
If False, you needs to provide text ids (string less than 65535 bytes).
|
If False, you need to provide text ids (string less than 65535 bytes).
|
||||||
If True, Milvus will generate unique integers as primary keys.
|
If True, Milvus will generate unique integers as primary keys.
|
||||||
primary_field (str): Name of the primary key field. Defaults to "pk".
|
primary_field (str): Name of the primary key field. Defaults to "pk".
|
||||||
text_field (str): Name of the text field. Defaults to "text".
|
text_field (str): Name of the text field. Defaults to "text".
|
||||||
vector_field (str): Name of the vector field. Defaults to "vector".
|
vector_field (str): Name of the vector field. Defaults to "vector".
|
||||||
metadata_field (str): Name of the metadta field. Defaults to None.
|
enable_dynamic_field (Optional[bool]): Whether to enable
|
||||||
|
dynamic schema or not in Milvus. Defaults to False.
|
||||||
|
For more information about dynamic schema, please refer to
|
||||||
|
https://milvus.io/docs/enable-dynamic-field.md
|
||||||
|
metadata_field (str): Name of the metadata field. Defaults to None.
|
||||||
When metadata_field is specified,
|
When metadata_field is specified,
|
||||||
the document's metadata will store as json.
|
the document's metadata will store as json.
|
||||||
|
This argument is about to be deprecated,
|
||||||
|
because it can be replaced by setting `enable_dynamic_field`=True.
|
||||||
|
partition_key_field (Optional[str]): Name of the partition key field.
|
||||||
|
Defaults to None. For more information about partition key, please refer to
|
||||||
|
https://milvus.io/docs/use-partition-key.md#Use-Partition-Key
|
||||||
|
partition_names (Optional[list]): List of specific partition names.
|
||||||
|
Defaults to None. For more information about partition, please refer to
|
||||||
|
https://milvus.io/docs/manage-partitions.md#Manage-Partitions
|
||||||
|
replica_number (int): The number of replicas for the collection. Defaults to 1.
|
||||||
|
For more information about replica, please refer to
|
||||||
|
https://milvus.io/docs/replica.md#In-Memory-Replica
|
||||||
|
timeout (Optional[float]): The timeout for Milvus operations. Defaults to None.
|
||||||
|
An optional duration of time in seconds to allow for the RPCs.
|
||||||
|
If timeout is not set, the client keeps waiting until the server responds
|
||||||
|
or an error occurs.
|
||||||
|
num_shards (Optional[int]): The number of shards for the collection.
|
||||||
|
Defaults to None. For more information about shards, please refer to
|
||||||
|
https://milvus.io/docs/glossary.md#Shard
|
||||||
|
|
||||||
The connection args used for this class comes in the form of a dict,
|
The connection args used for this class comes in the form of a dict,
|
||||||
here are a few of the options:
|
here are a few of the options:
|
||||||
address (str): The actual address of Milvus
|
address (str): The actual address of Milvus
|
||||||
instance. Example address: "localhost:19530"
|
instance. Example address: "localhost:19530"
|
||||||
uri (str): The uri of Milvus instance. Example uri:
|
uri (str): The uri of Milvus instance. Example uri:
|
||||||
|
"path/to/local/directory/milvus_demo.db" for Milvus Lite.
|
||||||
"http://randomwebsite:19530",
|
"http://randomwebsite:19530",
|
||||||
"tcp:foobarsite:19530",
|
"tcp:foobarsite:19530",
|
||||||
"https://ok.s3.south.com:19530".
|
"https://ok.s3.south.com:19530".
|
||||||
or "path/to/local/directory/milvus_demo.db" for Milvus Lite.
|
|
||||||
host (str): The host of Milvus instance. Default at "localhost",
|
host (str): The host of Milvus instance. Default at "localhost",
|
||||||
PyMilvus will fill in the default host if only port is provided.
|
PyMilvus will fill in the default host if only port is provided.
|
||||||
port (str/int): The port of Milvus instance. Default at 19530, PyMilvus
|
port (str/int): The port of Milvus instance. Default at 19530, PyMilvus
|
||||||
@ -178,6 +200,7 @@ class Milvus(VectorStore):
|
|||||||
milvus_store = Milvus(
|
milvus_store = Milvus(
|
||||||
embedding_function = Embeddings,
|
embedding_function = Embeddings,
|
||||||
collection_name = "LangChainCollection",
|
collection_name = "LangChainCollection",
|
||||||
|
connection_args = {"uri": "./milvus_demo.db"},
|
||||||
drop_old = True,
|
drop_old = True,
|
||||||
auto_id = True
|
auto_id = True
|
||||||
)
|
)
|
||||||
@ -202,6 +225,7 @@ class Milvus(VectorStore):
|
|||||||
primary_field: str = "pk",
|
primary_field: str = "pk",
|
||||||
text_field: str = "text",
|
text_field: str = "text",
|
||||||
vector_field: str = "vector",
|
vector_field: str = "vector",
|
||||||
|
enable_dynamic_field: bool = False,
|
||||||
metadata_field: Optional[str] = None,
|
metadata_field: Optional[str] = None,
|
||||||
partition_key_field: Optional[str] = None,
|
partition_key_field: Optional[str] = None,
|
||||||
partition_names: Optional[list] = None,
|
partition_names: Optional[list] = None,
|
||||||
@ -260,6 +284,17 @@ class Milvus(VectorStore):
|
|||||||
self._text_field = text_field
|
self._text_field = text_field
|
||||||
# In order for compatibility, the vector field needs to be called "vector"
|
# In order for compatibility, the vector field needs to be called "vector"
|
||||||
self._vector_field = vector_field
|
self._vector_field = vector_field
|
||||||
|
if metadata_field:
|
||||||
|
logger.warning(
|
||||||
|
"DeprecationWarning: `metadata_field` is about to be deprecated, "
|
||||||
|
"please set `enable_dynamic_field`=True instead."
|
||||||
|
)
|
||||||
|
if enable_dynamic_field and metadata_field:
|
||||||
|
metadata_field = None
|
||||||
|
logger.warning(
|
||||||
|
"When `enable_dynamic_field` is True, `metadata_field` is ignored."
|
||||||
|
)
|
||||||
|
self.enable_dynamic_field = enable_dynamic_field
|
||||||
self._metadata_field = metadata_field
|
self._metadata_field = metadata_field
|
||||||
self._partition_key_field = partition_key_field
|
self._partition_key_field = partition_key_field
|
||||||
self.fields: list[str] = []
|
self.fields: list[str] = []
|
||||||
@ -389,13 +424,36 @@ class Milvus(VectorStore):
|
|||||||
# Determine embedding dim
|
# Determine embedding dim
|
||||||
dim = len(embeddings[0])
|
dim = len(embeddings[0])
|
||||||
fields = []
|
fields = []
|
||||||
if self._metadata_field is not None:
|
# If enable_dynamic_field, we don't need to create fields, and just pass it.
|
||||||
|
# In the future, when metadata_field is deprecated,
|
||||||
|
# This logical structure will be simplified like this:
|
||||||
|
# ```
|
||||||
|
# if not self.enable_dynamic_field and metadatas:
|
||||||
|
# for key, value in metadatas[0].items():
|
||||||
|
# ...
|
||||||
|
# ```
|
||||||
|
if self.enable_dynamic_field:
|
||||||
|
pass
|
||||||
|
elif self._metadata_field is not None:
|
||||||
fields.append(FieldSchema(self._metadata_field, DataType.JSON))
|
fields.append(FieldSchema(self._metadata_field, DataType.JSON))
|
||||||
else:
|
else:
|
||||||
# Determine metadata schema
|
# Determine metadata schema
|
||||||
if metadatas:
|
if metadatas:
|
||||||
# Create FieldSchema for each entry in metadata.
|
# Create FieldSchema for each entry in metadata.
|
||||||
for key, value in metadatas[0].items():
|
for key, value in metadatas[0].items():
|
||||||
|
if key in [
|
||||||
|
self._vector_field,
|
||||||
|
self._primary_field,
|
||||||
|
self._text_field,
|
||||||
|
]:
|
||||||
|
logger.error(
|
||||||
|
(
|
||||||
|
"Failure to create collection, "
|
||||||
|
"metadata key: %s is reserved."
|
||||||
|
),
|
||||||
|
key,
|
||||||
|
)
|
||||||
|
raise ValueError(f"Metadata key {key} is reserved.")
|
||||||
# Infer the corresponding datatype of the metadata
|
# Infer the corresponding datatype of the metadata
|
||||||
dtype = infer_dtype_bydata(value)
|
dtype = infer_dtype_bydata(value)
|
||||||
# Datatype isn't compatible
|
# Datatype isn't compatible
|
||||||
@ -408,7 +466,7 @@ class Milvus(VectorStore):
|
|||||||
key,
|
key,
|
||||||
)
|
)
|
||||||
raise ValueError(f"Unrecognized datatype for {key}.")
|
raise ValueError(f"Unrecognized datatype for {key}.")
|
||||||
# Dataype is a string/varchar equivalent
|
# Datatype is a string/varchar equivalent
|
||||||
elif dtype == DataType.VARCHAR:
|
elif dtype == DataType.VARCHAR:
|
||||||
fields.append(
|
fields.append(
|
||||||
FieldSchema(key, DataType.VARCHAR, max_length=65_535)
|
FieldSchema(key, DataType.VARCHAR, max_length=65_535)
|
||||||
@ -447,6 +505,7 @@ class Milvus(VectorStore):
|
|||||||
fields,
|
fields,
|
||||||
description=self.collection_description,
|
description=self.collection_description,
|
||||||
partition_key_field=self._partition_key_field,
|
partition_key_field=self._partition_key_field,
|
||||||
|
enable_dynamic_field=self.enable_dynamic_field,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create the collection
|
# Create the collection
|
||||||
@ -617,16 +676,26 @@ class Milvus(VectorStore):
|
|||||||
|
|
||||||
texts = list(texts)
|
texts = list(texts)
|
||||||
if not self.auto_id:
|
if not self.auto_id:
|
||||||
assert isinstance(
|
assert isinstance(ids, list), (
|
||||||
ids, list
|
"A list of valid ids are required when auto_id is False. "
|
||||||
), "A list of valid ids are required when auto_id is False."
|
"You can set `auto_id` to True in this Milvus instance to generate "
|
||||||
|
"ids automatically, or specify string-type ids for each text."
|
||||||
|
)
|
||||||
assert len(set(ids)) == len(
|
assert len(set(ids)) == len(
|
||||||
texts
|
texts
|
||||||
), "Different lengths of texts and unique ids are provided."
|
), "Different lengths of texts and unique ids are provided."
|
||||||
|
assert all(isinstance(x, str) for x in ids), "All ids should be strings."
|
||||||
assert all(
|
assert all(
|
||||||
len(x.encode()) <= 65_535 for x in ids
|
len(x.encode()) <= 65_535 for x in ids
|
||||||
), "Each id should be a string less than 65535 bytes."
|
), "Each id should be a string less than 65535 bytes."
|
||||||
|
|
||||||
|
else:
|
||||||
|
if ids is not None:
|
||||||
|
logger.warning(
|
||||||
|
"The ids parameter is ignored when auto_id is True. "
|
||||||
|
"The ids will be generated automatically."
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
embeddings = self.embedding_func.embed_documents(texts)
|
embeddings = self.embedding_func.embed_documents(texts)
|
||||||
except NotImplementedError:
|
except NotImplementedError:
|
||||||
@ -647,34 +716,39 @@ class Milvus(VectorStore):
|
|||||||
kwargs["timeout"] = self.timeout
|
kwargs["timeout"] = self.timeout
|
||||||
self._init(**kwargs)
|
self._init(**kwargs)
|
||||||
|
|
||||||
# Dict to hold all insert columns
|
insert_list: list[dict] = []
|
||||||
insert_dict: dict[str, list] = {
|
|
||||||
self._text_field: texts,
|
|
||||||
self._vector_field: embeddings,
|
|
||||||
}
|
|
||||||
|
|
||||||
if not self.auto_id:
|
assert len(texts) == len(
|
||||||
insert_dict[self._primary_field] = ids # type: ignore[assignment]
|
embeddings
|
||||||
|
), "Mismatched lengths of texts and embeddings."
|
||||||
if self._metadata_field is not None:
|
|
||||||
for d in metadatas: # type: ignore[union-attr]
|
|
||||||
insert_dict.setdefault(self._metadata_field, []).append(d)
|
|
||||||
else:
|
|
||||||
# Collect the metadata into the insert dict.
|
|
||||||
if metadatas is not None:
|
if metadatas is not None:
|
||||||
for d in metadatas:
|
assert len(texts) == len(
|
||||||
for key, value in d.items():
|
metadatas
|
||||||
keys = (
|
), "Mismatched lengths of texts and metadatas."
|
||||||
[x for x in self.fields if x != self._primary_field]
|
|
||||||
if self.auto_id
|
for i, text, embedding in zip(range(len(texts)), texts, embeddings):
|
||||||
else [x for x in self.fields]
|
entity_dict = {}
|
||||||
)
|
metadata = metadatas[i] if metadatas else {}
|
||||||
if key in keys:
|
if not self.auto_id:
|
||||||
insert_dict.setdefault(key, []).append(value)
|
entity_dict[self._primary_field] = ids[i] # type: ignore[index]
|
||||||
|
|
||||||
|
entity_dict[self._text_field] = text
|
||||||
|
entity_dict[self._vector_field] = embedding
|
||||||
|
|
||||||
|
if self._metadata_field and not self.enable_dynamic_field:
|
||||||
|
entity_dict[self._metadata_field] = metadata
|
||||||
|
else:
|
||||||
|
for key, value in metadata.items():
|
||||||
|
# if not enable_dynamic_field, skip fields not in the collection.
|
||||||
|
if not self.enable_dynamic_field and key not in self.fields:
|
||||||
|
continue
|
||||||
|
# If enable_dynamic_field, all fields are allowed.
|
||||||
|
entity_dict[key] = value
|
||||||
|
|
||||||
|
insert_list.append(entity_dict)
|
||||||
|
|
||||||
# Total insert count
|
# Total insert count
|
||||||
vectors: list = insert_dict[self._vector_field]
|
total_count = len(insert_list)
|
||||||
total_count = len(vectors)
|
|
||||||
|
|
||||||
pks: list[str] = []
|
pks: list[str] = []
|
||||||
|
|
||||||
@ -682,15 +756,12 @@ class Milvus(VectorStore):
|
|||||||
for i in range(0, total_count, batch_size):
|
for i in range(0, total_count, batch_size):
|
||||||
# Grab end index
|
# Grab end index
|
||||||
end = min(i + batch_size, total_count)
|
end = min(i + batch_size, total_count)
|
||||||
# Convert dict to list of lists batch for insertion
|
batch_insert_list = insert_list[i:end]
|
||||||
insert_list = [
|
|
||||||
insert_dict[x][i:end] for x in self.fields if x in insert_dict
|
|
||||||
]
|
|
||||||
# Insert into the collection.
|
# Insert into the collection.
|
||||||
try:
|
try:
|
||||||
res: Collection
|
res: Collection
|
||||||
timeout = self.timeout or timeout
|
timeout = self.timeout or timeout
|
||||||
res = self.col.insert(insert_list, timeout=timeout, **kwargs)
|
res = self.col.insert(batch_insert_list, timeout=timeout, **kwargs)
|
||||||
pks.extend(res.primary_keys)
|
pks.extend(res.primary_keys)
|
||||||
except MilvusException as e:
|
except MilvusException as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
@ -699,6 +770,61 @@ class Milvus(VectorStore):
|
|||||||
raise e
|
raise e
|
||||||
return pks
|
return pks
|
||||||
|
|
||||||
|
def _collection_search(
|
||||||
|
self,
|
||||||
|
embedding: List[float],
|
||||||
|
k: int = 4,
|
||||||
|
param: Optional[dict] = None,
|
||||||
|
expr: Optional[str] = None,
|
||||||
|
timeout: Optional[float] = None,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> "pymilvus.client.abstract.SearchResult | None": # type: ignore[name-defined] # noqa: F821
|
||||||
|
"""Perform a search on an embedding and return milvus search results.
|
||||||
|
|
||||||
|
For more information about the search parameters, take a look at the pymilvus
|
||||||
|
documentation found here:
|
||||||
|
https://milvus.io/api-reference/pymilvus/v2.4.x/ORM/Collection/search.md
|
||||||
|
|
||||||
|
Args:
|
||||||
|
embedding (List[float]): The embedding vector being searched.
|
||||||
|
k (int, optional): The amount of results to return. Defaults to 4.
|
||||||
|
param (dict): The search params for the specified index.
|
||||||
|
Defaults to None.
|
||||||
|
expr (str, optional): Filtering expression. Defaults to None.
|
||||||
|
timeout (float, optional): How long to wait before timeout error.
|
||||||
|
Defaults to None.
|
||||||
|
kwargs: Collection.search() keyword arguments.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
pymilvus.client.abstract.SearchResult: Milvus search result.
|
||||||
|
"""
|
||||||
|
if self.col is None:
|
||||||
|
logger.debug("No existing collection to search.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
if param is None:
|
||||||
|
param = self.search_params
|
||||||
|
|
||||||
|
# Determine result metadata fields with PK.
|
||||||
|
if self.enable_dynamic_field:
|
||||||
|
output_fields = ["*"]
|
||||||
|
else:
|
||||||
|
output_fields = self.fields[:]
|
||||||
|
output_fields.remove(self._vector_field)
|
||||||
|
timeout = self.timeout or timeout
|
||||||
|
# Perform the search.
|
||||||
|
res = self.col.search(
|
||||||
|
data=[embedding],
|
||||||
|
anns_field=self._vector_field,
|
||||||
|
param=param,
|
||||||
|
limit=k,
|
||||||
|
expr=expr,
|
||||||
|
output_fields=output_fields,
|
||||||
|
timeout=timeout,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
return res
|
||||||
|
|
||||||
def similarity_search(
|
def similarity_search(
|
||||||
self,
|
self,
|
||||||
query: str,
|
query: str,
|
||||||
@ -778,7 +904,7 @@ class Milvus(VectorStore):
|
|||||||
|
|
||||||
For more information about the search parameters, take a look at the pymilvus
|
For more information about the search parameters, take a look at the pymilvus
|
||||||
documentation found here:
|
documentation found here:
|
||||||
https://milvus.io/api-reference/pymilvus/v2.2.6/Collection/search().md
|
https://milvus.io/api-reference/pymilvus/v2.4.x/ORM/Collection/search.md
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
query (str): The text being searched.
|
query (str): The text being searched.
|
||||||
@ -814,11 +940,11 @@ class Milvus(VectorStore):
|
|||||||
timeout: Optional[float] = None,
|
timeout: Optional[float] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Tuple[Document, float]]:
|
) -> List[Tuple[Document, float]]:
|
||||||
"""Perform a search on a query string and return results with score.
|
"""Perform a search on an embedding and return results with score.
|
||||||
|
|
||||||
For more information about the search parameters, take a look at the pymilvus
|
For more information about the search parameters, take a look at the pymilvus
|
||||||
documentation found here:
|
documentation found here:
|
||||||
https://milvus.io/api-reference/pymilvus/v2.2.6/Collection/search().md
|
https://milvus.io/api-reference/pymilvus/v2.4.x/ORM/Collection/search.md
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
embedding (List[float]): The embedding vector being searched.
|
embedding (List[float]): The embedding vector being searched.
|
||||||
@ -833,32 +959,14 @@ class Milvus(VectorStore):
|
|||||||
Returns:
|
Returns:
|
||||||
List[Tuple[Document, float]]: Result doc and score.
|
List[Tuple[Document, float]]: Result doc and score.
|
||||||
"""
|
"""
|
||||||
if self.col is None:
|
col_search_res = self._collection_search(
|
||||||
logger.debug("No existing collection to search.")
|
embedding=embedding, k=k, param=param, expr=expr, timeout=timeout, **kwargs
|
||||||
return []
|
|
||||||
|
|
||||||
if param is None:
|
|
||||||
param = self.search_params
|
|
||||||
|
|
||||||
# Determine result metadata fields with PK.
|
|
||||||
output_fields = self.fields[:]
|
|
||||||
output_fields.remove(self._vector_field)
|
|
||||||
timeout = self.timeout or timeout
|
|
||||||
# Perform the search.
|
|
||||||
res = self.col.search(
|
|
||||||
data=[embedding],
|
|
||||||
anns_field=self._vector_field,
|
|
||||||
param=param,
|
|
||||||
limit=k,
|
|
||||||
expr=expr,
|
|
||||||
output_fields=output_fields,
|
|
||||||
timeout=timeout,
|
|
||||||
**kwargs,
|
|
||||||
)
|
)
|
||||||
# Organize results.
|
if col_search_res is None:
|
||||||
|
return []
|
||||||
ret = []
|
ret = []
|
||||||
for result in res[0]:
|
for result in col_search_res[0]:
|
||||||
data = {x: result.entity.get(x) for x in output_fields}
|
data = {x: result.entity.get(x) for x in result.entity.fields}
|
||||||
doc = self._parse_document(data)
|
doc = self._parse_document(data)
|
||||||
pair = (doc, result.score)
|
pair = (doc, result.score)
|
||||||
ret.append(pair)
|
ret.append(pair)
|
||||||
@ -947,40 +1055,27 @@ class Milvus(VectorStore):
|
|||||||
Returns:
|
Returns:
|
||||||
List[Document]: Document results for search.
|
List[Document]: Document results for search.
|
||||||
"""
|
"""
|
||||||
if self.col is None:
|
col_search_res = self._collection_search(
|
||||||
logger.debug("No existing collection to search.")
|
embedding=embedding,
|
||||||
return []
|
k=fetch_k,
|
||||||
|
|
||||||
if param is None:
|
|
||||||
param = self.search_params
|
|
||||||
|
|
||||||
# Determine result metadata fields.
|
|
||||||
output_fields = self.fields[:]
|
|
||||||
output_fields.remove(self._vector_field)
|
|
||||||
timeout = self.timeout or timeout
|
|
||||||
# Perform the search.
|
|
||||||
res = self.col.search(
|
|
||||||
data=[embedding],
|
|
||||||
anns_field=self._vector_field,
|
|
||||||
param=param,
|
param=param,
|
||||||
limit=fetch_k,
|
|
||||||
expr=expr,
|
expr=expr,
|
||||||
output_fields=output_fields,
|
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
# Organize results.
|
if col_search_res is None:
|
||||||
|
return []
|
||||||
ids = []
|
ids = []
|
||||||
documents = []
|
documents = []
|
||||||
scores = []
|
scores = []
|
||||||
for result in res[0]:
|
for result in col_search_res[0]:
|
||||||
data = {x: result.entity.get(x) for x in output_fields}
|
data = {x: result.entity.get(x) for x in result.entity.fields}
|
||||||
doc = self._parse_document(data)
|
doc = self._parse_document(data)
|
||||||
documents.append(doc)
|
documents.append(doc)
|
||||||
scores.append(result.score)
|
scores.append(result.score)
|
||||||
ids.append(result.id)
|
ids.append(result.id)
|
||||||
|
|
||||||
vectors = self.col.query(
|
vectors = self.col.query( # type: ignore[union-attr]
|
||||||
expr=f"{self._primary_field} in {ids}",
|
expr=f"{self._primary_field} in {ids}",
|
||||||
output_fields=[self._primary_field, self._vector_field],
|
output_fields=[self._primary_field, self._vector_field],
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
@ -1089,6 +1184,8 @@ class Milvus(VectorStore):
|
|||||||
return vector_db
|
return vector_db
|
||||||
|
|
||||||
def _parse_document(self, data: dict) -> Document:
|
def _parse_document(self, data: dict) -> Document:
|
||||||
|
if self._vector_field in data:
|
||||||
|
data.pop(self._vector_field)
|
||||||
return Document(
|
return Document(
|
||||||
page_content=data.pop(self._text_field),
|
page_content=data.pop(self._text_field),
|
||||||
metadata=data.pop(self._metadata_field) if self._metadata_field else data,
|
metadata=data.pop(self._metadata_field) if self._metadata_field else data,
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
"""Test Milvus functionality."""
|
"""Test Milvus functionality."""
|
||||||
from typing import Any, List, Optional
|
from typing import Any, List, Optional
|
||||||
|
|
||||||
|
import pytest
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
from langchain_milvus.vectorstores import Milvus
|
from langchain_milvus.vectorstores import Milvus
|
||||||
@ -27,6 +28,7 @@ def _milvus_from_texts(
|
|||||||
metadatas: Optional[List[dict]] = None,
|
metadatas: Optional[List[dict]] = None,
|
||||||
ids: Optional[List[str]] = None,
|
ids: Optional[List[str]] = None,
|
||||||
drop: bool = True,
|
drop: bool = True,
|
||||||
|
**kwargs: Any,
|
||||||
) -> Milvus:
|
) -> Milvus:
|
||||||
return Milvus.from_texts(
|
return Milvus.from_texts(
|
||||||
fake_texts,
|
fake_texts,
|
||||||
@ -36,6 +38,7 @@ def _milvus_from_texts(
|
|||||||
# connection_args={"uri": "http://127.0.0.1:19530"},
|
# connection_args={"uri": "http://127.0.0.1:19530"},
|
||||||
connection_args={"uri": "./milvus_demo.db"},
|
connection_args={"uri": "./milvus_demo.db"},
|
||||||
drop_old=drop,
|
drop_old=drop,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -50,6 +53,15 @@ def test_milvus() -> None:
|
|||||||
assert_docs_equal_without_pk(output, [Document(page_content="foo")])
|
assert_docs_equal_without_pk(output, [Document(page_content="foo")])
|
||||||
|
|
||||||
|
|
||||||
|
def test_milvus_vector_search() -> None:
|
||||||
|
"""Test end to end construction and search by vector."""
|
||||||
|
docsearch = _milvus_from_texts()
|
||||||
|
output = docsearch.similarity_search_by_vector(
|
||||||
|
FakeEmbeddings().embed_query("foo"), k=1
|
||||||
|
)
|
||||||
|
assert_docs_equal_without_pk(output, [Document(page_content="foo")])
|
||||||
|
|
||||||
|
|
||||||
def test_milvus_with_metadata() -> None:
|
def test_milvus_with_metadata() -> None:
|
||||||
"""Test with metadata"""
|
"""Test with metadata"""
|
||||||
docsearch = _milvus_from_texts(metadatas=[{"label": "test"}] * len(fake_texts))
|
docsearch = _milvus_from_texts(metadatas=[{"label": "test"}] * len(fake_texts))
|
||||||
@ -110,6 +122,21 @@ def test_milvus_max_marginal_relevance_search() -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_milvus_max_marginal_relevance_search_with_dynamic_field() -> None:
|
||||||
|
"""Test end to end construction and MRR search with enabling dynamic field."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = _milvus_from_texts(metadatas=metadatas, enable_dynamic_field=True)
|
||||||
|
output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3)
|
||||||
|
assert_docs_equal_without_pk(
|
||||||
|
output,
|
||||||
|
[
|
||||||
|
Document(page_content="foo", metadata={"page": 0}),
|
||||||
|
Document(page_content="baz", metadata={"page": 2}),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_milvus_add_extra() -> None:
|
def test_milvus_add_extra() -> None:
|
||||||
"""Test end to end construction and MRR search."""
|
"""Test end to end construction and MRR search."""
|
||||||
texts = ["foo", "bar", "baz"]
|
texts = ["foo", "bar", "baz"]
|
||||||
@ -123,7 +150,7 @@ def test_milvus_add_extra() -> None:
|
|||||||
|
|
||||||
|
|
||||||
def test_milvus_no_drop() -> None:
|
def test_milvus_no_drop() -> None:
|
||||||
"""Test end to end construction and MRR search."""
|
"""Test construction without dropping old data."""
|
||||||
texts = ["foo", "bar", "baz"]
|
texts = ["foo", "bar", "baz"]
|
||||||
metadatas = [{"page": i} for i in range(len(texts))]
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
docsearch = _milvus_from_texts(metadatas=metadatas)
|
docsearch = _milvus_from_texts(metadatas=metadatas)
|
||||||
@ -171,14 +198,95 @@ def test_milvus_upsert_entities() -> None:
|
|||||||
assert len(ids) == 2 # type: ignore[arg-type]
|
assert len(ids) == 2 # type: ignore[arg-type]
|
||||||
|
|
||||||
|
|
||||||
|
def test_milvus_enable_dynamic_field() -> None:
|
||||||
|
"""Test end to end construction and enable dynamic field"""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"id": i} for i in range(len(texts))]
|
||||||
|
docsearch = _milvus_from_texts(metadatas=metadatas, enable_dynamic_field=True)
|
||||||
|
output = docsearch.similarity_search("foo", k=10)
|
||||||
|
assert len(output) == 3
|
||||||
|
|
||||||
|
# When enable dynamic field, any new field data will be added to the collection.
|
||||||
|
new_metadatas = [{"id_new": i} for i in range(len(texts))]
|
||||||
|
docsearch.add_texts(texts, new_metadatas)
|
||||||
|
|
||||||
|
output = docsearch.similarity_search("foo", k=10)
|
||||||
|
assert len(output) == 6
|
||||||
|
|
||||||
|
assert set(docsearch.fields) == {
|
||||||
|
docsearch._primary_field,
|
||||||
|
docsearch._text_field,
|
||||||
|
docsearch._vector_field,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_milvus_disable_dynamic_field() -> None:
|
||||||
|
"""Test end to end construction and disable dynamic field"""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"id": i} for i in range(len(texts))]
|
||||||
|
docsearch = _milvus_from_texts(metadatas=metadatas, enable_dynamic_field=False)
|
||||||
|
output = docsearch.similarity_search("foo", k=10)
|
||||||
|
assert len(output) == 3
|
||||||
|
# ["pk", "text", "vector", "id"]
|
||||||
|
assert set(docsearch.fields) == {
|
||||||
|
docsearch._primary_field,
|
||||||
|
docsearch._text_field,
|
||||||
|
docsearch._vector_field,
|
||||||
|
"id",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Try to add new fields "id_new", but since dynamic field is disabled,
|
||||||
|
# all fields in the collection is specified as ["pk", "text", "vector", "id"],
|
||||||
|
# new field information "id_new" will not be added.
|
||||||
|
new_metadatas = [{"id": i, "id_new": i} for i in range(len(texts))]
|
||||||
|
docsearch.add_texts(texts, new_metadatas)
|
||||||
|
output = docsearch.similarity_search("foo", k=10)
|
||||||
|
assert len(output) == 6
|
||||||
|
for doc in output:
|
||||||
|
assert set(doc.metadata.keys()) == {"id", "pk"} # `id_new` is not added.
|
||||||
|
|
||||||
|
# When disable dynamic field,
|
||||||
|
# missing data of the created fields "id", will raise an exception.
|
||||||
|
with pytest.raises(Exception):
|
||||||
|
new_metadatas = [{"id_new": i} for i in range(len(texts))]
|
||||||
|
docsearch.add_texts(texts, new_metadatas)
|
||||||
|
|
||||||
|
|
||||||
|
def test_milvus_metadata_field() -> None:
|
||||||
|
"""Test end to end construction and use metadata field"""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"id": i} for i in range(len(texts))]
|
||||||
|
docsearch = _milvus_from_texts(metadatas=metadatas, metadata_field="metadata")
|
||||||
|
output = docsearch.similarity_search("foo", k=10)
|
||||||
|
assert len(output) == 3
|
||||||
|
|
||||||
|
new_metadatas = [{"id_new": i} for i in range(len(texts))]
|
||||||
|
docsearch.add_texts(texts, new_metadatas)
|
||||||
|
|
||||||
|
output = docsearch.similarity_search("foo", k=10)
|
||||||
|
assert len(output) == 6
|
||||||
|
|
||||||
|
assert set(docsearch.fields) == {
|
||||||
|
docsearch._primary_field,
|
||||||
|
docsearch._text_field,
|
||||||
|
docsearch._vector_field,
|
||||||
|
docsearch._metadata_field,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# if __name__ == "__main__":
|
# if __name__ == "__main__":
|
||||||
# test_milvus()
|
# test_milvus()
|
||||||
|
# test_milvus_vector_search()
|
||||||
# test_milvus_with_metadata()
|
# test_milvus_with_metadata()
|
||||||
# test_milvus_with_id()
|
# test_milvus_with_id()
|
||||||
# test_milvus_with_score()
|
# test_milvus_with_score()
|
||||||
# test_milvus_max_marginal_relevance_search()
|
# test_milvus_max_marginal_relevance_search()
|
||||||
|
# test_milvus_max_marginal_relevance_search_with_dynamic_field()
|
||||||
# test_milvus_add_extra()
|
# test_milvus_add_extra()
|
||||||
# test_milvus_no_drop()
|
# test_milvus_no_drop()
|
||||||
# test_milvus_get_pks()
|
# test_milvus_get_pks()
|
||||||
# test_milvus_delete_entities()
|
# test_milvus_delete_entities()
|
||||||
# test_milvus_upsert_entities()
|
# test_milvus_upsert_entities()
|
||||||
|
# test_milvus_enable_dynamic_field()
|
||||||
|
# test_milvus_disable_dynamic_field()
|
||||||
|
# test_milvus_metadata_field()
|
||||||
|
Loading…
Reference in New Issue
Block a user