milvus: add array data type for collection create (#23219)

Add array data type for milvus vector store collection create


Thank you for contributing to LangChain!

- [x] **PR title**: "package: description"
- Where "package" is whichever of langchain, community, core,
experimental, etc. is being modified. Use "docs: ..." for purely docs
changes, "templates: ..." for template changes, "infra: ..." for CI
changes.
  - Example: "community: add foobar LLM"


- [x] **PR message**: ***Delete this entire checklist*** and replace
with
    - **Description:** a description of the change
    - **Issue:** the issue # it fixes, if applicable
    - **Dependencies:** any dependencies required for this change
- **Twitter handle:** if your PR gets announced, and you'd like a
mention, we'll gladly shout you out!


- [x] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.


- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.

---------

Signed-off-by: ChengZi <chen.zhang@zilliz.com>
Co-authored-by: Rohit Gupta <rohit.gupta2@walmart.com>
Co-authored-by: ChengZi <chen.zhang@zilliz.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Rohit Gupta 2024-08-28 22:25:57 +05:30 committed by GitHub
parent 754f3c41f9
commit aff50a1e6f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 87 additions and 15 deletions

View File

@ -240,6 +240,7 @@ class Milvus(VectorStore):
replica_number: int = 1,
timeout: Optional[float] = None,
num_shards: Optional[int] = None,
metadata_schema: Optional[dict[str, Any]] = None,
):
"""Initialize the Milvus vector store."""
try:
@ -310,6 +311,7 @@ class Milvus(VectorStore):
self.replica_number = replica_number
self.timeout = timeout
self.num_shards = num_shards
self.metadata_schema = metadata_schema
# Create the connection to the server
if connection_args is None:
@ -472,24 +474,47 @@ class Milvus(VectorStore):
)
raise ValueError(f"Metadata key {key} is reserved.")
# Infer the corresponding datatype of the metadata
dtype = infer_dtype_bydata(value)
# Datatype isn't compatible
if dtype == DataType.UNKNOWN or dtype == DataType.NONE:
logger.error(
(
"Failure to create collection, "
"unrecognized dtype for key: %s"
),
key,
)
raise ValueError(f"Unrecognized datatype for {key}.")
# Datatype is a string/varchar equivalent
elif dtype == DataType.VARCHAR:
if (
key in self.metadata_schema # type: ignore
and "dtype" in self.metadata_schema[key] # type: ignore
):
kwargs = self.metadata_schema[key].get("kwargs", {}) # type: ignore
fields.append(
FieldSchema(key, DataType.VARCHAR, max_length=65_535)
FieldSchema(
name=key,
dtype=self.metadata_schema[key]["dtype"], # type: ignore
**kwargs,
)
)
else:
fields.append(FieldSchema(key, dtype))
dtype = infer_dtype_bydata(value)
# Datatype isn't compatible
if dtype == DataType.UNKNOWN or dtype == DataType.NONE:
logger.error(
(
"Failure to create collection, "
"unrecognized dtype for key: %s"
),
key,
)
raise ValueError(f"Unrecognized datatype for {key}.")
# Datatype is a string/varchar equivalent
elif dtype == DataType.VARCHAR:
fields.append(
FieldSchema(key, DataType.VARCHAR, max_length=65_535)
)
# infer_dtype_bydata currently can't recognize array type,
# so this line can not be accessed.
# This line may need to be modified in the future when
# infer_dtype_bydata can recognize array type.
# https://github.com/milvus-io/pymilvus/issues/2165
elif dtype == DataType.ARRAY:
kwargs = self.metadata_schema[key]["kwargs"] # type: ignore
fields.append(
FieldSchema(name=key, dtype=DataType.ARRAY, **kwargs)
)
else:
fields.append(FieldSchema(key, dtype))
# Create the text field
fields.append(

View File

@ -39,6 +39,7 @@ def _milvus_from_texts(
# connection_args={"uri": "http://127.0.0.1:19530"},
connection_args={"uri": "./milvus_demo.db"},
drop_old=drop,
consistency_level="Strong",
**kwargs,
)
@ -303,6 +304,51 @@ def test_milvus_enable_dynamic_field_with_partition_key() -> None:
}
def test_milvus_array_field() -> None:
"""Manually specify metadata schema, including an array_field.
For more information about array data type and filtering, please refer to
https://milvus.io/docs/array_data_type.md
"""
from pymilvus import DataType
texts = ["foo", "bar", "baz"]
metadatas = [{"id": i, "array_field": [i, i + 1, i + 2]} for i in range(len(texts))]
# Manually specify metadata schema, including an array_field.
# If some fields are not specified, Milvus will automatically infer their schemas.
docsearch = _milvus_from_texts(
metadatas=metadatas,
metadata_schema={
"array_field": {
"dtype": DataType.ARRAY,
"kwargs": {"element_type": DataType.INT64, "max_capacity": 50},
},
# "id": {
# "dtype": DataType.INT64,
# }
},
)
output = docsearch.similarity_search("foo", k=10, expr="array_field[0] < 2")
assert len(output) == 2
output = docsearch.similarity_search(
"foo", k=10, expr="ARRAY_CONTAINS(array_field, 3)"
)
assert len(output) == 2
# If we use enable_dynamic_field,
# there is no need to manually specify metadata schema.
docsearch = _milvus_from_texts(
enable_dynamic_field=True,
metadatas=metadatas,
)
output = docsearch.similarity_search("foo", k=10, expr="array_field[0] < 2")
assert len(output) == 2
output = docsearch.similarity_search(
"foo", k=10, expr="ARRAY_CONTAINS(array_field, 3)"
)
assert len(output) == 2
# if __name__ == "__main__":
# test_milvus()
# test_milvus_vector_search()
@ -319,3 +365,4 @@ def test_milvus_enable_dynamic_field_with_partition_key() -> None:
# test_milvus_enable_dynamic_field()
# test_milvus_disable_dynamic_field()
# test_milvus_metadata_field()
# test_milvus_array_field()