mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-10 13:27:36 +00:00
Milvus allows to store metadata as json field (#14636)
Because Milvus doesn't support nullable fields and document metadata is very rich, it makes more sense to store it as JSON. https://github.com/milvus-io/pymilvus/issues/1705#issuecomment-1731112372 <!-- Thank you for contributing to LangChain! Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes (if applicable), - **Dependencies:** any dependencies required for this change, - **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below), - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/extras` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. --> --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
620168e459
commit
da0f750a0b
@ -53,6 +53,9 @@ class Milvus(VectorStore):
|
|||||||
primary_field (str): Name of the primary key field. Defaults to "pk".
|
primary_field (str): Name of the primary key field. Defaults to "pk".
|
||||||
text_field (str): Name of the text field. Defaults to "text".
|
text_field (str): Name of the text field. Defaults to "text".
|
||||||
vector_field (str): Name of the vector field. Defaults to "vector".
|
vector_field (str): Name of the vector field. Defaults to "vector".
|
||||||
|
metadata_field (str): Name of the metadata field. Defaults to None.
|
||||||
|
When metadata_field is specified,
|
||||||
|
the document's metadata will be stored as JSON.
|
||||||
|
|
||||||
The connection args used for this class comes in the form of a dict,
|
The connection args used for this class comes in the form of a dict,
|
||||||
here are a few of the options:
|
here are a few of the options:
|
||||||
@ -112,6 +115,7 @@ class Milvus(VectorStore):
|
|||||||
primary_field: str = "pk",
|
primary_field: str = "pk",
|
||||||
text_field: str = "text",
|
text_field: str = "text",
|
||||||
vector_field: str = "vector",
|
vector_field: str = "vector",
|
||||||
|
metadata_field: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Initialize the Milvus vector store."""
|
"""Initialize the Milvus vector store."""
|
||||||
try:
|
try:
|
||||||
@ -148,6 +152,7 @@ class Milvus(VectorStore):
|
|||||||
self._text_field = text_field
|
self._text_field = text_field
|
||||||
# In order for compatibility, the vector field needs to be called "vector"
|
# In order for compatibility, the vector field needs to be called "vector"
|
||||||
self._vector_field = vector_field
|
self._vector_field = vector_field
|
||||||
|
self._metadata_field = metadata_field
|
||||||
self.fields: list[str] = []
|
self.fields: list[str] = []
|
||||||
# Create the connection to the server
|
# Create the connection to the server
|
||||||
if connection_args is None:
|
if connection_args is None:
|
||||||
@ -250,24 +255,32 @@ class Milvus(VectorStore):
|
|||||||
# Determine embedding dim
|
# Determine embedding dim
|
||||||
dim = len(embeddings[0])
|
dim = len(embeddings[0])
|
||||||
fields = []
|
fields = []
|
||||||
# Determine metadata schema
|
if self._metadata_field is not None:
|
||||||
if metadatas:
|
fields.append(FieldSchema(self._metadata_field, DataType.JSON))
|
||||||
# Create FieldSchema for each entry in metadata.
|
else:
|
||||||
for key, value in metadatas[0].items():
|
# Determine metadata schema
|
||||||
# Infer the corresponding datatype of the metadata
|
if metadatas:
|
||||||
dtype = infer_dtype_bydata(value)
|
# Create FieldSchema for each entry in metadata.
|
||||||
# Datatype isn't compatible
|
for key, value in metadatas[0].items():
|
||||||
if dtype == DataType.UNKNOWN or dtype == DataType.NONE:
|
# Infer the corresponding datatype of the metadata
|
||||||
logger.error(
|
dtype = infer_dtype_bydata(value)
|
||||||
"Failure to create collection, unrecognized dtype for key: %s",
|
# Datatype isn't compatible
|
||||||
key,
|
if dtype == DataType.UNKNOWN or dtype == DataType.NONE:
|
||||||
)
|
logger.error(
|
||||||
raise ValueError(f"Unrecognized datatype for {key}.")
|
(
|
||||||
# Datatype is a string/varchar equivalent
|
"Failure to create collection, "
|
||||||
elif dtype == DataType.VARCHAR:
|
"unrecognized dtype for key: %s"
|
||||||
fields.append(FieldSchema(key, DataType.VARCHAR, max_length=65_535))
|
),
|
||||||
else:
|
key,
|
||||||
fields.append(FieldSchema(key, dtype))
|
)
|
||||||
|
raise ValueError(f"Unrecognized datatype for {key}.")
|
||||||
|
# Datatype is a string/varchar equivalent
|
||||||
|
elif dtype == DataType.VARCHAR:
|
||||||
|
fields.append(
|
||||||
|
FieldSchema(key, DataType.VARCHAR, max_length=65_535)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
fields.append(FieldSchema(key, dtype))
|
||||||
|
|
||||||
# Create the text field
|
# Create the text field
|
||||||
fields.append(
|
fields.append(
|
||||||
@ -442,12 +455,16 @@ class Milvus(VectorStore):
|
|||||||
self._vector_field: embeddings,
|
self._vector_field: embeddings,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Collect the metadata into the insert dict.
|
if self._metadata_field is not None:
|
||||||
if metadatas is not None:
|
|
||||||
for d in metadatas:
|
for d in metadatas:
|
||||||
for key, value in d.items():
|
insert_dict.setdefault(self._metadata_field, []).append(d)
|
||||||
if key in self.fields:
|
else:
|
||||||
insert_dict.setdefault(key, []).append(value)
|
# Collect the metadata into the insert dict.
|
||||||
|
if metadatas is not None:
|
||||||
|
for d in metadatas:
|
||||||
|
for key, value in d.items():
|
||||||
|
if key in self.fields:
|
||||||
|
insert_dict.setdefault(key, []).append(value)
|
||||||
|
|
||||||
# Total insert count
|
# Total insert count
|
||||||
vectors: list = insert_dict[self._vector_field]
|
vectors: list = insert_dict[self._vector_field]
|
||||||
@ -630,8 +647,8 @@ class Milvus(VectorStore):
|
|||||||
# Organize results.
|
# Organize results.
|
||||||
ret = []
|
ret = []
|
||||||
for result in res[0]:
|
for result in res[0]:
|
||||||
meta = {x: result.entity.get(x) for x in output_fields}
|
data = {x: result.entity.get(x) for x in output_fields}
|
||||||
doc = Document(page_content=meta.pop(self._text_field), metadata=meta)
|
doc = self._parse_document(data)
|
||||||
pair = (doc, result.score)
|
pair = (doc, result.score)
|
||||||
ret.append(pair)
|
ret.append(pair)
|
||||||
|
|
||||||
@ -746,8 +763,8 @@ class Milvus(VectorStore):
|
|||||||
documents = []
|
documents = []
|
||||||
scores = []
|
scores = []
|
||||||
for result in res[0]:
|
for result in res[0]:
|
||||||
meta = {x: result.entity.get(x) for x in output_fields}
|
data = {x: result.entity.get(x) for x in output_fields}
|
||||||
doc = Document(page_content=meta.pop(self._text_field), metadata=meta)
|
doc = self._parse_document(data)
|
||||||
documents.append(doc)
|
documents.append(doc)
|
||||||
scores.append(result.score)
|
scores.append(result.score)
|
||||||
ids.append(result.id)
|
ids.append(result.id)
|
||||||
@ -826,3 +843,9 @@ class Milvus(VectorStore):
|
|||||||
)
|
)
|
||||||
vector_db.add_texts(texts=texts, metadatas=metadatas)
|
vector_db.add_texts(texts=texts, metadatas=metadatas)
|
||||||
return vector_db
|
return vector_db
|
||||||
|
|
||||||
|
def _parse_document(self, data: dict) -> Document:
|
||||||
|
return Document(
|
||||||
|
page_content=data.pop(self._text_field),
|
||||||
|
metadata=data.pop(self._metadata_field) if self._metadata_field else data,
|
||||||
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user