feat(RAG):add metadata properties filters (#1395)

Author: Aries-ckt
Date: 2024-04-10 14:33:24 +08:00 (committed by GitHub)
Parent: 0f2b46da62
Commit: 37e7c0151b
26 changed files with 619 additions and 166 deletions

View File

@@ -4,11 +4,12 @@ import math
import time
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional
from typing import Any, List, Optional
from dbgpt._private.pydantic import BaseModel, Field
from dbgpt.core import Chunk, Embeddings
from dbgpt.core.awel.flow import Parameter
from dbgpt.storage.vector_store.filters import MetadataFilters
from dbgpt.util.i18n_utils import _
logger = logging.getLogger(__name__)
@@ -176,13 +177,15 @@ class VectorStoreBase(ABC):
return ids
@abstractmethod
def similar_search(self, text: str, topk: int) -> List[Chunk]:
def similar_search(
self, text: str, topk: int, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
"""Similar search in vector database.
Args:
text(str): The query text.
topk(int): The number of similar documents to return.
filters(Optional[MetadataFilters]): metadata filters.
Return:
List[Chunk]: The similar documents.
"""
@@ -190,7 +193,11 @@ class VectorStoreBase(ABC):
@abstractmethod
def similar_search_with_scores(
self, text, topk, score_threshold: float
self,
text,
topk,
score_threshold: float,
filters: Optional[MetadataFilters] = None,
) -> List[Chunk]:
"""Similar search with scores in vector database.
@@ -199,6 +206,7 @@ class VectorStoreBase(ABC):
topk(int): The number of similar documents to return.
score_threshold(float): Optional, a floating point value
between 0 and 1.
filters(Optional[MetadataFilters]): metadata filters.
Return:
List[Chunk]: The similar documents.
"""
@@ -223,6 +231,15 @@ class VectorStoreBase(ABC):
Args:
vector_name(str): The name of vector to delete.
"""
pass
def convert_metadata_filters(self, filters: MetadataFilters) -> Any:
"""Convert metadata filters to vector store filters.
Args:
filters(MetadataFilters): metadata filters.
"""
raise NotImplementedError
def _normalization_vectors(self, vectors):
"""Return L2-normalization vectors to scale[0,1].

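The hunk above makes filters part of the abstract contract of every vector store, so callers can narrow similarity search by document metadata. A minimal caller-side sketch (the search_manual helper, the store argument and the "source" metadata key are hypothetical; only the similar_search signature and the filter models come from this change):

from typing import List

from dbgpt.core import Chunk
from dbgpt.storage.vector_store.base import VectorStoreBase
from dbgpt.storage.vector_store.filters import MetadataFilter, MetadataFilters


def search_manual(store: VectorStoreBase, query: str) -> List[Chunk]:
    # Only return chunks whose "source" metadata equals "user_manual.md"
    # (FilterOperator.EQ is the default operator, FilterCondition.AND the
    # default condition).
    filters = MetadataFilters(
        filters=[MetadataFilter(key="source", value="user_manual.md")]
    )
    return store.similar_search(query, topk=5, filters=filters)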
View File

@@ -1,7 +1,7 @@
"""Chroma vector store."""
import logging
import os
from typing import Any, List
from typing import List, Optional
from chromadb import PersistentClient
from chromadb.config import Settings
@@ -13,6 +13,7 @@ from dbgpt.core.awel.flow import Parameter, ResourceCategory, register_resource
from dbgpt.util.i18n_utils import _
from .base import _COMMON_PARAMETERS, VectorStoreBase, VectorStoreConfig
from .filters import FilterOperator, MetadataFilters
logger = logging.getLogger(__name__)
@@ -86,16 +87,23 @@ class ChromaStore(VectorStoreBase):
collection_metadata=collection_metadata,
)
def similar_search(self, text, topk, **kwargs: Any) -> List[Chunk]:
def similar_search(
self, text, topk, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
"""Search similar documents."""
logger.info("ChromaStore similar search")
lc_documents = self.vector_store_client.similarity_search(text, topk, **kwargs)
where_filters = self.convert_metadata_filters(filters) if filters else None
lc_documents = self.vector_store_client.similarity_search(
text, topk, filter=where_filters
)
return [
Chunk(content=doc.page_content, metadata=doc.metadata)
for doc in lc_documents
]
def similar_search_with_scores(self, text, topk, score_threshold) -> List[Chunk]:
def similar_search_with_scores(
self, text, topk, score_threshold, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
"""Search similar documents with scores.
Chroma similar_search_with_score.
@@ -106,11 +114,16 @@ class ChromaStore(VectorStoreBase):
score_threshold(float): Optional, a floating point value between 0 and 1
to filter the resulting set of retrieved docs; 0 is
dissimilar, 1 is most similar.
filters(MetadataFilters): metadata filters, defaults to None
"""
logger.info("ChromaStore similar search with scores")
where_filters = self.convert_metadata_filters(filters) if filters else None
docs_and_scores = (
self.vector_store_client.similarity_search_with_relevance_scores(
query=text, k=topk, score_threshold=score_threshold
query=text,
k=topk,
score_threshold=score_threshold,
filter=where_filters,
)
)
return [
@@ -152,10 +165,71 @@ class ChromaStore(VectorStoreBase):
collection = self.vector_store_client._collection
collection.delete(ids=ids)
def convert_metadata_filters(
self,
filters: MetadataFilters,
) -> dict:
"""Convert metadata filters to Chroma filters.
Args:
filters(MetadataFilters): metadata filters.
Returns:
dict: Chroma filters.
"""
where_filters = {}
filters_list = []
condition = filters.condition
chroma_condition = f"${condition}"
if filters.filters:
for filter in filters.filters:
if filter.operator:
filters_list.append(
{
filter.key: {
_convert_chroma_filter_operator(
filter.operator
): filter.value
}
}
)
else:
filters_list.append({filter.key: filter.value}) # type: ignore
if len(filters_list) == 1:
return filters_list[0]
elif len(filters_list) > 1:
where_filters[chroma_condition] = filters_list
return where_filters
def _clean_persist_folder(self):
"""Clean persist folder."""
for root, dirs, files in os.walk(self.persist_dir, topdown=False):
for name in files:
os.remove(os.path.join(root, name))
for name in dirs:
os.rmdir(os.path.join(root, name))
os.rmdir(self.persist_dir)
def _convert_chroma_filter_operator(operator: str) -> str:
"""Convert operator to Chroma where operator.
Args:
operator(str): operator.
Returns:
str: Chroma where operator.
"""
if operator == FilterOperator.EQ:
return "$eq"
elif operator == FilterOperator.NE:
return "$ne"
elif operator == FilterOperator.GT:
return "$gt"
elif operator == FilterOperator.LT:
return "$lt"
elif operator == FilterOperator.GTE:
return "$gte"
elif operator == FilterOperator.LTE:
return "$lte"
else:
raise ValueError(f"Chroma Where operator {operator} not supported")

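For Chroma, convert_metadata_filters maps each MetadataFilter to a {key: {"$op": value}} entry via _convert_chroma_filter_operator and, when more than one filter is present, nests the entries under the condition key. A sketch of the expected where clause for a two-filter AND (the metadata keys and values are hypothetical, and this assumes the FilterCondition enum renders as its string value, giving "$and"):

# Input (hypothetical keys/values):
#   MetadataFilters(
#       condition=FilterCondition.AND,
#       filters=[
#           MetadataFilter(key="source", value="user_manual.md"),
#           MetadataFilter(key="page", operator=FilterOperator.GT, value=10),
#       ],
#   )
# Expected Chroma where clause built by ChromaStore.convert_metadata_filters:
expected_where = {
    "$and": [
        {"source": {"$eq": "user_manual.md"}},
        {"page": {"$gt": 10}},
    ]
}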
View File

@@ -13,6 +13,7 @@ from dbgpt.core.awel.flow import (
)
from dbgpt.storage import vector_store
from dbgpt.storage.vector_store.base import VectorStoreBase, VectorStoreConfig
from dbgpt.storage.vector_store.filters import MetadataFilters
from dbgpt.util.i18n_utils import _
connector: Dict[str, Type] = {}
@@ -128,23 +129,29 @@ class VectorStoreConnector:
max_threads,
)
def similar_search(self, doc: str, topk: int) -> List[Chunk]:
def similar_search(
self, doc: str, topk: int, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
"""Similar search in vector database.
Args:
- doc: query text
- topk: the number of similar documents to return.
- filters: metadata filters.
Return:
- chunks: chunks.
"""
return self.client.similar_search(doc, topk)
return self.client.similar_search(doc, topk, filters)
def similar_search_with_scores(
self, doc: str, topk: int, score_threshold: float
self,
doc: str,
topk: int,
score_threshold: float,
filters: Optional[MetadataFilters] = None,
) -> List[Chunk]:
"""Similar search with scores in vector database.
"""Similar_search_with_score in vector database.
similar_search_with_score in vector database..
Return docs and relevance scores in the range [0, 1].
Args:
@@ -153,10 +160,13 @@ class VectorStoreConnector:
score_threshold(float): Optional, a floating point value between 0 and 1
to filter the resulting set of retrieved docs; 0 is
dissimilar, 1 is most similar.
filters: metadata filters.
Return:
- chunks: chunks.
- chunks: Return docs and relevance scores in the range [0, 1].
"""
return self.client.similar_search_with_scores(doc, topk, score_threshold)
return self.client.similar_search_with_scores(
doc, topk, score_threshold, filters
)
@property
def vector_store_config(self) -> VectorStoreConfig:

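VectorStoreConnector now simply forwards the filters argument to the underlying client, so retrieval can be scoped by metadata at the connector level. A minimal sketch (the connector is assumed to be an already-initialized VectorStoreConnector; the doc_type key and the 0.3 threshold are illustrative, not from the diff):

from typing import List

from dbgpt.core import Chunk
from dbgpt.storage.vector_store.filters import MetadataFilter, MetadataFilters


def scored_search(connector, query: str) -> List[Chunk]:
    # Restrict retrieval to chunks whose doc_type metadata equals "faq",
    # then keep only results with a relevance score of at least 0.3.
    filters = MetadataFilters(
        filters=[MetadataFilter(key="doc_type", value="faq")]
    )
    return connector.similar_search_with_scores(
        query, topk=4, score_threshold=0.3, filters=filters
    )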
View File

@@ -0,0 +1,56 @@
"""Vector Store Meta data filters."""
from enum import Enum
from typing import List, Union
from pydantic import BaseModel, Field
class FilterOperator(str, Enum):
"""Meta data filter operator."""
EQ = "=="
GT = ">"
LT = "<"
NE = "!="
GTE = ">="
LTE = "<="
IN = "in"
NIN = "nin"
EXISTS = "exists"
class FilterCondition(str, Enum):
"""Vector Store Meta data filter conditions."""
AND = "and"
OR = "or"
class MetadataFilter(BaseModel):
"""Meta data filter."""
key: str = Field(
...,
description="The key of metadata to filter.",
)
operator: FilterOperator = Field(
default=FilterOperator.EQ,
description="The operator of metadata filter.",
)
value: Union[str, int, float, List[str], List[int], List[float]] = Field(
...,
description="The value of metadata to filter.",
)
class MetadataFilters(BaseModel):
"""Meta data filters."""
condition: FilterCondition = Field(
default=FilterCondition.AND,
description="The condition of metadata filters.",
)
filters: List[MetadataFilter] = Field(
...,
description="The metadata filters.",
)

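MetadataFilters is the store-agnostic way to express property filters: each MetadataFilter defaults to an equality check, and condition controls how multiple filters are combined. For example, a compound filter matching documents tagged "paper" or "report", or published after 2022 (hypothetical metadata keys):

from dbgpt.storage.vector_store.filters import (
    FilterCondition,
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
)

# tag in ["paper", "report"] OR year > 2022
filters = MetadataFilters(
    condition=FilterCondition.OR,
    filters=[
        MetadataFilter(
            key="tag", operator=FilterOperator.IN, value=["paper", "report"]
        ),
        MetadataFilter(key="year", operator=FilterOperator.GT, value=2022),
    ],
)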
View File

@@ -14,6 +14,7 @@ from dbgpt.storage.vector_store.base import (
VectorStoreBase,
VectorStoreConfig,
)
from dbgpt.storage.vector_store.filters import FilterOperator, MetadataFilters
from dbgpt.util import string_utils
from dbgpt.util.i18n_utils import _
@@ -206,6 +207,7 @@ class MilvusStore(VectorStoreBase):
self.vector_field = milvus_vector_config.get("embedding_field") or "vector"
self.text_field = milvus_vector_config.get("text_field") or "content"
self.metadata_field = milvus_vector_config.get("metadata_field") or "metadata"
self.props_field = milvus_vector_config.get("props_field") or "props_field"
if (self.username is None) != (self.password is None):
raise ValueError(
@@ -284,6 +286,7 @@ class MilvusStore(VectorStoreBase):
vector_field = self.vector_field
text_field = self.text_field
metadata_field = self.metadata_field
props_field = self.props_field
# self.text_field = text_field
collection_name = vector_name
fields = []
@@ -300,6 +303,7 @@ class MilvusStore(VectorStoreBase):
fields.append(FieldSchema(vector_field, DataType.FLOAT_VECTOR, dim=dim))
fields.append(FieldSchema(metadata_field, DataType.VARCHAR, max_length=65535))
fields.append(FieldSchema(props_field, DataType.JSON))
schema = CollectionSchema(fields)
# Create the collection
collection = Collection(collection_name, schema)
@@ -346,6 +350,7 @@ class MilvusStore(VectorStoreBase):
for d in metadatas:
# for key, value in d.items():
insert_dict.setdefault("metadata", []).append(json.dumps(d))
insert_dict.setdefault("props_field", []).append(d)
# Convert dict to list of lists for insertion
insert_list = [insert_dict[x] for x in self.fields]
# Insert into the collection.
@@ -368,7 +373,9 @@ class MilvusStore(VectorStoreBase):
doc_ids = [str(doc_id) for doc_id in doc_ids]
return doc_ids
def similar_search(self, text, topk) -> List[Chunk]:
def similar_search(
self, text, topk, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
"""Perform a search on a query string and return results."""
from pymilvus import Collection, DataType
@@ -383,7 +390,9 @@ class MilvusStore(VectorStoreBase):
self.primary_field = x.name
if x.dtype == DataType.FLOAT_VECTOR or x.dtype == DataType.BINARY_VECTOR:
self.vector_field = x.name
_, docs_and_scores = self._search(text, topk)
# convert to milvus expr filter.
milvus_filter_expr = self.convert_metadata_filters(filters) if filters else None
_, docs_and_scores = self._search(text, topk, expr=milvus_filter_expr)
return [
Chunk(
@@ -393,7 +402,13 @@ class MilvusStore(VectorStoreBase):
for doc, _, _ in docs_and_scores
]
def similar_search_with_scores(self, text, topk, score_threshold) -> List[Chunk]:
def similar_search_with_scores(
self,
text: str,
topk: int,
score_threshold: float,
filters: Optional[MetadataFilters] = None,
) -> List[Chunk]:
"""Perform a search on a query string and return results with score.
For more information about the search parameters, take a look at the pymilvus
@@ -401,15 +416,10 @@ class MilvusStore(VectorStoreBase):
https://milvus.io/api-reference/pymilvus/v2.2.6/Collection/search().md
Args:
embedding (List[float]): The embedding vector being searched.
k (int, optional): The amount of results to return. Defaults to 4.
param (dict): The search params for the specified index.
Defaults to None.
expr (str, optional): Filtering expression. Defaults to None.
timeout (int, optional): How long to wait before timeout error.
Defaults to None.
kwargs: Collection.search() keyword arguments.
text (str): The query text.
topk (int): The number of similar documents to return.
score_threshold (float): Optional, a floating point value between 0 and 1.
filters (Optional[MetadataFilters]): Optional, metadata filters.
Returns:
List[Tuple[Document, float]]: Result doc and score.
"""
@@ -427,7 +437,11 @@ class MilvusStore(VectorStoreBase):
if x.dtype == DataType.FLOAT_VECTOR or x.dtype == DataType.BINARY_VECTOR:
self.vector_field = x.name
_, docs_and_scores = self._search(text, topk)
# convert to milvus expr filter.
milvus_filter_expr = self.convert_metadata_filters(filters) if filters else None
_, docs_and_scores = self._search(
query=text, topk=topk, expr=milvus_filter_expr
)
if any(score < 0.0 or score > 1.0 for _, score, id in docs_and_scores):
logger.warning(
"similarity score need between" f" 0 and 1, got {docs_and_scores}"
@@ -462,6 +476,20 @@ class MilvusStore(VectorStoreBase):
timeout: Optional[int] = None,
**kwargs: Any,
):
"""Search in vector database.
Args:
query: query text.
k: topk.
param: search params.
expr: search expr.
partition_names: partition names.
round_decimal: round decimal.
timeout: timeout.
**kwargs: kwargs.
Returns:
Tuple[Document, float, int]: Result doc and score.
"""
self.col.load()
# use default index params.
if param is None:
@@ -495,7 +523,9 @@ class MilvusStore(VectorStoreBase):
result.id,
)
)
if len(ret) == 0:
logger.warning("No relevant docs were retrieved.")
return None, []
return ret[0], ret
def vector_name_exists(self):
@@ -523,6 +553,40 @@ class MilvusStore(VectorStoreBase):
logger.info(f"begin delete milvus ids: {ids}")
delete_ids = ids.split(",")
doc_ids = [int(doc_id) for doc_id in delete_ids]
delet_expr = f"{self.primary_field} in {doc_ids}"
self.col.delete(delet_expr)
delete_expr = f"{self.primary_field} in {doc_ids}"
self.col.delete(delete_expr)
return True
def convert_metadata_filters(self, filters: MetadataFilters) -> str:
"""Convert filter to milvus filters.
Args:
- filters: metadata filters.
Returns:
- metadata_filters: metadata filters.
"""
metadata_filters = []
for metadata_filter in filters.filters:
if isinstance(metadata_filter.value, str):
expr = (
f"{self.props_field}['{metadata_filter.key}'] "
f"{FilterOperator.EQ} '{metadata_filter.value}'"
)
metadata_filters.append(expr)
elif isinstance(metadata_filter.value, List):
expr = (
f"{self.props_field}['{metadata_filter.key}'] "
f"{FilterOperator.IN} {metadata_filter.value}"
)
metadata_filters.append(expr)
else:
expr = (
f"{self.props_field}['{metadata_filter.key}'] "
f"{FilterOperator.EQ} {str(metadata_filter.value)}"
)
metadata_filters.append(expr)
if len(metadata_filters) > 1:
metadata_filter_expr = f" {filters.condition} ".join(metadata_filters)
else:
metadata_filter_expr = metadata_filters[0]
return metadata_filter_expr

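On the Milvus side the same filters are compiled into an expr string over the new JSON props_field column: string values get quoted equality checks, list values use in, other values are compared unquoted, and multiple clauses are joined with the filter condition ("and" by default). The result for a two-filter example is expected to look roughly like this (hypothetical keys; assumes the operator and condition enums render as their string values):

# For MetadataFilters(filters=[
#     MetadataFilter(key="tag", operator=FilterOperator.IN, value=["paper", "report"]),
#     MetadataFilter(key="year", value=2022),
# ])
# MilvusStore.convert_metadata_filters is expected to produce:
expected_expr = (
    "props_field['tag'] in ['paper', 'report'] and props_field['year'] == 2022"
)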
View File

@@ -1,6 +1,6 @@
"""Postgres vector store."""
import logging
from typing import Any, List
from typing import List, Optional
from dbgpt._private.pydantic import Field
from dbgpt.core import Chunk
@@ -10,6 +10,7 @@ from dbgpt.storage.vector_store.base import (
VectorStoreBase,
VectorStoreConfig,
)
from dbgpt.storage.vector_store.filters import MetadataFilters
from dbgpt.util.i18n_utils import _
logger = logging.getLogger(__name__)
@@ -70,9 +71,11 @@ class PGVectorStore(VectorStoreBase):
connection_string=self.connection_string,
)
def similar_search(self, text: str, topk: int, **kwargs: Any) -> List[Chunk]:
def similar_search(
self, text: str, topk: int, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
"""Perform similar search in PGVector."""
return self.vector_store_client.similarity_search(text, topk)
return self.vector_store_client.similarity_search(text, topk, filters)
def vector_name_exists(self) -> bool:
"""Check if vector name exists."""

View File

@@ -1,7 +1,7 @@
"""Weaviate vector store."""
import logging
import os
from typing import List
from typing import List, Optional
from dbgpt._private.pydantic import Field
from dbgpt.core import Chunk
@@ -9,6 +9,7 @@ from dbgpt.core.awel.flow import Parameter, ResourceCategory, register_resource
from dbgpt.util.i18n_utils import _
from .base import _COMMON_PARAMETERS, VectorStoreBase, VectorStoreConfig
from .filters import MetadataFilters
logger = logging.getLogger(__name__)
@@ -80,7 +81,9 @@ class WeaviateStore(VectorStoreBase):
self.vector_store_client = weaviate.Client(self.weaviate_url)
def similar_search(self, text: str, topk: int) -> List[Chunk]:
def similar_search(
self, text: str, topk: int, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
"""Perform similar search in Weaviate."""
logger.info("Weaviate similar search")
# nearText = {