Add metadata and page_content filters of documents in AwaDB (#7862)

1. Add a metadata filter for documents.
2. Add a text filter on the page_content of documents.
3. Fix a bug in similarity_search_with_score.

Improvements and bug fixes for AwaDB.
Fixes the conflict in https://github.com/hwchase17/langchain/pull/7840
@rlancemartin @eyurtsev  Thanks!

---------

Co-authored-by: vincent <awadb.vincent@gmail.com>
This commit is contained in:
ljeagle 2023-07-18 22:50:17 +08:00 committed by GitHub
parent f1eaa9b626
commit 3902b85657
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 96 additions and 82 deletions

View File

@ -12,9 +12,6 @@ from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore from langchain.vectorstores.base import VectorStore
from langchain.vectorstores.utils import maximal_marginal_relevance from langchain.vectorstores.utils import maximal_marginal_relevance
# from pydantic import BaseModel, Field, root_validator
if TYPE_CHECKING: if TYPE_CHECKING:
import awadb import awadb
@ -36,12 +33,16 @@ class AwaDB(VectorStore):
**kwargs: Any, **kwargs: Any,
) -> None: ) -> None:
"""Initialize with AwaDB client. """Initialize with AwaDB client.
If table_name is not specified,
a random table name of `_DEFAULT_TABLE_NAME + last segment of uuid`
would be created automatically.
Args: Args:
table_name: Iterable of strings to add to the vectorstore. table_name: Name of the table created, default _DEFAULT_TABLE_NAME.
embedding: Optional list of metadatas associated with the texts. embedding: Optional Embeddings initially set.
log_and_data_dir: Optional whether to duplicate texts. log_and_data_dir: Optional the root directory of log and data.
client: Optional AwaDB client. client: Optional AwaDB client.
kwargs: any possible extend parameters in the future. kwargs: Any possible extend parameters in the future.
Returns: Returns:
None. None.
@ -83,7 +84,7 @@ class AwaDB(VectorStore):
Args: Args:
texts: Iterable of strings to add to the vectorstore. texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts. metadatas: Optional list of metadatas associated with the texts.
is_duplicate_texts: Optional whether to duplicate texts. is_duplicate_texts: Optional whether to duplicate texts. Defaults to True.
kwargs: any possible extend parameters in the future. kwargs: any possible extend parameters in the future.
Returns: Returns:
@ -131,6 +132,8 @@ class AwaDB(VectorStore):
self, self,
query: str, query: str,
k: int = DEFAULT_TOPN, k: int = DEFAULT_TOPN,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return docs most similar to query. """Return docs most similar to query.
@ -138,6 +141,13 @@ class AwaDB(VectorStore):
Args: Args:
query: Text query. query: Text query.
k: The maximum number of documents to return. k: The maximum number of documents to return.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
E.g. `{"color" : "red", "price": 4.20}`. Optional.
E.g. `{"max_price" : 15.66, "min_price": 4.20}`
`price` is the metadata field, means range filter(4.20<'price'<15.66).
E.g. `{"maxe_price" : 15.66, "mine_price": 4.20}`
`price` is the metadata field, means range filter(4.20<='price'<=15.66).
kwargs: Any possible extend parameters in the future. kwargs: Any possible extend parameters in the future.
Returns: Returns:
@ -158,13 +168,19 @@ class AwaDB(VectorStore):
not_include_fields: Set[str] = {"text_embedding", "_id", "score"} not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
return self.similarity_search_by_vector( return self.similarity_search_by_vector(
embedding, k, not_include_fields_in_metadata=not_include_fields embedding,
k,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
not_include_fields_in_metadata=not_include_fields,
) )
def similarity_search_with_score( def similarity_search_with_score(
self, self,
query: str, query: str,
k: int = DEFAULT_TOPN, k: int = DEFAULT_TOPN,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""The most k similar documents and scores of the specified query. """The most k similar documents and scores of the specified query.
@ -172,6 +188,8 @@ class AwaDB(VectorStore):
Args: Args:
query: Text query. query: Text query.
k: The k most similar documents to the text query. k: The k most similar documents to the text query.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter: Filter by metadata. Defaults to None.
kwargs: Any possible extend parameters in the future. kwargs: Any possible extend parameters in the future.
Returns: Returns:
@ -193,78 +211,37 @@ class AwaDB(VectorStore):
results: List[Tuple[Document, float]] = [] results: List[Tuple[Document, float]] = []
dists: List[float] = [] not_include_fields: Set[str] = {"text_embedding", "_id"}
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
retrieval_docs = self.similarity_search_by_vector( retrieval_docs = self.similarity_search_by_vector(
embedding, embedding,
k, k,
scores=dists, text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
not_include_fields_in_metadata=not_include_fields, not_include_fields_in_metadata=not_include_fields,
) )
doc_no = 0
for doc in retrieval_docs: for doc in retrieval_docs:
doc_tuple = (doc, dists[doc_no]) score = doc.metadata["score"]
del doc.metadata["score"]
doc_tuple = (doc, score)
results.append(doc_tuple) results.append(doc_tuple)
doc_no = doc_no + 1
return results return results
def similarity_search_with_relevance_scores( def _similarity_search_with_relevance_scores(
self, self,
query: str, query: str,
k: int = DEFAULT_TOPN, k: int = 4,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Return docs and relevance scores return self.similarity_search_with_score(query, k, **kwargs)
which denote the InnerProduct distance, range from 0 to 1.
Args:
query: Text query.
k: Number of the most similar documents to return. Defaults to 4.
Returns:
List of (Document, relevance_score) tuples similar to the text query.
Note that relevance_score ranged from 0 to 1.
0 is dissimilar, 1 is the most similar.
"""
if self.awadb_client is None:
raise ValueError("AwaDB client is None!!!")
embedding = None
if self.using_table_name in self.table2embeddings:
embedding = self.table2embeddings[self.using_table_name].embed_query(query)
show_results = self.awadb_client.Search(embedding, k)
results: List[Tuple[Document, float]] = []
if show_results.__len__() == 0:
return results
dists: List[float] = []
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
retrieval_docs = self.similarity_search_by_vector(
embedding,
k,
scores=dists,
not_include_fields_in_metadata=not_include_fields,
)
doc_no = 0
for doc in retrieval_docs:
doc_tuple = (doc, dists[doc_no])
results.append(doc_tuple)
doc_no = doc_no + 1
return results
def similarity_search_by_vector( def similarity_search_by_vector(
self, self,
embedding: Optional[List[float]] = None, embedding: Optional[List[float]] = None,
k: int = DEFAULT_TOPN, k: int = DEFAULT_TOPN,
scores: Optional[list] = None, text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
not_include_fields_in_metadata: Optional[Set[str]] = None, not_include_fields_in_metadata: Optional[Set[str]] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
@ -273,7 +250,8 @@ class AwaDB(VectorStore):
Args: Args:
embedding: Embedding to look up documents similar to. embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4. k: Number of Documents to return. Defaults to 4.
scores: Scores for retrieved docs. text_in_page_content: Filter by the text in page_content of Document.
meta_filter: Filter by metadata. Defaults to None.
not_incude_fields_in_metadata: Not include meta fields of each document. not_incude_fields_in_metadata: Not include meta fields of each document.
Returns: Returns:
@ -289,7 +267,11 @@ class AwaDB(VectorStore):
return results return results
show_results = self.awadb_client.Search( show_results = self.awadb_client.Search(
embedding, k, not_include_fields=not_include_fields_in_metadata embedding,
k,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
not_include_fields=not_include_fields_in_metadata,
) )
if show_results.__len__() == 0: if show_results.__len__() == 0:
@ -302,10 +284,6 @@ class AwaDB(VectorStore):
if item_key == "embedding_text": if item_key == "embedding_text":
content = item_detail[item_key] content = item_detail[item_key]
continue continue
elif item_key == "score":
if scores is not None:
scores.append(item_detail[item_key])
continue
elif not_include_fields_in_metadata is not None: elif not_include_fields_in_metadata is not None:
if item_key in not_include_fields_in_metadata: if item_key in not_include_fields_in_metadata:
continue continue
@ -319,6 +297,8 @@ class AwaDB(VectorStore):
k: int = 4, k: int = 4,
fetch_k: int = 20, fetch_k: int = 20,
lambda_mult: float = 0.5, lambda_mult: float = 0.5,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return docs selected using the maximal marginal relevance. """Return docs selected using the maximal marginal relevance.
@ -334,6 +314,9 @@ class AwaDB(VectorStore):
of diversity among the results with 0 corresponding of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity. to maximum diversity and 1 to minimum diversity.
Defaults to 0.5. Defaults to 0.5.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents selected by maximal marginal relevance. List of Documents selected by maximal marginal relevance.
""" """
@ -353,7 +336,12 @@ class AwaDB(VectorStore):
return [] return []
results = self.max_marginal_relevance_search_by_vector( results = self.max_marginal_relevance_search_by_vector(
embedding, k, fetch_k, lambda_mult=lambda_mult embedding,
k,
fetch_k,
lambda_mult=lambda_mult,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
) )
return results return results
@ -363,6 +351,8 @@ class AwaDB(VectorStore):
k: int = 4, k: int = 4,
fetch_k: int = 20, fetch_k: int = 20,
lambda_mult: float = 0.5, lambda_mult: float = 0.5,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return docs selected using the maximal marginal relevance. """Return docs selected using the maximal marginal relevance.
@ -378,6 +368,9 @@ class AwaDB(VectorStore):
of diversity among the results with 0 corresponding of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity. to maximum diversity and 1 to minimum diversity.
Defaults to 0.5. Defaults to 0.5.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
Returns: Returns:
List of Documents selected by maximal marginal relevance. List of Documents selected by maximal marginal relevance.
""" """
@ -392,7 +385,11 @@ class AwaDB(VectorStore):
not_include_fields: set = {"_id", "score"} not_include_fields: set = {"_id", "score"}
retrieved_docs = self.similarity_search_by_vector( retrieved_docs = self.similarity_search_by_vector(
embedding, fetch_k, not_include_fields_in_metadata=not_include_fields embedding,
fetch_k,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
not_include_fields_in_metadata=not_include_fields,
) )
top_embeddings = [] top_embeddings = []
@ -412,29 +409,43 @@ class AwaDB(VectorStore):
def get( def get(
self, self,
ids: List[str], ids: Optional[List[str]] = None,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
not_include_fields: Optional[Set[str]] = None, not_include_fields: Optional[Set[str]] = None,
limit: Optional[int] = None,
**kwargs: Any, **kwargs: Any,
) -> Dict[str, Document]: ) -> Dict[str, Document]:
"""Return docs according ids. """Return docs according ids.
Args: Args:
ids: The ids of the embedding vectors. ids: The ids of the embedding vectors.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter: Filter by any metadata of the document.
not_include_fields: Not pack the specified fields of each document.
limit: The number of documents to return. Defaults to 5. Optional.
Returns: Returns:
Documents which have the ids. Documents which satisfy the input conditions.
""" """
if self.awadb_client is None: if self.awadb_client is None:
raise ValueError("AwaDB client is None!!!") raise ValueError("AwaDB client is None!!!")
docs_detail = self.awadb_client.Get(ids, not_include_fields=not_include_fields) docs_detail = self.awadb_client.Get(
ids=ids,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
not_include_fields=not_include_fields,
limit=limit,
)
results: Dict[str, Document] = {} results: Dict[str, Document] = {}
for doc_detail in docs_detail: for doc_detail in docs_detail:
content = "" content = ""
meta_info = {} meta_info = {}
for field in doc_detail: for field in doc_detail:
if field == "embeddint_text": if field == "embedding_text":
content = doc_detail[field] content = doc_detail[field]
continue continue
elif field == "text_embedding" or field == "_id": elif field == "text_embedding" or field == "_id":

21
poetry.lock generated
View File

@ -635,19 +635,22 @@ test = ["coverage (>=5,<6)", "pytest (>=6,<7)"]
[[package]] [[package]]
name = "awadb" name = "awadb"
version = "0.3.6" version = "0.3.7"
description = "The AI Native database for embedding vectors" description = "AI Native database for embedding vectors"
category = "main" category = "main"
optional = true optional = true
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "awadb-0.3.6-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6154f73aab9996aefe8c8f8bf754f7182d109d6b60302c9f31666c7f50cc7aca"}, {file = "awadb-0.3.7-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:deec44f3687aad3ef13ba3fac3e2e4fd83c710a30194d228b3f520d2fb013542"},
{file = "awadb-0.3.6-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:9d7e9dff353517595ecc8c9395a2367acdcfc83c68a64dd4785c8d366eed3f40"}, {file = "awadb-0.3.7-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:96e1e92e029f4b1000631bc06d6f845d554e4698851e515eafd35ff4f2b1994f"},
{file = "awadb-0.3.6-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:475af75d2ffbbe970999d93fbabdf7281797390c66fe852f6a6989e706b90c94"}, {file = "awadb-0.3.7-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:46d44c4e8d97aaeb73234a2b9945b393b91dfaeae98e9fc6632ffb64bbc9e995"},
{file = "awadb-0.3.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:304be1de63daec1555f0fe9de9a18cdf16a467687a35a6ccf3405cd400fefb48"}, {file = "awadb-0.3.7-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:d5e379ea2f9f44687edb99c1d35719d1bed116759f800d212d9561cef99736a3"},
{file = "awadb-0.3.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:176cc27d1afc4aad758515d5f8fb435f555c9ba827a9e84d6f28b1c6ac568965"}, {file = "awadb-0.3.7-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:f2de28efa210035d278a55466023b44b8479254f3d5de69c944e6a3fcfb97879"},
{file = "awadb-0.3.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:211d7f6b0f7c3c3d7518d424f0f3dfac5f45f9e5d7bbf397fdae861ff0dc46fd"}, {file = "awadb-0.3.7-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7a2a3f063a133c945e12ea0ab9d9c7ab038c8255dbd867067dba0a513557124b"},
{file = "awadb-0.3.6-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:b935ab4ffaa3bcbcc9a381fce91ace5940143b527ffdb467dd4bc630cd94afab"}, {file = "awadb-0.3.7-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:15a8f3349ea84bdfc9c096b3760cf00363eb1908e71728b4a1e3beecda763cd5"},
{file = "awadb-0.3.7-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:3ac1ef7e1d1a591eb36a57ea65824e717f54fc521e4ae303d634e510817ba270"},
{file = "awadb-0.3.7-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:947b6576a07e90cac31d8ff709cd0e0abc9779bc71276df817b2ffe18c1fa541"},
{file = "awadb-0.3.7-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:e4014edade8134579993639a4a3b18ff20c8449fdfc5ff511b24617109be5df7"},
] ]
[package.extras] [package.extras]