Add metadata and page_content filters of documents in AwaDB (#7862)

1. Add a metadata filter for documents.
2. Add a page_content text filter for documents.
3. Fix a bug in similarity_search_with_score.

Improvements and bug fixes for AwaDB.
Fixes the conflict in https://github.com/hwchase17/langchain/pull/7840
@rlancemartin @eyurtsev  Thanks!

---------

Co-authored-by: vincent <awadb.vincent@gmail.com>
This commit is contained in:
ljeagle 2023-07-18 22:50:17 +08:00 committed by GitHub
parent f1eaa9b626
commit 3902b85657
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 96 additions and 82 deletions

View File

@ -12,9 +12,6 @@ from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore
from langchain.vectorstores.utils import maximal_marginal_relevance
# from pydantic import BaseModel, Field, root_validator
if TYPE_CHECKING:
import awadb
@ -36,12 +33,16 @@ class AwaDB(VectorStore):
**kwargs: Any,
) -> None:
"""Initialize with AwaDB client.
If table_name is not specified,
a random table name of `_DEFAULT_TABLE_NAME + last segment of uuid`
would be created automatically.
Args:
table_name: Iterable of strings to add to the vectorstore.
embedding: Optional list of metadatas associated with the texts.
log_and_data_dir: Optional whether to duplicate texts.
table_name: Name of the table created, default _DEFAULT_TABLE_NAME.
embedding: Optional Embeddings initially set.
log_and_data_dir: Optional the root directory of log and data.
client: Optional AwaDB client.
kwargs: any possible extend parameters in the future.
kwargs: Any possible extend parameters in the future.
Returns:
None.
@ -83,7 +84,7 @@ class AwaDB(VectorStore):
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
is_duplicate_texts: Optional whether to duplicate texts.
is_duplicate_texts: Optional whether to duplicate texts. Defaults to True.
kwargs: any possible extend parameters in the future.
Returns:
@ -131,6 +132,8 @@ class AwaDB(VectorStore):
self,
query: str,
k: int = DEFAULT_TOPN,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to query.
@ -138,6 +141,13 @@ class AwaDB(VectorStore):
Args:
query: Text query.
k: The maximum number of documents to return.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
E.g. `{"color" : "red", "price": 4.20}`. Optional.
E.g. `{"max_price" : 15.66, "min_price": 4.20}`
where `price` is the metadata field: an exclusive range filter (4.20 < price < 15.66).
E.g. `{"maxe_price" : 15.66, "mine_price": 4.20}`
where `price` is the metadata field: an inclusive range filter (4.20 <= price <= 15.66).
kwargs: Any possible extend parameters in the future.
Returns:
@ -158,13 +168,19 @@ class AwaDB(VectorStore):
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
return self.similarity_search_by_vector(
embedding, k, not_include_fields_in_metadata=not_include_fields
embedding,
k,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
not_include_fields_in_metadata=not_include_fields,
)
def similarity_search_with_score(
self,
query: str,
k: int = DEFAULT_TOPN,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""The most k similar documents and scores of the specified query.
@ -172,6 +188,8 @@ class AwaDB(VectorStore):
Args:
query: Text query.
k: The k most similar documents to the text query.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter: Filter by metadata. Defaults to None.
kwargs: Any possible extend parameters in the future.
Returns:
@ -193,78 +211,37 @@ class AwaDB(VectorStore):
results: List[Tuple[Document, float]] = []
dists: List[float] = []
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
not_include_fields: Set[str] = {"text_embedding", "_id"}
retrieval_docs = self.similarity_search_by_vector(
embedding,
k,
scores=dists,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
not_include_fields_in_metadata=not_include_fields,
)
doc_no = 0
for doc in retrieval_docs:
doc_tuple = (doc, dists[doc_no])
score = doc.metadata["score"]
del doc.metadata["score"]
doc_tuple = (doc, score)
results.append(doc_tuple)
doc_no = doc_no + 1
return results
def similarity_search_with_relevance_scores(
def _similarity_search_with_relevance_scores(
self,
query: str,
k: int = DEFAULT_TOPN,
k: int = 4,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs and relevance scores
which denote the InnerProduct distance, range from 0 to 1.
Args:
query: Text query.
k: Number of the most similar documents to return. Defaults to 4.
Returns:
List of (Document, relevance_score) tuples similar to the text query.
Note that relevance_score ranged from 0 to 1.
0 is dissimilar, 1 is the most similar.
"""
if self.awadb_client is None:
raise ValueError("AwaDB client is None!!!")
embedding = None
if self.using_table_name in self.table2embeddings:
embedding = self.table2embeddings[self.using_table_name].embed_query(query)
show_results = self.awadb_client.Search(embedding, k)
results: List[Tuple[Document, float]] = []
if show_results.__len__() == 0:
return results
dists: List[float] = []
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
retrieval_docs = self.similarity_search_by_vector(
embedding,
k,
scores=dists,
not_include_fields_in_metadata=not_include_fields,
)
doc_no = 0
for doc in retrieval_docs:
doc_tuple = (doc, dists[doc_no])
results.append(doc_tuple)
doc_no = doc_no + 1
return results
return self.similarity_search_with_score(query, k, **kwargs)
def similarity_search_by_vector(
self,
embedding: Optional[List[float]] = None,
k: int = DEFAULT_TOPN,
scores: Optional[list] = None,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
not_include_fields_in_metadata: Optional[Set[str]] = None,
**kwargs: Any,
) -> List[Document]:
@ -273,7 +250,8 @@ class AwaDB(VectorStore):
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
scores: Scores for retrieved docs.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter: Filter by metadata. Defaults to None.
not_include_fields_in_metadata: Metadata fields to exclude from each document.
Returns:
@ -289,7 +267,11 @@ class AwaDB(VectorStore):
return results
show_results = self.awadb_client.Search(
embedding, k, not_include_fields=not_include_fields_in_metadata
embedding,
k,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
not_include_fields=not_include_fields_in_metadata,
)
if show_results.__len__() == 0:
@ -302,10 +284,6 @@ class AwaDB(VectorStore):
if item_key == "embedding_text":
content = item_detail[item_key]
continue
elif item_key == "score":
if scores is not None:
scores.append(item_detail[item_key])
continue
elif not_include_fields_in_metadata is not None:
if item_key in not_include_fields_in_metadata:
continue
@ -319,6 +297,8 @@ class AwaDB(VectorStore):
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@ -334,6 +314,9 @@ class AwaDB(VectorStore):
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
Returns:
List of Documents selected by maximal marginal relevance.
"""
@ -353,7 +336,12 @@ class AwaDB(VectorStore):
return []
results = self.max_marginal_relevance_search_by_vector(
embedding, k, fetch_k, lambda_mult=lambda_mult
embedding,
k,
fetch_k,
lambda_mult=lambda_mult,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
)
return results
@ -363,6 +351,8 @@ class AwaDB(VectorStore):
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@ -378,6 +368,9 @@ class AwaDB(VectorStore):
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
Returns:
List of Documents selected by maximal marginal relevance.
"""
@ -392,7 +385,11 @@ class AwaDB(VectorStore):
not_include_fields: set = {"_id", "score"}
retrieved_docs = self.similarity_search_by_vector(
embedding, fetch_k, not_include_fields_in_metadata=not_include_fields
embedding,
fetch_k,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
not_include_fields_in_metadata=not_include_fields,
)
top_embeddings = []
@ -412,29 +409,43 @@ class AwaDB(VectorStore):
def get(
self,
ids: List[str],
ids: Optional[List[str]] = None,
text_in_page_content: Optional[str] = None,
meta_filter: Optional[dict] = None,
not_include_fields: Optional[Set[str]] = None,
limit: Optional[int] = None,
**kwargs: Any,
) -> Dict[str, Document]:
Return docs according to the given ids and filter conditions.
Args:
ids: The ids of the embedding vectors.
text_in_page_content: Filter by the text in page_content of Document.
meta_filter: Filter by any metadata of the document.
not_include_fields: Not pack the specified fields of each document.
limit: The number of documents to return. Defaults to 5. Optional.
Returns:
Documents which have the ids.
Documents which satisfy the input conditions.
"""
if self.awadb_client is None:
raise ValueError("AwaDB client is None!!!")
docs_detail = self.awadb_client.Get(ids, not_include_fields=not_include_fields)
docs_detail = self.awadb_client.Get(
ids=ids,
text_in_page_content=text_in_page_content,
meta_filter=meta_filter,
not_include_fields=not_include_fields,
limit=limit,
)
results: Dict[str, Document] = {}
for doc_detail in docs_detail:
content = ""
meta_info = {}
for field in doc_detail:
if field == "embeddint_text":
if field == "embedding_text":
content = doc_detail[field]
continue
elif field == "text_embedding" or field == "_id":

21
poetry.lock generated
View File

@ -635,19 +635,22 @@ test = ["coverage (>=5,<6)", "pytest (>=6,<7)"]
[[package]]
name = "awadb"
version = "0.3.6"
description = "The AI Native database for embedding vectors"
version = "0.3.7"
description = "AI Native database for embedding vectors"
category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "awadb-0.3.6-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6154f73aab9996aefe8c8f8bf754f7182d109d6b60302c9f31666c7f50cc7aca"},
{file = "awadb-0.3.6-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:9d7e9dff353517595ecc8c9395a2367acdcfc83c68a64dd4785c8d366eed3f40"},
{file = "awadb-0.3.6-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:475af75d2ffbbe970999d93fbabdf7281797390c66fe852f6a6989e706b90c94"},
{file = "awadb-0.3.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:304be1de63daec1555f0fe9de9a18cdf16a467687a35a6ccf3405cd400fefb48"},
{file = "awadb-0.3.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:176cc27d1afc4aad758515d5f8fb435f555c9ba827a9e84d6f28b1c6ac568965"},
{file = "awadb-0.3.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:211d7f6b0f7c3c3d7518d424f0f3dfac5f45f9e5d7bbf397fdae861ff0dc46fd"},
{file = "awadb-0.3.6-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:b935ab4ffaa3bcbcc9a381fce91ace5940143b527ffdb467dd4bc630cd94afab"},
{file = "awadb-0.3.7-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:deec44f3687aad3ef13ba3fac3e2e4fd83c710a30194d228b3f520d2fb013542"},
{file = "awadb-0.3.7-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:96e1e92e029f4b1000631bc06d6f845d554e4698851e515eafd35ff4f2b1994f"},
{file = "awadb-0.3.7-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:46d44c4e8d97aaeb73234a2b9945b393b91dfaeae98e9fc6632ffb64bbc9e995"},
{file = "awadb-0.3.7-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:d5e379ea2f9f44687edb99c1d35719d1bed116759f800d212d9561cef99736a3"},
{file = "awadb-0.3.7-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:f2de28efa210035d278a55466023b44b8479254f3d5de69c944e6a3fcfb97879"},
{file = "awadb-0.3.7-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7a2a3f063a133c945e12ea0ab9d9c7ab038c8255dbd867067dba0a513557124b"},
{file = "awadb-0.3.7-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:15a8f3349ea84bdfc9c096b3760cf00363eb1908e71728b4a1e3beecda763cd5"},
{file = "awadb-0.3.7-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:3ac1ef7e1d1a591eb36a57ea65824e717f54fc521e4ae303d634e510817ba270"},
{file = "awadb-0.3.7-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:947b6576a07e90cac31d8ff709cd0e0abc9779bc71276df817b2ffe18c1fa541"},
{file = "awadb-0.3.7-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:e4014edade8134579993639a4a3b18ff20c8449fdfc5ff511b24617109be5df7"},
]
[package.extras]