mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-16 08:06:14 +00:00
Add metadata and page_content filters of documents in AwaDB (#7862)
1. Add a metadata filter for documents. 2. Add a page_content text filter for documents. 3. Fix a bug in similarity_search_with_score. Improvements and bug fixes for AwaDB. Fix the conflict: https://github.com/hwchase17/langchain/pull/7840 @rlancemartin @eyurtsev Thanks! --------- Co-authored-by: vincent <awadb.vincent@gmail.com>
This commit is contained in:
parent
f1eaa9b626
commit
3902b85657
@ -12,9 +12,6 @@ from langchain.embeddings.base import Embeddings
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
from langchain.vectorstores.utils import maximal_marginal_relevance
|
||||
|
||||
# from pydantic import BaseModel, Field, root_validator
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import awadb
|
||||
|
||||
@ -36,12 +33,16 @@ class AwaDB(VectorStore):
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize with AwaDB client.
|
||||
If table_name is not specified,
|
||||
a random table name of `_DEFAULT_TABLE_NAME + last segment of uuid`
|
||||
would be created automatically.
|
||||
|
||||
Args:
|
||||
table_name: Iterable of strings to add to the vectorstore.
|
||||
embedding: Optional list of metadatas associated with the texts.
|
||||
log_and_data_dir: Optional whether to duplicate texts.
|
||||
table_name: Name of the table created, default _DEFAULT_TABLE_NAME.
|
||||
embedding: Optional Embeddings initially set.
|
||||
log_and_data_dir: Optional the root directory of log and data.
|
||||
client: Optional AwaDB client.
|
||||
kwargs: any possible extend parameters in the future.
|
||||
kwargs: Any possible extend parameters in the future.
|
||||
|
||||
Returns:
|
||||
None.
|
||||
@ -83,7 +84,7 @@ class AwaDB(VectorStore):
|
||||
Args:
|
||||
texts: Iterable of strings to add to the vectorstore.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
is_duplicate_texts: Optional whether to duplicate texts.
|
||||
is_duplicate_texts: Optional whether to duplicate texts. Defaults to True.
|
||||
kwargs: any possible extend parameters in the future.
|
||||
|
||||
Returns:
|
||||
@ -131,6 +132,8 @@ class AwaDB(VectorStore):
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_TOPN,
|
||||
text_in_page_content: Optional[str] = None,
|
||||
meta_filter: Optional[dict] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs most similar to query.
|
||||
@ -138,6 +141,13 @@ class AwaDB(VectorStore):
|
||||
Args:
|
||||
query: Text query.
|
||||
k: The maximum number of documents to return.
|
||||
text_in_page_content: Filter by the text in page_content of Document.
|
||||
meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
|
||||
E.g. `{"color" : "red", "price": 4.20}`. Optional.
|
||||
E.g. `{"max_price" : 15.66, "min_price": 4.20}`
|
||||
`price` is the metadata field, means range filter(4.20<'price'<15.66).
|
||||
E.g. `{"maxe_price" : 15.66, "mine_price": 4.20}`
|
||||
`price` is the metadata field, means range filter(4.20<='price'<=15.66).
|
||||
kwargs: Any possible extend parameters in the future.
|
||||
|
||||
Returns:
|
||||
@ -158,13 +168,19 @@ class AwaDB(VectorStore):
|
||||
|
||||
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
|
||||
return self.similarity_search_by_vector(
|
||||
embedding, k, not_include_fields_in_metadata=not_include_fields
|
||||
embedding,
|
||||
k,
|
||||
text_in_page_content=text_in_page_content,
|
||||
meta_filter=meta_filter,
|
||||
not_include_fields_in_metadata=not_include_fields,
|
||||
)
|
||||
|
||||
def similarity_search_with_score(
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_TOPN,
|
||||
text_in_page_content: Optional[str] = None,
|
||||
meta_filter: Optional[dict] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""The most k similar documents and scores of the specified query.
|
||||
@ -172,6 +188,8 @@ class AwaDB(VectorStore):
|
||||
Args:
|
||||
query: Text query.
|
||||
k: The k most similar documents to the text query.
|
||||
text_in_page_content: Filter by the text in page_content of Document.
|
||||
meta_filter: Filter by metadata. Defaults to None.
|
||||
kwargs: Any possible extend parameters in the future.
|
||||
|
||||
Returns:
|
||||
@ -193,78 +211,37 @@ class AwaDB(VectorStore):
|
||||
|
||||
results: List[Tuple[Document, float]] = []
|
||||
|
||||
dists: List[float] = []
|
||||
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
|
||||
not_include_fields: Set[str] = {"text_embedding", "_id"}
|
||||
retrieval_docs = self.similarity_search_by_vector(
|
||||
embedding,
|
||||
k,
|
||||
scores=dists,
|
||||
text_in_page_content=text_in_page_content,
|
||||
meta_filter=meta_filter,
|
||||
not_include_fields_in_metadata=not_include_fields,
|
||||
)
|
||||
|
||||
doc_no = 0
|
||||
for doc in retrieval_docs:
|
||||
doc_tuple = (doc, dists[doc_no])
|
||||
score = doc.metadata["score"]
|
||||
del doc.metadata["score"]
|
||||
doc_tuple = (doc, score)
|
||||
results.append(doc_tuple)
|
||||
doc_no = doc_no + 1
|
||||
|
||||
return results
|
||||
|
||||
def similarity_search_with_relevance_scores(
|
||||
def _similarity_search_with_relevance_scores(
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_TOPN,
|
||||
k: int = 4,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return docs and relevance scores
|
||||
which denote the InnerProduct distance, range from 0 to 1.
|
||||
|
||||
Args:
|
||||
query: Text query.
|
||||
k: Number of the most similar documents to return. Defaults to 4.
|
||||
|
||||
Returns:
|
||||
List of (Document, relevance_score) tuples similar to the text query.
|
||||
Note that relevance_score ranged from 0 to 1.
|
||||
0 is dissimilar, 1 is the most similar.
|
||||
"""
|
||||
|
||||
if self.awadb_client is None:
|
||||
raise ValueError("AwaDB client is None!!!")
|
||||
|
||||
embedding = None
|
||||
if self.using_table_name in self.table2embeddings:
|
||||
embedding = self.table2embeddings[self.using_table_name].embed_query(query)
|
||||
|
||||
show_results = self.awadb_client.Search(embedding, k)
|
||||
|
||||
results: List[Tuple[Document, float]] = []
|
||||
|
||||
if show_results.__len__() == 0:
|
||||
return results
|
||||
|
||||
dists: List[float] = []
|
||||
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
|
||||
retrieval_docs = self.similarity_search_by_vector(
|
||||
embedding,
|
||||
k,
|
||||
scores=dists,
|
||||
not_include_fields_in_metadata=not_include_fields,
|
||||
)
|
||||
|
||||
doc_no = 0
|
||||
for doc in retrieval_docs:
|
||||
doc_tuple = (doc, dists[doc_no])
|
||||
results.append(doc_tuple)
|
||||
doc_no = doc_no + 1
|
||||
|
||||
return results
|
||||
return self.similarity_search_with_score(query, k, **kwargs)
|
||||
|
||||
def similarity_search_by_vector(
|
||||
self,
|
||||
embedding: Optional[List[float]] = None,
|
||||
k: int = DEFAULT_TOPN,
|
||||
scores: Optional[list] = None,
|
||||
text_in_page_content: Optional[str] = None,
|
||||
meta_filter: Optional[dict] = None,
|
||||
not_include_fields_in_metadata: Optional[Set[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
@ -273,7 +250,8 @@ class AwaDB(VectorStore):
|
||||
Args:
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
scores: Scores for retrieved docs.
|
||||
text_in_page_content: Filter by the text in page_content of Document.
|
||||
meta_filter: Filter by metadata. Defaults to None.
|
||||
not_incude_fields_in_metadata: Not include meta fields of each document.
|
||||
|
||||
Returns:
|
||||
@ -289,7 +267,11 @@ class AwaDB(VectorStore):
|
||||
return results
|
||||
|
||||
show_results = self.awadb_client.Search(
|
||||
embedding, k, not_include_fields=not_include_fields_in_metadata
|
||||
embedding,
|
||||
k,
|
||||
text_in_page_content=text_in_page_content,
|
||||
meta_filter=meta_filter,
|
||||
not_include_fields=not_include_fields_in_metadata,
|
||||
)
|
||||
|
||||
if show_results.__len__() == 0:
|
||||
@ -302,10 +284,6 @@ class AwaDB(VectorStore):
|
||||
if item_key == "embedding_text":
|
||||
content = item_detail[item_key]
|
||||
continue
|
||||
elif item_key == "score":
|
||||
if scores is not None:
|
||||
scores.append(item_detail[item_key])
|
||||
continue
|
||||
elif not_include_fields_in_metadata is not None:
|
||||
if item_key in not_include_fields_in_metadata:
|
||||
continue
|
||||
@ -319,6 +297,8 @@ class AwaDB(VectorStore):
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
text_in_page_content: Optional[str] = None,
|
||||
meta_filter: Optional[dict] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
@ -334,6 +314,9 @@ class AwaDB(VectorStore):
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
text_in_page_content: Filter by the text in page_content of Document.
|
||||
meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
@ -353,7 +336,12 @@ class AwaDB(VectorStore):
|
||||
return []
|
||||
|
||||
results = self.max_marginal_relevance_search_by_vector(
|
||||
embedding, k, fetch_k, lambda_mult=lambda_mult
|
||||
embedding,
|
||||
k,
|
||||
fetch_k,
|
||||
lambda_mult=lambda_mult,
|
||||
text_in_page_content=text_in_page_content,
|
||||
meta_filter=meta_filter,
|
||||
)
|
||||
return results
|
||||
|
||||
@ -363,6 +351,8 @@ class AwaDB(VectorStore):
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
text_in_page_content: Optional[str] = None,
|
||||
meta_filter: Optional[dict] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
@ -378,6 +368,9 @@ class AwaDB(VectorStore):
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
text_in_page_content: Filter by the text in page_content of Document.
|
||||
meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
@ -392,7 +385,11 @@ class AwaDB(VectorStore):
|
||||
|
||||
not_include_fields: set = {"_id", "score"}
|
||||
retrieved_docs = self.similarity_search_by_vector(
|
||||
embedding, fetch_k, not_include_fields_in_metadata=not_include_fields
|
||||
embedding,
|
||||
fetch_k,
|
||||
text_in_page_content=text_in_page_content,
|
||||
meta_filter=meta_filter,
|
||||
not_include_fields_in_metadata=not_include_fields,
|
||||
)
|
||||
|
||||
top_embeddings = []
|
||||
@ -412,29 +409,43 @@ class AwaDB(VectorStore):
|
||||
|
||||
def get(
|
||||
self,
|
||||
ids: List[str],
|
||||
ids: Optional[List[str]] = None,
|
||||
text_in_page_content: Optional[str] = None,
|
||||
meta_filter: Optional[dict] = None,
|
||||
not_include_fields: Optional[Set[str]] = None,
|
||||
limit: Optional[int] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Document]:
|
||||
"""Return docs according ids.
|
||||
|
||||
Args:
|
||||
ids: The ids of the embedding vectors.
|
||||
text_in_page_content: Filter by the text in page_content of Document.
|
||||
meta_filter: Filter by any metadata of the document.
|
||||
not_include_fields: Not pack the specified fields of each document.
|
||||
limit: The number of documents to return. Defaults to 5. Optional.
|
||||
|
||||
Returns:
|
||||
Documents which have the ids.
|
||||
Documents which satisfy the input conditions.
|
||||
"""
|
||||
|
||||
if self.awadb_client is None:
|
||||
raise ValueError("AwaDB client is None!!!")
|
||||
|
||||
docs_detail = self.awadb_client.Get(ids, not_include_fields=not_include_fields)
|
||||
docs_detail = self.awadb_client.Get(
|
||||
ids=ids,
|
||||
text_in_page_content=text_in_page_content,
|
||||
meta_filter=meta_filter,
|
||||
not_include_fields=not_include_fields,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
results: Dict[str, Document] = {}
|
||||
for doc_detail in docs_detail:
|
||||
content = ""
|
||||
meta_info = {}
|
||||
for field in doc_detail:
|
||||
if field == "embeddint_text":
|
||||
if field == "embedding_text":
|
||||
content = doc_detail[field]
|
||||
continue
|
||||
elif field == "text_embedding" or field == "_id":
|
||||
|
21
poetry.lock
generated
21
poetry.lock
generated
@ -635,19 +635,22 @@ test = ["coverage (>=5,<6)", "pytest (>=6,<7)"]
|
||||
|
||||
[[package]]
|
||||
name = "awadb"
|
||||
version = "0.3.6"
|
||||
description = "The AI Native database for embedding vectors"
|
||||
version = "0.3.7"
|
||||
description = "AI Native database for embedding vectors"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "awadb-0.3.6-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6154f73aab9996aefe8c8f8bf754f7182d109d6b60302c9f31666c7f50cc7aca"},
|
||||
{file = "awadb-0.3.6-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:9d7e9dff353517595ecc8c9395a2367acdcfc83c68a64dd4785c8d366eed3f40"},
|
||||
{file = "awadb-0.3.6-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:475af75d2ffbbe970999d93fbabdf7281797390c66fe852f6a6989e706b90c94"},
|
||||
{file = "awadb-0.3.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:304be1de63daec1555f0fe9de9a18cdf16a467687a35a6ccf3405cd400fefb48"},
|
||||
{file = "awadb-0.3.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:176cc27d1afc4aad758515d5f8fb435f555c9ba827a9e84d6f28b1c6ac568965"},
|
||||
{file = "awadb-0.3.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:211d7f6b0f7c3c3d7518d424f0f3dfac5f45f9e5d7bbf397fdae861ff0dc46fd"},
|
||||
{file = "awadb-0.3.6-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:b935ab4ffaa3bcbcc9a381fce91ace5940143b527ffdb467dd4bc630cd94afab"},
|
||||
{file = "awadb-0.3.7-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:deec44f3687aad3ef13ba3fac3e2e4fd83c710a30194d228b3f520d2fb013542"},
|
||||
{file = "awadb-0.3.7-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:96e1e92e029f4b1000631bc06d6f845d554e4698851e515eafd35ff4f2b1994f"},
|
||||
{file = "awadb-0.3.7-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:46d44c4e8d97aaeb73234a2b9945b393b91dfaeae98e9fc6632ffb64bbc9e995"},
|
||||
{file = "awadb-0.3.7-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:d5e379ea2f9f44687edb99c1d35719d1bed116759f800d212d9561cef99736a3"},
|
||||
{file = "awadb-0.3.7-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:f2de28efa210035d278a55466023b44b8479254f3d5de69c944e6a3fcfb97879"},
|
||||
{file = "awadb-0.3.7-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7a2a3f063a133c945e12ea0ab9d9c7ab038c8255dbd867067dba0a513557124b"},
|
||||
{file = "awadb-0.3.7-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:15a8f3349ea84bdfc9c096b3760cf00363eb1908e71728b4a1e3beecda763cd5"},
|
||||
{file = "awadb-0.3.7-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:3ac1ef7e1d1a591eb36a57ea65824e717f54fc521e4ae303d634e510817ba270"},
|
||||
{file = "awadb-0.3.7-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:947b6576a07e90cac31d8ff709cd0e0abc9779bc71276df817b2ffe18c1fa541"},
|
||||
{file = "awadb-0.3.7-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:e4014edade8134579993639a4a3b18ff20c8449fdfc5ff511b24617109be5df7"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
|
Loading…
Reference in New Issue
Block a user