diff --git a/langchain/vectorstores/awadb.py b/langchain/vectorstores/awadb.py index 5a74a8c1ecb..b4b0ce1a40e 100644 --- a/langchain/vectorstores/awadb.py +++ b/langchain/vectorstores/awadb.py @@ -12,9 +12,6 @@ from langchain.embeddings.base import Embeddings from langchain.vectorstores.base import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance -# from pydantic import BaseModel, Field, root_validator - - if TYPE_CHECKING: import awadb @@ -36,12 +33,16 @@ class AwaDB(VectorStore): **kwargs: Any, ) -> None: """Initialize with AwaDB client. + If table_name is not specified, + a random table name of `_DEFAULT_TABLE_NAME + last segment of uuid` + would be created automatically. + Args: - table_name: Iterable of strings to add to the vectorstore. - embedding: Optional list of metadatas associated with the texts. - log_and_data_dir: Optional whether to duplicate texts. + table_name: Name of the table created, default _DEFAULT_TABLE_NAME. + embedding: Optional Embeddings initially set. + log_and_data_dir: Optional the root directory of log and data. client: Optional AwaDB client. - kwargs: any possible extend parameters in the future. + kwargs: Any possible extend parameters in the future. Returns: None. @@ -83,7 +84,7 @@ class AwaDB(VectorStore): Args: texts: Iterable of strings to add to the vectorstore. metadatas: Optional list of metadatas associated with the texts. - is_duplicate_texts: Optional whether to duplicate texts. + is_duplicate_texts: Optional whether to duplicate texts. Defaults to True. kwargs: any possible extend parameters in the future. Returns: @@ -131,6 +132,8 @@ class AwaDB(VectorStore): self, query: str, k: int = DEFAULT_TOPN, + text_in_page_content: Optional[str] = None, + meta_filter: Optional[dict] = None, **kwargs: Any, ) -> List[Document]: """Return docs most similar to query. @@ -138,6 +141,13 @@ class AwaDB(VectorStore): Args: query: Text query. k: The maximum number of documents to return. 
+ text_in_page_content: Filter by the text in page_content of Document. + meta_filter (Optional[dict]): Filter by metadata. Defaults to None. + E.g. `{"color" : "red", "price": 4.20}`. + E.g. `{"max_price" : 15.66, "min_price": 4.20}`: + `price` is the metadata field; exclusive range (4.20<'price'<15.66). + E.g. `{"maxe_price" : 15.66, "mine_price": 4.20}`: + `price` is the metadata field; inclusive range (4.20<='price'<=15.66). kwargs: Any possible extend parameters in the future. Returns: @@ -158,13 +168,19 @@ class AwaDB(VectorStore): not_include_fields: Set[str] = {"text_embedding", "_id", "score"} return self.similarity_search_by_vector( - embedding, k, not_include_fields_in_metadata=not_include_fields + embedding, + k, + text_in_page_content=text_in_page_content, + meta_filter=meta_filter, + not_include_fields_in_metadata=not_include_fields, ) def similarity_search_with_score( self, query: str, k: int = DEFAULT_TOPN, + text_in_page_content: Optional[str] = None, + meta_filter: Optional[dict] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: """The most k similar documents and scores of the specified query. @@ -172,6 +188,8 @@ class AwaDB(VectorStore): Args: query: Text query. k: The k most similar documents to the text query. + text_in_page_content: Filter by the text in page_content of Document. + meta_filter: Filter by metadata. Defaults to None. kwargs: Any possible extend parameters in the future. 
Returns: @@ -193,78 +211,37 @@ class AwaDB(VectorStore): results: List[Tuple[Document, float]] = [] - dists: List[float] = [] - not_include_fields: Set[str] = {"text_embedding", "_id", "score"} + not_include_fields: Set[str] = {"text_embedding", "_id"} retrieval_docs = self.similarity_search_by_vector( embedding, k, - scores=dists, + text_in_page_content=text_in_page_content, + meta_filter=meta_filter, not_include_fields_in_metadata=not_include_fields, ) - doc_no = 0 for doc in retrieval_docs: - doc_tuple = (doc, dists[doc_no]) + score = doc.metadata["score"] + del doc.metadata["score"] + doc_tuple = (doc, score) results.append(doc_tuple) - doc_no = doc_no + 1 return results - def similarity_search_with_relevance_scores( + def _similarity_search_with_relevance_scores( self, query: str, - k: int = DEFAULT_TOPN, + k: int = 4, **kwargs: Any, ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores - which denote the InnerProduct distance, range from 0 to 1. - - Args: - query: Text query. - k: Number of the most similar documents to return. Defaults to 4. - - Returns: - List of (Document, relevance_score) tuples similar to the text query. - Note that relevance_score ranged from 0 to 1. - 0 is dissimilar, 1 is the most similar. 
- """ - - if self.awadb_client is None: - raise ValueError("AwaDB client is None!!!") - - embedding = None - if self.using_table_name in self.table2embeddings: - embedding = self.table2embeddings[self.using_table_name].embed_query(query) - - show_results = self.awadb_client.Search(embedding, k) - - results: List[Tuple[Document, float]] = [] - - if show_results.__len__() == 0: - return results - - dists: List[float] = [] - not_include_fields: Set[str] = {"text_embedding", "_id", "score"} - retrieval_docs = self.similarity_search_by_vector( - embedding, - k, - scores=dists, - not_include_fields_in_metadata=not_include_fields, - ) - - doc_no = 0 - for doc in retrieval_docs: - doc_tuple = (doc, dists[doc_no]) - results.append(doc_tuple) - doc_no = doc_no + 1 - - return results + return self.similarity_search_with_score(query, k, **kwargs) def similarity_search_by_vector( self, embedding: Optional[List[float]] = None, k: int = DEFAULT_TOPN, - scores: Optional[list] = None, + text_in_page_content: Optional[str] = None, + meta_filter: Optional[dict] = None, not_include_fields_in_metadata: Optional[Set[str]] = None, **kwargs: Any, ) -> List[Document]: @@ -273,7 +250,8 @@ class AwaDB(VectorStore): Args: embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. - scores: Scores for retrieved docs. + text_in_page_content: Filter by the text in page_content of Document. + meta_filter: Filter by metadata. Defaults to None. not_include_fields_in_metadata: Not include meta fields of each document. 
Returns: @@ -289,7 +267,11 @@ class AwaDB(VectorStore): return results show_results = self.awadb_client.Search( - embedding, k, not_include_fields=not_include_fields_in_metadata + embedding, + k, + text_in_page_content=text_in_page_content, + meta_filter=meta_filter, + not_include_fields=not_include_fields_in_metadata, ) if show_results.__len__() == 0: @@ -302,10 +284,6 @@ class AwaDB(VectorStore): if item_key == "embedding_text": content = item_detail[item_key] continue - elif item_key == "score": - if scores is not None: - scores.append(item_detail[item_key]) - continue elif not_include_fields_in_metadata is not None: if item_key in not_include_fields_in_metadata: continue @@ -319,6 +297,8 @@ class AwaDB(VectorStore): k: int = 4, fetch_k: int = 20, lambda_mult: float = 0.5, + text_in_page_content: Optional[str] = None, + meta_filter: Optional[dict] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -334,6 +314,9 @@ class AwaDB(VectorStore): of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + text_in_page_content: Filter by the text in page_content of Document. + meta_filter (Optional[dict]): Filter by metadata. Defaults to None. + Returns: List of Documents selected by maximal marginal relevance. """ @@ -353,7 +336,12 @@ class AwaDB(VectorStore): return [] results = self.max_marginal_relevance_search_by_vector( - embedding, k, fetch_k, lambda_mult=lambda_mult + embedding, + k, + fetch_k, + lambda_mult=lambda_mult, + text_in_page_content=text_in_page_content, + meta_filter=meta_filter, ) return results @@ -363,6 +351,8 @@ class AwaDB(VectorStore): k: int = 4, fetch_k: int = 20, lambda_mult: float = 0.5, + text_in_page_content: Optional[str] = None, + meta_filter: Optional[dict] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. 
@@ -378,6 +368,9 @@ class AwaDB(VectorStore): of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + text_in_page_content: Filter by the text in page_content of Document. + meta_filter (Optional[dict]): Filter by metadata. Defaults to None. + Returns: List of Documents selected by maximal marginal relevance. """ @@ -392,7 +385,11 @@ class AwaDB(VectorStore): not_include_fields: set = {"_id", "score"} retrieved_docs = self.similarity_search_by_vector( - embedding, fetch_k, not_include_fields_in_metadata=not_include_fields + embedding, + fetch_k, + text_in_page_content=text_in_page_content, + meta_filter=meta_filter, + not_include_fields_in_metadata=not_include_fields, ) top_embeddings = [] @@ -412,29 +409,43 @@ class AwaDB(VectorStore): def get( self, - ids: List[str], + ids: Optional[List[str]] = None, + text_in_page_content: Optional[str] = None, + meta_filter: Optional[dict] = None, not_include_fields: Optional[Set[str]] = None, + limit: Optional[int] = None, **kwargs: Any, ) -> Dict[str, Document]: """Return docs according to ids. Args: ids: The ids of the embedding vectors. + text_in_page_content: Filter by the text in page_content of Document. + meta_filter: Filter by any metadata of the document. + not_include_fields: Fields to leave out of each returned document. + limit: The number of documents to return. Defaults to 5. Optional. + Returns: - Documents which have the ids. + Documents which satisfy the input conditions. 
""" if self.awadb_client is None: raise ValueError("AwaDB client is None!!!") - docs_detail = self.awadb_client.Get(ids, not_include_fields=not_include_fields) + docs_detail = self.awadb_client.Get( + ids=ids, + text_in_page_content=text_in_page_content, + meta_filter=meta_filter, + not_include_fields=not_include_fields, + limit=limit, + ) results: Dict[str, Document] = {} for doc_detail in docs_detail: content = "" meta_info = {} for field in doc_detail: - if field == "embeddint_text": + if field == "embedding_text": content = doc_detail[field] continue elif field == "text_embedding" or field == "_id": diff --git a/poetry.lock b/poetry.lock index 071c60e4615..474dfa23770 100644 --- a/poetry.lock +++ b/poetry.lock @@ -635,19 +635,22 @@ test = ["coverage (>=5,<6)", "pytest (>=6,<7)"] [[package]] name = "awadb" -version = "0.3.6" -description = "The AI Native database for embedding vectors" +version = "0.3.7" +description = "AI Native database for embedding vectors" category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "awadb-0.3.6-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6154f73aab9996aefe8c8f8bf754f7182d109d6b60302c9f31666c7f50cc7aca"}, - {file = "awadb-0.3.6-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:9d7e9dff353517595ecc8c9395a2367acdcfc83c68a64dd4785c8d366eed3f40"}, - {file = "awadb-0.3.6-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:475af75d2ffbbe970999d93fbabdf7281797390c66fe852f6a6989e706b90c94"}, - {file = "awadb-0.3.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:304be1de63daec1555f0fe9de9a18cdf16a467687a35a6ccf3405cd400fefb48"}, - {file = "awadb-0.3.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:176cc27d1afc4aad758515d5f8fb435f555c9ba827a9e84d6f28b1c6ac568965"}, - {file = "awadb-0.3.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:211d7f6b0f7c3c3d7518d424f0f3dfac5f45f9e5d7bbf397fdae861ff0dc46fd"}, - {file = "awadb-0.3.6-cp39-cp39-manylinux1_x86_64.whl", hash = 
"sha256:b935ab4ffaa3bcbcc9a381fce91ace5940143b527ffdb467dd4bc630cd94afab"}, + {file = "awadb-0.3.7-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:deec44f3687aad3ef13ba3fac3e2e4fd83c710a30194d228b3f520d2fb013542"}, + {file = "awadb-0.3.7-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:96e1e92e029f4b1000631bc06d6f845d554e4698851e515eafd35ff4f2b1994f"}, + {file = "awadb-0.3.7-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:46d44c4e8d97aaeb73234a2b9945b393b91dfaeae98e9fc6632ffb64bbc9e995"}, + {file = "awadb-0.3.7-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:d5e379ea2f9f44687edb99c1d35719d1bed116759f800d212d9561cef99736a3"}, + {file = "awadb-0.3.7-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:f2de28efa210035d278a55466023b44b8479254f3d5de69c944e6a3fcfb97879"}, + {file = "awadb-0.3.7-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7a2a3f063a133c945e12ea0ab9d9c7ab038c8255dbd867067dba0a513557124b"}, + {file = "awadb-0.3.7-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:15a8f3349ea84bdfc9c096b3760cf00363eb1908e71728b4a1e3beecda763cd5"}, + {file = "awadb-0.3.7-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:3ac1ef7e1d1a591eb36a57ea65824e717f54fc521e4ae303d634e510817ba270"}, + {file = "awadb-0.3.7-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:947b6576a07e90cac31d8ff709cd0e0abc9779bc71276df817b2ffe18c1fa541"}, + {file = "awadb-0.3.7-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:e4014edade8134579993639a4a3b18ff20c8449fdfc5ff511b24617109be5df7"}, ] [package.extras]