Add metadata and page_content filters of documents in AwaDB (#7862)

1. Add a metadata filter for documents. 2. Add a text filter on the page_content of documents. 3. Fix the bug in similarity_search_with_score. Improvements and bug fixes for AwaDB. Fixes the conflict in https://github.com/hwchase17/langchain/pull/7840. @rlancemartin @eyurtsev Thanks! --------- Co-authored-by: vincent <awadb.vincent@gmail.com>
2025-08-16 16:11:02 +00:00 · 2023-07-18 22:50:17 +08:00 · 2023-07-18 22:50:17 +08:00 · 3902b85657
commit 3902b85657
parent f1eaa9b626
2 changed files with 96 additions and 82 deletions
--- a/langchain/vectorstores/awadb.py
+++ b/langchain/vectorstores/awadb.py
@ -12,9 +12,6 @@ from langchain.embeddings.base import Embeddings
 from langchain.vectorstores.base import VectorStore
 from langchain.vectorstores.utils import maximal_marginal_relevance

-# from pydantic import BaseModel, Field, root_validator
-
-
 if TYPE_CHECKING:
    import awadb

@ -36,12 +33,16 @@ class AwaDB(VectorStore):
        **kwargs: Any,
    ) -> None:
        """Initialize with AwaDB client.
+           If table_name is not specified,
+           a random table name of `_DEFAULT_TABLE_NAME + last segment of uuid`
+           would be created automatically.
+
        Args:
-            table_name: Iterable of strings to add to the vectorstore.
-            embedding: Optional list of metadatas associated with the texts.
-            log_and_data_dir: Optional whether to duplicate texts.
+            table_name: Name of the table created, default _DEFAULT_TABLE_NAME.
+            embedding: Optional Embeddings initially set.
+            log_and_data_dir: Optional the root directory of log and data.
            client: Optional AwaDB client.
-            kwargs: any possible extend parameters in the future.
+            kwargs: Any possible extend parameters in the future.

        Returns:
            None.
@ -83,7 +84,7 @@ class AwaDB(VectorStore):
        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
-            is_duplicate_texts: Optional whether to duplicate texts.
+            is_duplicate_texts: Optional whether to duplicate texts. Defaults to True.
            kwargs: any possible extend parameters in the future.

        Returns:
@ -131,6 +132,8 @@ class AwaDB(VectorStore):
        self,
        query: str,
        k: int = DEFAULT_TOPN,
+        text_in_page_content: Optional[str] = None,
+        meta_filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query.
@ -138,6 +141,13 @@ class AwaDB(VectorStore):
        Args:
            query: Text query.
            k: The maximum number of documents to return.
+            text_in_page_content: Filter by the text in page_content of Document.
+            meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
+            E.g. `{"color" : "red", "price": 4.20}`. Optional.
+            E.g. `{"max_price" : 15.66, "min_price": 4.20}`
+            `price` is the metadata field, means range filter(4.20<'price'<15.66).
+            E.g. `{"maxe_price" : 15.66, "mine_price": 4.20}`
+            `price` is the metadata field, means range filter(4.20<='price'<=15.66).
            kwargs: Any possible extend parameters in the future.

        Returns:
@ -158,13 +168,19 @@ class AwaDB(VectorStore):

        not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
        return self.similarity_search_by_vector(
-            embedding, k, not_include_fields_in_metadata=not_include_fields
+            embedding,
+            k,
+            text_in_page_content=text_in_page_content,
+            meta_filter=meta_filter,
+            not_include_fields_in_metadata=not_include_fields,
        )

    def similarity_search_with_score(
        self,
        query: str,
        k: int = DEFAULT_TOPN,
+        text_in_page_content: Optional[str] = None,
+        meta_filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """The most k similar documents and scores of the specified query.
@ -172,6 +188,8 @@ class AwaDB(VectorStore):
        Args:
            query: Text query.
            k: The k most similar documents to the text query.
+            text_in_page_content: Filter by the text in page_content of Document.
+            meta_filter: Filter by metadata. Defaults to None.
            kwargs: Any possible extend parameters in the future.

        Returns:
@ -193,78 +211,37 @@ class AwaDB(VectorStore):

        results: List[Tuple[Document, float]] = []

-        dists: List[float] = []
-        not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
+        not_include_fields: Set[str] = {"text_embedding", "_id"}
        retrieval_docs = self.similarity_search_by_vector(
            embedding,
            k,
-            scores=dists,
+            text_in_page_content=text_in_page_content,
+            meta_filter=meta_filter,
            not_include_fields_in_metadata=not_include_fields,
        )

-        doc_no = 0
        for doc in retrieval_docs:
-            doc_tuple = (doc, dists[doc_no])
+            score = doc.metadata["score"]
+            del doc.metadata["score"]
+            doc_tuple = (doc, score)
            results.append(doc_tuple)
-            doc_no = doc_no + 1

        return results

-    def similarity_search_with_relevance_scores(
+    def _similarity_search_with_relevance_scores(
        self,
        query: str,
-        k: int = DEFAULT_TOPN,
+        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
-        """Return docs and relevance scores
-           which denote the InnerProduct distance, range from 0 to 1.
-
-        Args:
-            query: Text query.
-            k: Number of the most similar documents to return. Defaults to 4.
-
-        Returns:
-            List of (Document, relevance_score) tuples similar to the text query.
-            Note that relevance_score ranged from 0 to 1.
-            0 is dissimilar, 1 is the most similar.
-        """
-
-        if self.awadb_client is None:
-            raise ValueError("AwaDB client is None!!!")
-
-        embedding = None
-        if self.using_table_name in self.table2embeddings:
-            embedding = self.table2embeddings[self.using_table_name].embed_query(query)
-
-        show_results = self.awadb_client.Search(embedding, k)
-
-        results: List[Tuple[Document, float]] = []
-
-        if show_results.__len__() == 0:
-            return results
-
-        dists: List[float] = []
-        not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
-        retrieval_docs = self.similarity_search_by_vector(
-            embedding,
-            k,
-            scores=dists,
-            not_include_fields_in_metadata=not_include_fields,
-        )
-
-        doc_no = 0
-        for doc in retrieval_docs:
-            doc_tuple = (doc, dists[doc_no])
-            results.append(doc_tuple)
-            doc_no = doc_no + 1
-
-        return results
+        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: Optional[List[float]] = None,
        k: int = DEFAULT_TOPN,
-        scores: Optional[list] = None,
+        text_in_page_content: Optional[str] = None,
+        meta_filter: Optional[dict] = None,
        not_include_fields_in_metadata: Optional[Set[str]] = None,
        **kwargs: Any,
    ) -> List[Document]:
@ -273,7 +250,8 @@ class AwaDB(VectorStore):
        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
-            scores: Scores for retrieved docs.
+            text_in_page_content: Filter by the text in page_content of Document.
+            meta_filter: Filter by metadata. Defaults to None.
            not_incude_fields_in_metadata: Not include meta fields of each document.

        Returns:
@ -289,7 +267,11 @@ class AwaDB(VectorStore):
            return results

        show_results = self.awadb_client.Search(
-            embedding, k, not_include_fields=not_include_fields_in_metadata
+            embedding,
+            k,
+            text_in_page_content=text_in_page_content,
+            meta_filter=meta_filter,
+            not_include_fields=not_include_fields_in_metadata,
        )

        if show_results.__len__() == 0:
@ -302,10 +284,6 @@ class AwaDB(VectorStore):
                if item_key == "embedding_text":
                    content = item_detail[item_key]
                    continue
-                elif item_key == "score":
-                    if scores is not None:
-                        scores.append(item_detail[item_key])
-                        continue
                elif not_include_fields_in_metadata is not None:
                    if item_key in not_include_fields_in_metadata:
                        continue
@ -319,6 +297,8 @@ class AwaDB(VectorStore):
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
+        text_in_page_content: Optional[str] = None,
+        meta_filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.
@ -334,6 +314,9 @@ class AwaDB(VectorStore):
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.
+            text_in_page_content: Filter by the text in page_content of Document.
+            meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
+
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
@ -353,7 +336,12 @@ class AwaDB(VectorStore):
            return []

        results = self.max_marginal_relevance_search_by_vector(
-            embedding, k, fetch_k, lambda_mult=lambda_mult
+            embedding,
+            k,
+            fetch_k,
+            lambda_mult=lambda_mult,
+            text_in_page_content=text_in_page_content,
+            meta_filter=meta_filter,
        )
        return results

@ -363,6 +351,8 @@ class AwaDB(VectorStore):
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
+        text_in_page_content: Optional[str] = None,
+        meta_filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.
@ -378,6 +368,9 @@ class AwaDB(VectorStore):
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.
+            text_in_page_content: Filter by the text in page_content of Document.
+            meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
+
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
@ -392,7 +385,11 @@ class AwaDB(VectorStore):

        not_include_fields: set = {"_id", "score"}
        retrieved_docs = self.similarity_search_by_vector(
-            embedding, fetch_k, not_include_fields_in_metadata=not_include_fields
+            embedding,
+            fetch_k,
+            text_in_page_content=text_in_page_content,
+            meta_filter=meta_filter,
+            not_include_fields_in_metadata=not_include_fields,
        )

        top_embeddings = []
@ -412,29 +409,43 @@ class AwaDB(VectorStore):

    def get(
        self,
-        ids: List[str],
+        ids: Optional[List[str]] = None,
+        text_in_page_content: Optional[str] = None,
+        meta_filter: Optional[dict] = None,
        not_include_fields: Optional[Set[str]] = None,
+        limit: Optional[int] = None,
        **kwargs: Any,
    ) -> Dict[str, Document]:
        """Return docs according ids.

        Args:
            ids: The ids of the embedding vectors.
+            text_in_page_content: Filter by the text in page_content of Document.
+            meta_filter: Filter by any metadata of the document.
+            not_include_fields: Not pack the specified fields of each document.
+            limit: The number of documents to return. Defaults to 5. Optional.
+
        Returns:
-            Documents which have the ids.
+            Documents which satisfy the input conditions.
        """

        if self.awadb_client is None:
            raise ValueError("AwaDB client is None!!!")

-        docs_detail = self.awadb_client.Get(ids, not_include_fields=not_include_fields)
+        docs_detail = self.awadb_client.Get(
+            ids=ids,
+            text_in_page_content=text_in_page_content,
+            meta_filter=meta_filter,
+            not_include_fields=not_include_fields,
+            limit=limit,
+        )

        results: Dict[str, Document] = {}
        for doc_detail in docs_detail:
            content = ""
            meta_info = {}
            for field in doc_detail:
-                if field == "embeddint_text":
+                if field == "embedding_text":
                    content = doc_detail[field]
                    continue
                elif field == "text_embedding" or field == "_id":
--- a/poetry.lock
+++ b/poetry.lock
@ -635,19 +635,22 @@ test = ["coverage (>=5,<6)", "pytest (>=6,<7)"]

 [[package]]
 name = "awadb"
-version = "0.3.6"
-description = "The AI Native database for embedding vectors"
+version = "0.3.7"
+description = "AI Native database for embedding vectors"
 category = "main"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "awadb-0.3.6-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6154f73aab9996aefe8c8f8bf754f7182d109d6b60302c9f31666c7f50cc7aca"},
-    {file = "awadb-0.3.6-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:9d7e9dff353517595ecc8c9395a2367acdcfc83c68a64dd4785c8d366eed3f40"},
-    {file = "awadb-0.3.6-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:475af75d2ffbbe970999d93fbabdf7281797390c66fe852f6a6989e706b90c94"},
-    {file = "awadb-0.3.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:304be1de63daec1555f0fe9de9a18cdf16a467687a35a6ccf3405cd400fefb48"},
-    {file = "awadb-0.3.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:176cc27d1afc4aad758515d5f8fb435f555c9ba827a9e84d6f28b1c6ac568965"},
-    {file = "awadb-0.3.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:211d7f6b0f7c3c3d7518d424f0f3dfac5f45f9e5d7bbf397fdae861ff0dc46fd"},
-    {file = "awadb-0.3.6-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:b935ab4ffaa3bcbcc9a381fce91ace5940143b527ffdb467dd4bc630cd94afab"},
+    {file = "awadb-0.3.7-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:deec44f3687aad3ef13ba3fac3e2e4fd83c710a30194d228b3f520d2fb013542"},
+    {file = "awadb-0.3.7-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:96e1e92e029f4b1000631bc06d6f845d554e4698851e515eafd35ff4f2b1994f"},
+    {file = "awadb-0.3.7-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:46d44c4e8d97aaeb73234a2b9945b393b91dfaeae98e9fc6632ffb64bbc9e995"},
+    {file = "awadb-0.3.7-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:d5e379ea2f9f44687edb99c1d35719d1bed116759f800d212d9561cef99736a3"},
+    {file = "awadb-0.3.7-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:f2de28efa210035d278a55466023b44b8479254f3d5de69c944e6a3fcfb97879"},
+    {file = "awadb-0.3.7-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7a2a3f063a133c945e12ea0ab9d9c7ab038c8255dbd867067dba0a513557124b"},
+    {file = "awadb-0.3.7-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:15a8f3349ea84bdfc9c096b3760cf00363eb1908e71728b4a1e3beecda763cd5"},
+    {file = "awadb-0.3.7-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:3ac1ef7e1d1a591eb36a57ea65824e717f54fc521e4ae303d634e510817ba270"},
+    {file = "awadb-0.3.7-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:947b6576a07e90cac31d8ff709cd0e0abc9779bc71276df817b2ffe18c1fa541"},
+    {file = "awadb-0.3.7-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:e4014edade8134579993639a4a3b18ff20c8449fdfc5ff511b24617109be5df7"},
 ]

 [package.extras]