Add MyScaleWithoutJSON which allows user to wrap columns into Document's Metadata (#13164)

- **Description:** Add MyScaleWithoutJSON, which allows users to wrap table columns into a Document's metadata - **Tag maintainer:** @baskaryan
2025-08-31 02:11:09 +00:00 · 2023-11-14 02:10:36 +08:00
parent 2aa13f1e10
commit 46af56dc4f
1 changed files with 122 additions and 0 deletions
--- a/libs/langchain/langchain/vectorstores/myscale.py
+++ b/libs/langchain/langchain/vectorstores/myscale.py
@@ -490,3 +490,125 @@ class MyScale(VectorStore):
    @property
    def metadata_column(self) -> str:
        return self.config.column_map["metadata"]
+
+
+class MyScaleWithoutJSON(MyScale):
+    """MyScale vector store without a metadata column.
+
+    Instead of reading one metadata column, every column listed in
+    ``must_have_cols`` is selected by the query and copied into the
+    ``metadata`` dict of each returned Document. This is handy when working
+    with a SQL-native table.
+    """
+
+    def __init__(
+        self,
+        embedding: Embeddings,
+        config: Optional[MyScaleSettings] = None,
+        must_have_cols: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Build a MyScale vector store without a metadata column.
+
+        Args:
+            embedding (Embeddings): embedding model.
+            config (Optional[MyScaleSettings]): configuration forwarded to
+                ``MyScale.__init__``.
+            must_have_cols (Optional[List[str]]): column names included in
+                every query and exposed as Document metadata. Defaults to
+                no extra columns.
+            **kwargs: forwarded unchanged to ``MyScale.__init__``.
+        """
+        super().__init__(embedding, config, **kwargs)
+        # Copy into a fresh list so that neither a shared default nor the
+        # caller's own list is aliased by this instance.
+        self.must_have_cols: List[str] = (
+            [] if must_have_cols is None else list(must_have_cols)
+        )
+
+    def _build_qstr(
+        self, q_emb: List[float], topk: int, where_str: Optional[str] = None
+    ) -> str:
+        """Build the SQL text of a vector search query.
+
+        Args:
+            q_emb (List[float]): query embedding, inlined as an array literal.
+            topk (int): value used for the LIMIT clause.
+            where_str (Optional[str], optional): condition placed after
+                PREWHERE; omitted when empty or None.
+
+        Returns:
+            str: query selecting the text column, ``dist`` and every column
+            in ``must_have_cols``.
+        """
+        q_emb_str = ",".join(map(str, q_emb))
+        prewhere = f"PREWHERE {where_str}" if where_str else ""
+        # Join the whole select list at once so that an empty
+        # ``must_have_cols`` does not leave a dangling comma before FROM.
+        select_cols = ", ".join(
+            [self.config.column_map["text"], "dist", *self.must_have_cols]
+        )
+        return f"""
+            SELECT {select_cols}
+            FROM {self.config.database}.{self.config.table}
+            {prewhere}
+            ORDER BY distance({self.config.column_map['vector']}, [{q_emb_str}])
+                AS dist {self.dist_order}
+            LIMIT {topk}
+            """
+
+    def _row_to_document(self, row: Any) -> Document:
+        """Convert one named result row into a Document."""
+        return Document(
+            page_content=row[self.config.column_map["text"]],
+            metadata={col: row[col] for col in self.must_have_cols},
+        )
+
+    def similarity_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        where_str: Optional[str] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Perform a similarity search with MyScale by vector.
+
+        Args:
+            embedding (List[float]): query embedding.
+            k (int, optional): Top K neighbors to retrieve. Defaults to 4.
+            where_str (Optional[str], optional): where condition string.
+                                                 Defaults to None.
+
+            NOTE: Please do not let end-user to fill this and always be aware
+                  of SQL injection. This store has no metadata column
+                  (`metadata_column` is an empty string), so the condition is
+                  inserted into the query verbatim after PREWHERE.
+
+        Returns:
+            List[Document]: matching documents; an empty list if the query
+            raises (the error is logged).
+        """
+        q_str = self._build_qstr(embedding, k, where_str)
+        try:
+            return [
+                self._row_to_document(r)
+                for r in self.client.query(q_str).named_results()
+            ]
+        except Exception as e:
+            # Best-effort: log the failure and report no results.
+            logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
+            return []
+
+    def similarity_search_with_relevance_scores(
+        self, query: str, k: int = 4, where_str: Optional[str] = None, **kwargs: Any
+    ) -> List[Tuple[Document, float]]:
+        """Perform a similarity search with MyScale
+
+        Args:
+            query (str): query string
+            k (int, optional): Top K neighbors to retrieve. Defaults to 4.
+            where_str (Optional[str], optional): where condition string.
+                                                 Defaults to None.
+
+            NOTE: Please do not let end-user to fill this and always be aware
+                  of SQL injection. This store has no metadata column
+                  (`metadata_column` is an empty string), so the condition is
+                  inserted into the query verbatim after PREWHERE.
+
+        Returns:
+            List[Tuple[Document, float]]: documents paired with the `dist`
+            value the query computed for them; an empty list if the query
+            raises (the error is logged).
+        """
+        q_str = self._build_qstr(self._embeddings.embed_query(query), k, where_str)
+        try:
+            return [
+                (self._row_to_document(r), r["dist"])
+                for r in self.client.query(q_str).named_results()
+            ]
+        except Exception as e:
+            # Best-effort: log the failure and report no results.
+            logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
+            return []
+
+    @property
+    def metadata_column(self) -> str:
+        """Always an empty string: this store has no metadata column."""
+        return ""