Feat rdb summary wide table (#2035)

Co-authored-by: dongzhancai1 <dongzhancai1@jd.com> Co-authored-by: dong <dongzhancai@iie2.com>
2025-09-14 05:31:40 +00:00 · 2024-12-18 20:34:21 +08:00
parent 7f4b5e79cf
commit 9b0161e521
17 changed files with 948 additions and 243 deletions
--- a/dbgpt/rag/summary/db_summary_client.py
+++ b/dbgpt/rag/summary/db_summary_client.py
@@ -2,13 +2,15 @@

 import logging
 import traceback
-from typing import List

 from dbgpt._private.config import Config
 from dbgpt.component import SystemApp
 from dbgpt.configs.model_config import EMBEDDING_MODEL_CONFIG
+from dbgpt.rag import ChunkParameters
 from dbgpt.rag.summary.gdbms_db_summary import GdbmsSummary
 from dbgpt.rag.summary.rdbms_db_summary import RdbmsSummary
+from dbgpt.rag.text_splitter.text_splitter import RDBTextSplitter
+from dbgpt.serve.rag.connector import VectorStoreConnector

 logger = logging.getLogger(__name__)

@@ -47,22 +49,26 @@ class DBSummaryClient:

        logger.info("db summary embedding success")

-    def get_db_summary(self, dbname, query, topk) -> List[str]:
+    def get_db_summary(self, dbname, query, topk):
        """Get user query related tables info."""
-        from dbgpt.serve.rag.connector import VectorStoreConnector
        from dbgpt.storage.vector_store.base import VectorStoreConfig

-        vector_store_config = VectorStoreConfig(name=dbname + "_profile")
-        vector_connector = VectorStoreConnector.from_default(
+        vector_store_name = dbname + "_profile"
+        table_vector_store_config = VectorStoreConfig(name=vector_store_name)
+        table_vector_connector = VectorStoreConnector.from_default(
            CFG.VECTOR_STORE_TYPE,
-            embedding_fn=self.embeddings,
-            vector_store_config=vector_store_config,
+            self.embeddings,
+            vector_store_config=table_vector_store_config,
        )
+
        from dbgpt.rag.retriever.db_schema import DBSchemaRetriever

        retriever = DBSchemaRetriever(
-            top_k=topk, index_store=vector_connector.index_client
+            top_k=topk,
+            table_vector_store_connector=table_vector_connector,
+            separator="--table-field-separator--",
        )
+
        table_docs = retriever.retrieve(query)
        ans = [d.content for d in table_docs]
        return ans
@@ -92,18 +98,23 @@ class DBSummaryClient:
        from dbgpt.serve.rag.connector import VectorStoreConnector
        from dbgpt.storage.vector_store.base import VectorStoreConfig

-        vector_store_config = VectorStoreConfig(name=vector_store_name)
-        vector_connector = VectorStoreConnector.from_default(
+        table_vector_store_config = VectorStoreConfig(name=vector_store_name)
+        table_vector_connector = VectorStoreConnector.from_default(
            CFG.VECTOR_STORE_TYPE,
            self.embeddings,
-            vector_store_config=vector_store_config,
+            vector_store_config=table_vector_store_config,
        )
-        if not vector_connector.vector_name_exists():
+        if not table_vector_connector.vector_name_exists():
            from dbgpt.rag.assembler.db_schema import DBSchemaAssembler

+            chunk_parameters = ChunkParameters(
+                text_splitter=RDBTextSplitter(separator="--table-field-separator--")
+            )
            db_assembler = DBSchemaAssembler.load_from_connection(
                connector=db_summary_client.db,
-                index_store=vector_connector.index_client,
+                table_vector_store_connector=table_vector_connector,
+                chunk_parameters=chunk_parameters,
+                max_seq_length=CFG.EMBEDDING_MODEL_MAX_SEQ_LEN,
            )

            if len(db_assembler.get_chunks()) > 0:
@@ -115,16 +126,26 @@ class DBSummaryClient:
    def delete_db_profile(self, dbname):
        """Delete db profile."""
        vector_store_name = dbname + "_profile"
+        table_vector_store_name = dbname + "_profile"
+        field_vector_store_name = dbname + "_profile_field"
        from dbgpt.serve.rag.connector import VectorStoreConnector
        from dbgpt.storage.vector_store.base import VectorStoreConfig

-        vector_store_config = VectorStoreConfig(name=vector_store_name)
-        vector_connector = VectorStoreConnector.from_default(
+        table_vector_store_config = VectorStoreConfig(name=vector_store_name)
+        field_vector_store_config = VectorStoreConfig(name=field_vector_store_name)
+        table_vector_connector = VectorStoreConnector.from_default(
            CFG.VECTOR_STORE_TYPE,
            self.embeddings,
-            vector_store_config=vector_store_config,
+            vector_store_config=table_vector_store_config,
        )
-        vector_connector.delete_vector_name(vector_store_name)
+        field_vector_connector = VectorStoreConnector.from_default(
+            CFG.VECTOR_STORE_TYPE,
+            self.embeddings,
+            vector_store_config=field_vector_store_config,
+        )
+
+        table_vector_connector.delete_vector_name(table_vector_store_name)
+        field_vector_connector.delete_vector_name(field_vector_store_name)
        logger.info(f"delete db profile {dbname} success")

    @staticmethod
--- a/dbgpt/rag/summary/rdbms_db_summary.py
+++ b/dbgpt/rag/summary/rdbms_db_summary.py
@@ -1,6 +1,6 @@
 """Summary for rdbms database."""
 import re
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

 from dbgpt._private.config import Config
 from dbgpt.datasource import BaseConnector
@@ -80,6 +80,134 @@ def _parse_db_summary(
    return table_info_summaries


+def _parse_db_summary_with_metadata(
+    conn: BaseConnector,
+    summary_template: str = "table_name: {table_name}",
+    separator: str = "--table-field-separator--",
+    model_dimension: int = 512,
+) -> List[Tuple[str, Dict[str, Any]]]:
+    """Get db summary for database.
+
+    Args:
+        conn (BaseConnector): database connection
+        summary_template (str): summary template
+        separator(str, optional): separator used to separate table's
+            basic info and fields. defaults to `--table-field-separator--`
+        model_dimension(int, optional): character-length threshold for
+    """
+    tables = conn.get_table_names()
+    table_info_summaries = [
+        _parse_table_summary_with_metadata(
+            conn, summary_template, separator, table_name, model_dimension
+        )
+        for table_name in tables
+    ]
+    return table_info_summaries
+
+
+def _split_columns_str(columns: List[str], model_dimension: int):
+    """Split columns str.
+
+    Args:
+        columns (List[str]): field strings, joined with "," into chunks
+        model_dimension (int): The threshold for splitting field string.
+    """
+    result = []
+    current_string = ""
+    current_length = 0
+
+    for element_str in columns:
+        element_length = len(element_str)
+
+        # If adding the current element's length would exceed the threshold,
+        # add the current string to results and reset
+        if current_length + element_length > model_dimension:
+            result.append(current_string.strip())  # Remove trailing spaces
+            current_string = element_str
+            current_length = element_length
+        else:
+            # If current string is empty, add element directly
+            if current_string:
+                current_string += "," + element_str
+            else:
+                current_string = element_str
+            current_length += element_length + 1  # Add length of comma
+
+    # Handle the last string segment
+    if current_string:
+        result.append(current_string.strip())
+
+    return result
+
+
+def _parse_table_summary_with_metadata(
+    conn: BaseConnector,
+    summary_template: str,
+    separator,
+    table_name: str,
+    model_dimension=512,
+) -> Tuple[str, Dict[str, Any]]:
+    """Get table summary for table.
+
+    Args:
+        conn (BaseConnector): database connection
+        summary_template (str): summary template
+        separator(str): separator used to separate table's
+            basic info and fields, e.g. `--table-field-separator--`
+        model_dimension(int, optional): The threshold for splitting field string
+
+    Examples:
+        metadata: {'table_name': 'asd', 'separated': 0/1, 'field_num': 6}
+
+        table_name: table1
+        table_comment: comment
+        index_keys: keys
+        --table-field-separator--
+        column1 (comment),column2 (comment),column3 (comment)
+        column4 (comment),column5 (comment),column6 (comment)
+    """
+    columns = []
+    metadata = {"table_name": table_name, "separated": 0}
+    for column in conn.get_columns(table_name):
+        if column.get("comment"):
+            columns.append(f"{column['name']} ({column.get('comment')})")
+        else:
+            columns.append(f"{column['name']}")
+    metadata.update({"field_num": len(columns)})
+    separated_columns = _split_columns_str(columns, model_dimension=model_dimension)
+    if len(separated_columns) > 1:
+        metadata["separated"] = 1
+    column_str = "\n".join(separated_columns)
+    # Obtain index information
+    index_keys = []
+    raw_indexes = conn.get_indexes(table_name)
+    for index in raw_indexes:
+        if isinstance(index, tuple):  # Process tuple type index information
+            index_name, index_creation_command = index
+            # Extract column names using re
+            matched_columns = re.findall(r"\(([^)]+)\)", index_creation_command)
+            if matched_columns:
+                key_str = ", ".join(matched_columns)
+                index_keys.append(f"{index_name}(`{key_str}`) ")
+        else:
+            key_str = ", ".join(index["column_names"])
+            index_keys.append(f"{index['name']}(`{key_str}`) ")
+    table_str = summary_template.format(table_name=table_name)
+
+    try:
+        comment = conn.get_table_comment(table_name)
+    except Exception:
+        comment = dict(text=None)
+    if comment.get("text"):
+        table_str += f"\ntable_comment: {comment.get('text')}"
+
+    if len(index_keys) > 0:
+        index_key_str = ", ".join(index_keys)
+        table_str += f"\nindex_keys: {index_key_str}"
+    table_str += f"\n{separator}\n{column_str}"
+    return table_str, metadata
+
+
 def _parse_table_summary(
    conn: BaseConnector, summary_template: str, table_name: str
 ) -> str: