From bfa1b11cecf4d6e9efe410960ed409566a2844c0 Mon Sep 17 00:00:00 2001 From: yaoyifan-yyf Date: Tue, 16 Dec 2025 19:59:19 +0800 Subject: [PATCH] feat: adjust benchmark data construct approach (#2948) --- .../service/benchmark/benchmark_service.py | 2 +- .../fetchdata/benchmark_data_manager.py | 541 +++++------------- pilot/benchmark_meta_data/table_mapping.json | 95 --- 3 files changed, 132 insertions(+), 506 deletions(-) delete mode 100644 pilot/benchmark_meta_data/table_mapping.json diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py index 410b351bb..40b9732c6 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py @@ -40,7 +40,7 @@ from ...config import ServeConfig from ...models.models import ServeDao, ServeEntity from ..fetchdata.benchmark_data_manager import get_benchmark_manager from .data_compare_service import DataCompareService -from .ext.excel_file_parse import ExcelFileParseService +from .file_parse_service import ExcelFileParseService from .models import ( BaseInputModel, BenchmarkDataSets, diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py index 205383b85..c1a18b9e1 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py @@ -1,17 +1,16 @@ import asyncio -import csv import hashlib -import json import logging import os +import re import shutil import tempfile import threading import time +import uuid import zipfile from concurrent.futures import ThreadPoolExecutor from concurrent.futures import TimeoutError as FutureTimeoutError -from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, cast import aiohttp @@ -36,12 +35,10 @@ class BenchmarkDataConfig(BaseModel): db_path: str = os.path.join( BENCHMARK_DATA_ROOT_PATH, f"{BENCHMARK_DEFAULT_DB_SCHEMA}db" ) - table_mapping_file: str = os.path.join( - BENCHMARK_DATA_ROOT_PATH, "table_mapping.json" - ) + table_mapping_file: Optional[str] = None cache_expiry_days: int = 1 - repo_url: str = "https://github.com/eosphoros-ai/Falcon" - data_dir: str = "data/source" + repo_url: str = "https://github.com/eosphoros-ai/Falcon/tree/yifan_1216" + data_dir: str = "dev_data/dev_databases" class BenchmarkDataManager(BaseComponent): @@ -56,7 +53,6 @@ class BenchmarkDataManager(BaseComponent): self._config = config or BenchmarkDataConfig() self._http_session: Optional[aiohttp.ClientSession] = None self._connector: Optional[SQLiteConnector] = None - self._table_mappings = self._load_mappings() self._lock = asyncio.Lock() self.temp_dir: Optional[str] = None @@ -142,59 +138,6 @@ class BenchmarkDataManager(BaseComponent): except Exception as e: logger.error(f"BenchmarkDataManager: auto load failed: {e}") - def _sanitize_column_name(self, name: str) -> str: - if name is None: - return "" - name = str(name).strip().strip('"').strip("'") - invalid_chars = [ - "-", - " ", - ".", - ",", - ";", - ":", - "!", - "?", - "'", - '"', - "(", - ")", - "[", - "]", - "{", - "}", - "\t", - "\r", - "\n", - "\x00", - ] - while name and name[-1] in invalid_chars: - name = name[:-1] - for ch in invalid_chars: - 
if ch in name: - name = name.replace(ch, "_") - while "__" in name: - name = name.replace("__", "_") - if name and not (name[0].isalpha() or name[0] == "_"): - name = "_" + name - return name.lower() - - def _sanitize_and_dedup_headers(self, headers: List[str]) -> List[str]: - sanitized: List[str] = [] - used: set = set() - for idx, h in enumerate(headers): - name = self._sanitize_column_name(h) - if not name: - name = f"col_{idx}" - base = name - k = 2 - while name in used or not name: - name = f"{base}_{k}" - k += 1 - used.add(name) - sanitized.append(name) - return sanitized - # ========================================================== # 通用查询(阻塞实现,在线程池中调用,支持超时与可中断) @@ -292,7 +235,7 @@ class BenchmarkDataManager(BaseComponent): return result.rowcount if timeout is not None: - # 使用ThreadPoolExecutor实现超时控制,类似于基类中DuckDB的实现 + # 使用ThreadPoolExecutor实现超时控制 with ThreadPoolExecutor(max_workers=1) as executor: future = executor.submit(_execute_write) try: @@ -307,13 +250,7 @@ class BenchmarkDataManager(BaseComponent): async def query( self, query: str, params: tuple = (), timeout: Optional[float] = None ) -> List[Dict]: - """Execute query and return results as dict list - - Args: - query: SQL query string - params: Query parameters - timeout: Query timeout in seconds (optional) - """ + """Execute query and return results as dict list""" await self.init_connector() cols, rows = await self._run_in_thread( self._query_blocking, query, params, timeout @@ -321,7 +258,7 @@ class BenchmarkDataManager(BaseComponent): return [dict(zip(cols, row)) for row in rows] async def load_from_github( - self, repo_url: str, data_dir: str = "data/source" + self, repo_url: str, data_dir: str = "dev_data/dev_databases" ) -> Dict: """Main method to load data from GitHub repository""" try: @@ -330,14 +267,14 @@ class BenchmarkDataManager(BaseComponent): # 1. Download or use cached repository repo_dir = await self._download_repo_contents(repo_url) - # 2. Find all CSV files recursively - csv_files = self._discover_csv_files(repo_dir, data_dir) - if not csv_files: - raise ValueError("No CSV files found") - logger.info(f"Found {len(csv_files)} CSV files") + # 2. Find all SQLite files recursively in the specified data_dir + sqlite_files = self._discover_sqlite_files(repo_dir, data_dir) + if not sqlite_files: + raise ValueError(f"No SQLite files found in {data_dir}") + logger.info(f"Found {len(sqlite_files)} SQLite files") - # 3. Import to SQLite - result = await self._import_to_database(csv_files) + # 3. Merge all SQLite files into the main database + result = await self._merge_sqlite_databases(sqlite_files) return result except Exception as e: @@ -389,63 +326,8 @@ class BenchmarkDataManager(BaseComponent): except Exception as e: logger.error(f"Failed to clear cache: {str(e)}") - def _load_mappings(self) -> Dict[str, str]: - """Load table name mappings from config file""" - if not self._config.table_mapping_file or not os.path.exists( - self._config.table_mapping_file - ): - logger.warning( - f"Table mapping file not found: {self._config.table_mapping_file}" - ) - return {} - - try: - with open(self._config.table_mapping_file, "r", encoding="utf-8") as f: - mapping = json.load(f) - return { - key: value.split(".")[-1] if "." 
in value else value - for key, value in mapping.items() - } - except Exception as e: - logger.error(f"Failed to load table mapping: {str(e)}") - return {} - - def _sanitize_table_name(self, name: str) -> str: - """Normalize table names using mappings""" - mapped_name = self._table_mappings.get(name.lower(), name) - if mapped_name is None: - mapped_name = name or "" - - invalid_chars = [ - "-", - " ", - ".", - ",", - ";", - ":", - "!", - "?", - "'", - '"', - "(", - ")", - "[", - "]", - "{", - "}", - ] - while mapped_name and mapped_name[-1] in invalid_chars: - mapped_name = mapped_name[:-1] - for char in invalid_chars: - if char in mapped_name: - mapped_name = mapped_name.replace(char, "_") - while "__" in mapped_name: - mapped_name = mapped_name.replace("__", "_") - - return (mapped_name or "").lower() - async def _download_repo_contents(self, repo_url: str) -> str: - """Download repository with caching""" + """Download repository with caching, supporting branch URLs""" cache_path = self._get_cache_path(repo_url) # Use cache if valid @@ -455,21 +337,45 @@ class BenchmarkDataManager(BaseComponent): # Download fresh copy self.temp_dir = tempfile.mkdtemp() - zip_url = ( - repo_url.replace("github.com", "api.github.com/repos") + "/zipball/main" - ) + + # Simple parsing for github.com URLs + github_pattern = r"github\.com/([^/]+)/([^/]+)(?:/tree/(.+))?" + match = re.search(github_pattern, repo_url) + + if match: + owner, repo, branch = match.groups() + branch = branch or "main" # Default to main if no tree/branch specified + zip_url = f"https://api.github.com/repos/{owner}/{repo}/zipball/{branch}" + else: + # Fallback for generic structure or direct zip links + if repo_url.endswith(".zip"): + zip_url = repo_url + else: + # Default fallback behavior from original code + zip_url = ( + repo_url.replace("github.com", "api.github.com/repos") + + "/zipball/main" + ) + logger.info(f"Downloading from GitHub repo: {zip_url}") try: if self._http_session is None: self._http_session = aiohttp.ClientSession() - async with self._http_session.get(zip_url) as response: - response.raise_for_status() + + headers = {"Accept": "application/vnd.github.v3+json"} + async with self._http_session.get(zip_url, headers=headers) as response: + if response.status != 200: + text_resp = await response.text() + raise RuntimeError( + f"GitHub API Error {response.status}: {text_resp}" + ) + zip_path = os.path.join(self.temp_dir, "repo.zip") with open(zip_path, "wb") as f: while True: - chunk = await response.content.read(1024) + chunk = await response.content.read(1024 * 1024) # 1MB chunks if not chunk: break f.write(chunk) @@ -515,297 +421,112 @@ class BenchmarkDataManager(BaseComponent): raise ValueError("No valid directory found after extraction") return os.path.join(self.temp_dir, extracted_dirs[0]) - def _discover_csv_files(self, base_dir: str, search_dir: str) -> List[Dict]: - """Find all CSV files recursively""" + def _discover_sqlite_files(self, base_dir: str, search_dir: str) -> List[str]: + """Find all SQLite files recursively in the search directory""" full_search_dir = os.path.join(base_dir, search_dir) if search_dir else base_dir if not os.path.exists(full_search_dir): raise ValueError(f"Directory not found: {full_search_dir}") - csv_files = [] + sqlite_files = [] for root, _, files in os.walk(full_search_dir): for file in files: - if file.lower().endswith(".csv"): - rel_path = os.path.relpath(root, start=base_dir) - csv_files.append( - { - "full_path": os.path.join(root, file), - "rel_path": rel_path, - 
"file_name": file, - } - ) - return csv_files + if file.lower().endswith(".sqlite"): + full_path = os.path.join(root, file) + sqlite_files.append(full_path) + return sqlite_files - async def _import_to_database(self, csv_files: List[Dict]) -> Dict: - """Import CSV data to SQLite""" + async def _merge_sqlite_databases(self, sqlite_files: List[str]) -> Dict: + """Merge multiple SQLite files into the main database""" await self.init_connector() assert self._connector is not None - results = { - "total_files": len(csv_files), - "successful": 0, - "failed": 0, - "tables_created": [], - } - def _process_one_file(file_info: Dict) -> Tuple[bool, Optional[str]]: - table_name = "" - try: - path_parts = [p for p in file_info["rel_path"].split(os.sep) if p] - table_name = "_".join(path_parts + [Path(file_info["file_name"]).stem]) - table_name = self._sanitize_table_name(table_name) - - with self._connector.session_scope() as session: - session.execute(text(f'DROP TABLE IF EXISTS "{table_name}"')) - session.commit() - encodings = ["utf-8-sig", "utf-8", "latin-1", "iso-8859-1", "cp1252"] - - for encoding in encodings: - try: - with open(file_info["full_path"], "r", encoding=encoding) as f: - content = f.read() - - if not content.strip(): - raise ValueError("File is empty") - - content = content.replace("\r\n", "\n").replace("\r", "\n") - lines = [line for line in content.split("\n") if line.strip()] - if not lines: - raise ValueError("No data after normalization") - - header_line = lines[0] - data_line = lines[1] if len(lines) > 1 else "" - - try: - sample_for_sniff = "\n".join(lines[:10]) - sniffer = csv.Sniffer() - try: - dialect = sniffer.sniff(sample_for_sniff) - except Exception: - # Fallback: choose delimiter by counting common - # separators in header/data line - delims = [",", "\t", ";", "|"] - counts = { - d: (header_line.count(d) if header_line else 0) - + (data_line.count(d) if data_line else 0) - for d in delims - } - best = ( - max(counts, key=counts.get) - if any(counts.values()) - else "," - ) - - class _DefaultDialect(csv.Dialect): - delimiter = best - quotechar = '"' - doublequote = True - skipinitialspace = False - lineterminator = "\n" - quoting = csv.QUOTE_MINIMAL - - dialect = _DefaultDialect() - - try: - has_header = sniffer.has_header("\n".join(lines[:50])) - except Exception: - has_header = True - - header_row = ( - list(csv.reader([header_line], dialect))[0] - if header_line - else [] - ) - first_data_row = ( - list(csv.reader([data_line], dialect))[0] - if data_line - else [] - ) - - # Heuristic: if has_header is False but header_row looks - # like names (mostly alphabetic), treat as header - if not has_header: - - def _looks_like_header(tokens: List[str]) -> bool: - if not tokens: - return False - # 非空、重复少、字母比例高 - cleaned = [ - str(t).strip() for t in tokens if str(t).strip() - ] - if not cleaned: - return False - # 允许少量数字,但大多以字母开头 - alpha_starts = sum( - 1 - for t in cleaned - if t and (t[0].isalpha() or t[0] == "_") - ) - return alpha_starts >= max( - 1, int(0.6 * len(cleaned)) - ) - - if _looks_like_header(header_row): - has_header = True - - if not has_header: - num_cols_guess = len(header_row) - headers = [f"col_{i}" for i in range(num_cols_guess)] - first_data_row = header_row - else: - headers = header_row - - num_cols = ( - len(first_data_row) if first_data_row else len(headers) - ) - - # no header - if not headers or all( - (not str(h).strip()) for h in headers - ): - headers = [f"col_{i}" for i in range(num_cols or 1)] - - headers = 
self._sanitize_and_dedup_headers(headers) - - if num_cols <= 0: - num_cols = len(headers) - headers = headers[:num_cols] - if not headers or any( - h is None or h == "" for h in headers - ): - raise csv.Error("Invalid headers after sanitization") - - create_sql = f''' - CREATE TABLE IF NOT EXISTS "{table_name}" ( - {", ".join([f'"{h}" TEXT' for h in headers])} - ) - ''' - insert_sql = f''' - INSERT INTO "{table_name}" ({ - ", ".join([f'"{h}"' for h in headers]) - }) - VALUES ({ - ", ".join([":" + f"p{i}" for i in range(len(headers))]) - }) - ''' - - with self._connector.session_scope() as session: - logger.debug( - f"Table: {table_name}, headers(final): {headers}" - ) - session.execute(text(create_sql)) - - reader = csv.reader(lines, dialect) - if has_header: - next(reader, None) - - batch_params: List[Dict[str, Any]] = [] - for row in reader: - if not row: - continue - if len(row) != len(headers): - if len(row) < len(headers): - row += [None] * (len(headers) - len(row)) - else: - row = row[: len(headers)] - params = { - f"p{i}": (row[i] if i < len(row) else None) - for i in range(len(headers)) - } - batch_params.append(params) - if len(batch_params) >= 1000: - session.execute(text(insert_sql), batch_params) - batch_params = [] - if batch_params: - session.execute(text(insert_sql), batch_params) - session.commit() - - return True, table_name - - except csv.Error: - self._import_with_simple_split_blocking(table_name, content) - return True, table_name - - except UnicodeDecodeError: - continue - except Exception as e: - logger.warning(f"Error with encoding {encoding}: {str(e)}") - continue + def _worker(): + results = { + "total_files": len(sqlite_files), + "successful": 0, + "failed": 0, + "tables_merged": [], + } + with self._connector.session_scope() as session: + # 获取底层的 sqlite3 连接对象 + connection_proxy = session.connection() + # 兼容不同版本的 SQLAlchemy 获取底层连接的方式 try: - with open(file_info["full_path"], "rb") as f: - content = f.read().decode("ascii", errors="ignore") - if content.strip(): - self._import_with_simple_split_blocking(table_name, content) - return True, table_name - else: - raise ValueError("File is empty or unreadable") - except Exception as e: - return ( - False, - f"Failed to process {file_info['file_name']}: {str(e)}", - ) + # SQLAlchemy 1.4+ / 2.0 + raw_conn = connection_proxy.connection.dbapi_connection + except AttributeError: + try: + # 旧版本或某些驱动 + raw_conn = connection_proxy.connection + except AttributeError: + # 最后的尝试 + raw_conn = session.get_bind().raw_connection() - except Exception as e: - return ( - False, - f"Failed to process {file_info.get('full_path', '')}: {str(e)}", - ) + # 确保 raw_conn 是 sqlite3 的连接对象 + if not raw_conn: + raise RuntimeError("Failed to get raw sqlite3 connection") - for file_info in csv_files: - ok, info = await self._run_in_thread(_process_one_file, file_info) - if ok: - results["successful"] += 1 - if info: - results["tables_created"].append(info) - else: - results["failed"] += 1 - logger.error(info) + cursor = raw_conn.cursor() - return results + for db_path in sqlite_files: + src_alias = f"src_db_{uuid.uuid4().hex[:8]}" + try: + try: + cursor.execute("PRAGMA database_list") + attached_dbs = cursor.fetchall() + for _, name, _ in attached_dbs: + if name not in ("main", "temp"): + cursor.execute(f"DETACH DATABASE {name}") + except Exception as cleanup_err: + logger.warning(f"Cleanup warning: {cleanup_err}") - def _import_with_simple_split_blocking(self, table_name: str, content: str): - """Fallback method for malformed CSV files (blocking, 使用 
SQLAlchemy 执行)""" - assert self._connector is not None - content = content.replace("\r\n", "\n").replace("\r", "\n") - lines = [line for line in content.split("\n") if line.strip()] - if not lines: - raise ValueError("No data found after cleaning") + cursor.execute(f"ATTACH DATABASE ? AS {src_alias}", (db_path,)) - first_line = lines[0] - delimiter = "," if "," in first_line else "\t" if "\t" in first_line else ";" + cursor.execute( + f"SELECT name, sql FROM {src_alias}.sqlite_master " + f"WHERE type='table' AND name NOT LIKE 'sqlite_%'" + ) + tables = cursor.fetchall() - raw_headers = first_line.split(delimiter) - headers = self._sanitize_and_dedup_headers(raw_headers) - actual_columns = len(headers) + for table_name, create_sql in tables: + cursor.execute( + "SELECT name FROM sqlite_master " + "WHERE type='table' " + "AND name=?", + (table_name,), + ) + if not cursor.fetchone(): + cursor.execute(create_sql) + cursor.execute( + f'INSERT INTO main."{table_name}" ' + f'SELECT * FROM {src_alias}."{table_name}"' + ) + results["tables_merged"].append(table_name) + else: + logger.warning( + f"Table '{table_name}' exists. Skipping." + ) - create_sql = f""" - CREATE TABLE IF NOT EXISTS "{table_name}" ( - {", ".join([f'"{h}" TEXT' for h in headers])} - ) - """ + raw_conn.commit() + results["successful"] += 1 - insert_sql = f""" - INSERT INTO "{table_name}" ({", ".join([f'"{h}"' for h in headers])}) - VALUES ({", ".join([":" + f"p{i}" for i in range(actual_columns)])}) - """ + except Exception as e: + logger.error(f"Failed to merge {db_path}: {e}") + results["failed"] += 1 + try: + raw_conn.rollback() + except Exception: + pass + finally: + try: + cursor.execute(f"DETACH DATABASE {src_alias}") + except Exception: + pass - with self._connector.session_scope() as session: - session.execute(text(create_sql)) - batch: List[Dict[str, Any]] = [] - for line in lines[1:]: - row = line.split(delimiter) - if len(row) != actual_columns: - if len(row) < actual_columns: - row += [None] * (actual_columns - len(row)) - else: - row = row[:actual_columns] - params = {f"p{i}": row[i] for i in range(actual_columns)} - batch.append(params) - if len(batch) >= 1000: - session.execute(text(insert_sql), batch) - batch = [] - if batch: - session.execute(text(insert_sql), batch) - session.commit() + return results + + return await self._run_in_thread(_worker) async def get_table_info_simple(self) -> List[str]: """Return simplified table info: table(column1,column2,...)""" diff --git a/pilot/benchmark_meta_data/table_mapping.json b/pilot/benchmark_meta_data/table_mapping.json deleted file mode 100644 index 25ed34b99..000000000 --- a/pilot/benchmark_meta_data/table_mapping.json +++ /dev/null @@ -1,95 +0,0 @@ -{ - "data_source_10_indexdata": "ant_icube_dev.stock_exchange_index_data", - "data_source_10_indexinfo": "ant_icube_dev.stock_exchange_index_info", - "data_source_11_price": "ant_icube_dev.bakery_sales_price", - "data_source_11_sales": "ant_icube_dev.bakery_sales_sale", - "data_source_12_events1": "ant_icube_dev.google_merchandise_events", - "data_source_12_items": "ant_icube_dev.google_merchandise_items", - "data_source_12_users": "ant_icube_dev.google_merchandise_users", - "data_source_13_features": "ant_icube_dev.walmart_features", - "data_source_13_sales": "ant_icube_dev.walmart_sales", - "data_source_13_stores": "ant_icube_dev.walmart_stores", - "data_source_14_inventory": "ant_icube_dev.mexico_toy_inventory", - "data_source_14_products": "ant_icube_dev.mexico_toy_products", - "data_source_14_sales": 
"ant_icube_dev.mexico_toy_sales", - "data_source_14_stores": "ant_icube_dev.mexico_toy_stores", - "data_source_15_cardbase": "ant_icube_dev.credit_card_card_base", - "data_source_15_customerbase": "ant_icube_dev.credit_card_customer_base", - "data_source_15_fraudbase": "ant_icube_dev.credit_card_fraud_base", - "data_source_15_transactionbase": "ant_icube_dev.credit_card_transaction_base", - "data_source_16_marks": "ant_icube_dev.school_marks", - "data_source_16_students": "ant_icube_dev.school_students", - "data_source_16_subjects": "ant_icube_dev.school_subject", - "data_source_16_teachers": "ant_icube_dev.school_teachers", - "data_source_17_df_customers": "ant_icube_dev.ecommerce_order_customers", - "data_source_17_df_orderitems": "ant_icube_dev.ecommerce_order_order_items", - "data_source_17_df_orders": "ant_icube_dev.ecommerce_order_orders", - "data_source_17_df_payments": "ant_icube_dev.ecommerce_order_payments", - "data_source_17_df_products": "ant_icube_dev.ecommerce_order_products", - "data_source_18_corruption": "ant_icube_dev.world_economic_corruption", - "data_source_18_cost_of_living": "ant_icube_dev.world_economic_cost_of_living", - "data_source_18_richest_countries": "ant_icube_dev.world_economic_richest_countries", - "data_source_18_tourism": "ant_icube_dev.world_economic_tourism", - "data_source_18_unemployment": "ant_icube_dev.world_economic_unemployment", - "data_source_19_drinks": "ant_icube_dev.alcohol_and_life_expectancy_drinks", - "data_source_19_lifeexpectancy-verbose": "ant_icube_dev.alcohol_and_life_expectancy_verbose", - "data_source_1_finance_data": "ant_icube_dev.di_finance_data", - "data_source_20_drivers_data": "ant_icube_dev.city_ride_data_drivers", - "data_source_20_rides_data": "ant_icube_dev.city_ride_data_rides", - "data_source_21_e_customers": "ant_icube_dev.di_data_cleaning_for_customer_database_e_customers", - "data_source_21_e_orders": "ant_icube_dev.di_data_cleaning_for_customer_database_e_orders", - "data_source_21_e_products": "ant_icube_dev.di_data_cleaning_for_customer_database_e_products", - "data_source_22_ufc_country_data": "ant_icube_dev.ufc_country_data", - "data_source_22_ufc_events_stats": "ant_icube_dev.ufc_events_stats", - "data_source_22_ufc_fighters_stats": "ant_icube_dev.ufc_fighters_stats", - "data_source_23_ben10_aliens": "ant_icube_dev.di_ben10_alien_universe_realistic_battle_dataset_aliens", - "data_source_23_ben10_battles": "ant_icube_dev.di_ben10_alien_universe_realistic_battle_dataset_battles", - "data_source_23_ben10_enemies": "ant_icube_dev.di_ben10_alien_universe_realistic_battle_dataset_enemies", - "data_source_24_blinkit_customer_feedback": "ant_icube_dev.blinkit_customers", - "data_source_24_blinkit_customers": "ant_icube_dev.blinkit_customers", - "data_source_24_blinkit_delivery_performance": "ant_icube_dev.blinkit_delivery_performance", - "data_source_24_blinkit_inventory": "ant_icube_dev.blinkit_inventory", - "data_source_24_blinkit_inventorynew": "ant_icube_dev.blinkit_inventory", - "data_source_24_blinkit_marketing_performance": "ant_icube_dev.blinkit_delivery_performance", - "data_source_24_blinkit_order_items": "ant_icube_dev.blinkit_order_items", - "data_source_24_blinkit_orders": "ant_icube_dev.blinkit_orders", - "data_source_24_blinkit_products": "ant_icube_dev.blinkit_products", - "data_source_25_bakutech_bakutech_product_categories": "ant_icube_dev.tech_sales_product_categories", - "data_source_25_bakutech_bakutech_product_subcategories": "ant_icube_dev.tech_sales_product_subcategories", - 
"data_source_25_bakutech_bakutech_sales_data": "ant_icube_dev.tech_sales_sales_data", - "data_source_25_bakutech_bakutech_assets": "ant_icube_dev.tech_sales_assets", - "data_source_25_bakutech_bakutech_customer_lookup": "ant_icube_dev.tech_sales_customer_lookup", - "data_source_25_bakutech_bakutech_dates": "ant_icube_dev.tech_sales_dates", - "data_source_25_bakutech_bakutech_product_returns": "ant_icube_dev.tech_sales_product_returns", - "data_source_25_bakutech_bakutech_products_lookup": "ant_icube_dev.tech_sales_product_lookup", - "data_source_26_appearances": "ant_icube_dev.football_appereances", - "data_source_26_games": "ant_icube_dev.football_games", - "data_source_26_leagues": "ant_icube_dev.football_leagues", - "data_source_26_players": "ant_icube_dev.football_players", - "data_source_26_shots": "ant_icube_dev.football_shots", - "data_source_26_teams": "ant_icube_dev.football_teams", - "data_source_26_teamstats": "ant_icube_dev.football_teamstats", - "data_source_27_categories": "ant_icube_dev.grocery_sales_categories", - "data_source_27_cities": "ant_icube_dev.grocery_sales_cities", - "data_source_27_countries": "ant_icube_dev.grocery_sales_countries", - "data_source_27_customers": "ant_極cube_dev.grocery_sales_customers", - "data_source_27_employees": "ant_icube_dev.grocery_sales_employees", - "data_source_27_products": "ant_icube_dev.grocery_sales_products", - "data_source_27_sales": "ant_icube_dev.grocery_sales_sales", - "data_source_28_customers": "ant_icube_dev.online_shop_customers", - "data_source_28_order_items": "ant_icube_dev.online_shop_order_items", - "data_source_28_orders": "ant_icube_dev.online_shop_orders", - "data_source_28_payment": "ant_icube_dev.online_shop_payment", - "data_source_28_products": "ant_icube_dev.online_shop_products", - "data_source_28_reviews": "ant_icube_dev.online_shop_reviews", - "data_source_28_shipments": "ant_icube_dev.online_shop_shipments", - "data_source_28_suppliers": "ant_icube_dev.online_shop_suppliers", - "data_source_2_finance_loan_approval_prediction_data": "ant_icube_dev.di_finance_loan_approval_prediction_data", - "data_source_3_stock_details_5_years 3": "ant_icube_dev.di_massive_yahoo_finance_dataset_0805", - "data_source_4_wa_fn-usec_-accounts-receivable 2": "ant_icube_dev.di_finance_factoring_ibm_late_payment_histories", - "data_source_5_unicorns till sep 2022": "ant_icube_dev.di_unicorn_startups", - "data_source_6_sales dataset": "ant_icube_dev.di_sales_dataset", - "data_source_7_vgsales": "ant_icube_dev.di_video_game_sales", - "data_source_8_googleplaystore": "ant_icube_dev.di_google_play_store_apps", - "data_source_9_final": "ant_icube_dev.di_global_lnternet_users" -} \ No newline at end of file