feat: async loading benchmark data

This commit is contained in:
yaoyifan-yyf
2025-09-25 11:26:13 +08:00
parent a0f413b915
commit 9e50bff12c
7 changed files with 614 additions and 0 deletions

View File

@@ -1,3 +1,4 @@
import asyncio
import logging
import os
import sys
@@ -35,6 +36,7 @@ from dbgpt_app.base import (
from dbgpt_app.component_configs import initialize_components
from dbgpt_app.config import ApplicationConfig, ServiceWebParameters, SystemParameters
from dbgpt_serve.core import add_exception_handler
from dbgpt_serve.evaluate.service.fetchdata.benchmark_data_manager import BenchmarkDataManager, get_benchmark_manager
logger = logging.getLogger(__name__)
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -144,6 +146,15 @@ def initialize_app(param: ApplicationConfig, args: List[str] = None):
# After init, when the database is ready
system_app.after_init()
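# Load the benchmark dataset from Falcon: scheduled as a background task when
# an event loop is already running, otherwise run to completion right here.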
loop = asyncio.get_event_loop()
if loop.is_running():
loop.create_task(load_benchmark_data())
else:
loop.run_until_complete(load_benchmark_data())
binding_port = web_config.port
binding_host = web_config.host
if not web_config.light:
@@ -319,6 +330,43 @@ def parse_args():
return parser.parse_args()
async def load_benchmark_data() -> dict:
    """Load the Falcon benchmark data set and summarize the import.

    Obtains the benchmark manager for ``system_app``, enters it as an async
    context manager, asks it to import ``data/source`` from the
    ``inclusionAI/Falcon`` GitHub repository, logs the import counters, and
    then queries the manager for the resulting tables.

    Returns:
        A dict with two keys: ``"import_result"`` (the value returned by
        ``manager.load_from_github``, read here through its ``total_files``,
        ``successful`` and ``failed`` entries) and ``"table_info"`` (the
        value returned by ``manager.get_table_info``).

    Raises:
        RuntimeError: If any step raises an ``Exception``. The failure is
            logged with its traceback and chained as ``__cause__``.
    """
    # NOTE(review): basicConfig configures the process-wide root logger and is
    # a no-op when the root logger already has handlers. Kept so behaviour is
    # unchanged; consider moving it to application start-up.
    logging.basicConfig(level=logging.INFO)
    logger.info("Starting benchmark data loading process...")

    try:
        manager = get_benchmark_manager(system_app)

        async with manager:
            logger.info("Fetching data from GitHub repository...")
            result = await manager.load_from_github(
                repo_url="https://github.com/inclusionAI/Falcon",
                data_dir="data/source",
            )

            # Log detailed results; %-style arguments defer formatting to the
            # logging framework.
            logger.info("\nBenchmark Data Loading Summary:")
            logger.info("Total CSV files processed: %s", result["total_files"])
            logger.info("Successfully imported: %s", result["successful"])
            logger.info("Failed imports: %s", result["failed"])

            if result["failed"] > 0:
                logger.warning(
                    "Encountered %s failures during import", result["failed"]
                )

            # Verify the loaded data while the manager context is still open.
            table_info = await manager.get_table_info()
            logger.info("Loaded %s tables into database", len(table_info))

            return {
                "import_result": result,
                "table_info": table_info,
            }
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Failed to load benchmark data")
        raise RuntimeError(f"Benchmark data loading failed: {e}") from e
if __name__ == "__main__":
# Parse command line arguments
_args = parse_args()