diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/api/endpoints.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/api/endpoints.py index 401addc4c..9826487e3 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/api/endpoints.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/api/endpoints.py @@ -204,68 +204,47 @@ async def evaluation( ) -@router.get("/benchmark/list_results", dependencies=[Depends(check_api_key)]) -async def list_compare_runs(limit: int = 50, offset: int = 0): +@router.get("/benchmark/result/{serial_no}", dependencies=[Depends(check_api_key)]) +async def get_compare_run_detail(serial_no: str, limit: int = 200, offset: int = 0): dao = BenchmarkResultDao() - rows = dao.list_summaries(limit=limit, offset=offset) - result = [] - for s in rows: - result.append( + summaries = dao.list_summaries_by_task(serial_no, limit=10000, offset=0) + if not summaries: + return Result.succ( + {"serialNo": serial_no, "summaries": [], "metrics": {}, "cotTokens": {"total": 0, "byModel": {}}}) + + detail_list = [] + total_counts = {"right": 0, "wrong": 0, "failed": 0, "exception": 0} + round_ids = set() + for s in summaries: + r, w, f, e = s.right, s.wrong, s.failed, s.exception + denom_exec = max(r + w + f + e, 1) + accuracy = r / denom_exec + exec_rate = (r + w) / denom_exec + total_counts["right"] += r + total_counts["wrong"] += w + total_counts["failed"] += f + total_counts["exception"] += e + round_ids.add(s.round_id) + detail_list.append( { - "id": s.id, "roundId": s.round_id, + "llmCode": getattr(s, "llm_code", None), + "right": r, + "wrong": w, + "failed": f, + "exception": e, + "accuracy": accuracy, + "execRate": exec_rate, "outputPath": s.output_path, - "right": s.right, - "wrong": s.wrong, - "failed": s.failed, - "exception": s.exception, - "gmtCreated": s.gmt_created.isoformat() if s.gmt_created else None, } ) - return Result.succ(result) - -@router.get("/benchmark/result/{summary_id}", dependencies=[Depends(check_api_key)]) -async def get_compare_run_detail(summary_id: int, limit: int = 200, offset: int = 0): - dao = BenchmarkResultDao() - s = dao.get_summary_by_id(summary_id) - if not s: - raise HTTPException(status_code=404, detail="compare run not found") - compares = dao.list_compare_by_round_and_path( - s.round_id, s.output_path, limit=limit, offset=offset + return Result.succ( + { + "serialNo": serial_no, + "summaries": detail_list, + } ) - detail = { - "id": s.id, - "roundId": s.round_id, - "outputPath": s.output_path, - "summary": { - "right": s.right, - "wrong": s.wrong, - "failed": s.failed, - "exception": s.exception, - }, - "items": [ - { - "id": r.id, - "serialNo": r.serial_no, - "analysisModelId": r.analysis_model_id, - "question": r.question, - "prompt": r.prompt, - "standardAnswerSql": r.standard_answer_sql, - "llmOutput": r.llm_output, - "executeResult": json.loads(r.execute_result) - if r.execute_result - else None, - "errorMsg": r.error_msg, - "compareResult": r.compare_result, - "isExecute": r.is_execute, - "llmCount": r.llm_count, - "gmtCreated": r.gmt_created.isoformat() if r.gmt_created else None, - } - for r in compares - ], - } - return Result.succ(detail) @router.post("/execute_benchmark_task", dependencies=[Depends(check_api_key)]) @@ -396,6 +375,32 @@ async def download_benchmark_result( raise HTTPException(status_code=404, detail=str(e)) +@router.get("/benchmark/list_compare_tasks", dependencies=[Depends(check_api_key)]) +async def list_benchmark_tasks(limit: int = 50, offset: int = 0): + dao = BenchmarkResultDao() + tasks = 
dao.list_tasks(limit=limit, offset=offset) + result = [] + for task_id in tasks: + summaries = dao.list_summaries_by_task(task_id, limit=10000, offset=0) + result.append( + { + "serialNo": task_id, + "summaries": [ + { + "roundId": s.round_id, + "llmCode": getattr(s, "llm_code", None), + "right": s.right, + "wrong": s.wrong, + "failed": s.failed, + "exception": s.exception, + "outputPath": s.output_path, + } + for s in summaries + ], + } + ) + return Result.succ(result) + def init_endpoints(system_app: SystemApp, config: ServeConfig) -> None: """Initialize the endpoints""" global global_system_app diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py index 6d2b50c68..d18da5497 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py @@ -13,6 +13,7 @@ from sqlalchemy import ( Text, UniqueConstraint, desc, + func, ) from dbgpt.storage.metadata import BaseDao, Model @@ -81,12 +82,12 @@ class BenchmarkCompareEntity(Model): class BenchmarkSummaryEntity(Model): """Summary result for one round and one output path. - Counts of RIGHT/WRONG/FAILED/EXCEPTION. + Counts of RIGHT/WRONG/FAILED/EXCEPTION, per llm_code. """ __tablename__ = "benchmark_summary" __table_args__ = ( - UniqueConstraint("round_id", "output_path", name="uk_round_output"), + UniqueConstraint("round_id", "output_path", "llm_code", name="uk_round_output_llm"), ) id = Column( @@ -96,6 +97,8 @@ class BenchmarkSummaryEntity(Model): output_path = Column( String(512), nullable=False, comment="Original output file path" ) + task_serial_no = Column(String(255), nullable=True, comment="Task serial number (unique id per submitted task)") + llm_code = Column(String(255), nullable=True, comment="LLM code for this summary") right = Column(Integer, default=0, comment="RIGHT count") wrong = Column(Integer, default=0, comment="WRONG count") @@ -111,6 +114,7 @@ class BenchmarkSummaryEntity(Model): ) Index("idx_bm_sum_round", "round_id") + Index("idx_bm_sum_task", "task_serial_no") class BenchmarkResultDao(BaseDao): @@ -212,6 +216,53 @@ class BenchmarkResultDao(BaseDao): session.commit() return summary.id + def upsert_summary( + self, + round_id: int, + output_path: str, + llm_code: Optional[str], + right: int, + wrong: int, + failed: int, + exception: int, + task_serial_no: Optional[str] = None, + ) -> int: + """Upsert summary counts directly into DB (per llm_code), with task serial no.""" + with self.session() as session: + existing = ( + session.query(BenchmarkSummaryEntity) + .filter( + BenchmarkSummaryEntity.round_id == round_id, + BenchmarkSummaryEntity.output_path == output_path, + BenchmarkSummaryEntity.llm_code == llm_code, + ) + .first() + ) + if existing: + existing.right = right + existing.wrong = wrong + existing.failed = failed + existing.exception = exception + if task_serial_no is not None: + existing.task_serial_no = task_serial_no + existing.gmt_modified = datetime.now() + session.commit() + return existing.id + else: + summary = BenchmarkSummaryEntity( + round_id=round_id, + output_path=output_path, + llm_code=llm_code, + right=right, + wrong=wrong, + failed=failed, + exception=exception, + task_serial_no=task_serial_no, + ) + session.add(summary) + session.commit() + return summary.id + # Basic query helpers def list_compare_by_round(self, round_id: int, limit: int = 100, offset: int = 0): with self.session(commit=False) as session: @@ -237,7 +288,33 
@@ class BenchmarkResultDao(BaseDao): .first() ) - # New helpers for listing summaries and detail by id + def list_summaries_by_round(self, round_id: int, limit: int = 100, offset: int = 0): + with self.session(commit=False) as session: + return ( + session.query(BenchmarkSummaryEntity) + .filter(BenchmarkSummaryEntity.round_id == round_id) + .order_by(desc(BenchmarkSummaryEntity.id)) + .limit(limit) + .offset(offset) + .all() + ) + + def list_rounds(self, limit: int = 100, offset: int = 0): + with self.session(commit=False) as session: + rows = ( + session.query( + BenchmarkSummaryEntity.round_id, + func.max(BenchmarkSummaryEntity.gmt_created).label("last_time"), + ) + .group_by(BenchmarkSummaryEntity.round_id) + .order_by(desc("last_time")) + .limit(limit) + .offset(offset) + .all() + ) + # return only round ids in order + return [r[0] for r in rows] + def list_summaries(self, limit: int = 100, offset: int = 0): with self.session(commit=False) as session: return ( @@ -256,17 +333,32 @@ class BenchmarkResultDao(BaseDao): .first() ) - def list_compare_by_round_and_path( - self, round_id: int, output_path: str, limit: int = 200, offset: int = 0 - ): + def list_tasks(self, limit: int = 100, offset: int = 0) -> List[str]: + """List submitted task ids (task_serial_no), ordered by latest summary time.""" with self.session(commit=False) as session: - return ( - session.query(BenchmarkCompareEntity) - .filter( - BenchmarkCompareEntity.round_id == round_id, - BenchmarkCompareEntity.output_path == output_path, + rows = ( + session.query( + BenchmarkSummaryEntity.task_serial_no, + func.max(BenchmarkSummaryEntity.gmt_created).label("last_time"), ) - .order_by(desc(BenchmarkCompareEntity.id)) + .filter(BenchmarkSummaryEntity.task_serial_no.isnot(None)) + .group_by(BenchmarkSummaryEntity.task_serial_no) + .order_by(desc("last_time")) + .limit(limit) + .offset(offset) + .all() + ) + return [r[0] for r in rows] + + def list_summaries_by_task( + self, task_serial_no: str, limit: int = 1000, offset: int = 0 + ): + """List summaries for a given task (may include multiple rounds).""" + with self.session(commit=False) as session: + return ( + session.query(BenchmarkSummaryEntity) + .filter(BenchmarkSummaryEntity.task_serial_no == task_serial_no) + .order_by(desc(BenchmarkSummaryEntity.id)) .limit(limit) .offset(offset) .all() diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py index c4b6e494e..35d6a8095 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py @@ -78,57 +78,54 @@ class FileParseService(ABC): def summary_and_write_multi_round_benchmark_result( self, output_path: str, round_id: int ) -> str: - """Compute summary from the Excel file and return JSON string. + """Compute summary from the Excel file grouped by llmCode and return JSON list. - It will read the '_round{round_id}.xlsx' file and sheet - 'benchmark_compare_result', then count the compareResult column - (RIGHT/WRONG/FAILED/EXCEPTION) to build summary. + It reads the '_round{round_id}.xlsx' file and sheet + 'benchmark_compare_result', then for each llmCode counts the compareResult column + (RIGHT/WRONG/FAILED/EXCEPTION) to build summary list. 
""" try: base_name = Path(output_path).stem extension = Path(output_path).suffix if extension.lower() not in [".xlsx", ".xls"]: extension = ".xlsx" - excel_file = ( - Path(output_path).parent / f"{base_name}_round{round_id}{extension}" - ) + excel_file = Path(output_path).parent / f"{base_name}_round{round_id}{extension}" if not excel_file.exists(): logger.warning(f"summary excel not found: {excel_file}") - result = dict(right=0, wrong=0, failed=0, exception=0) - return json.dumps(result, ensure_ascii=False) + return json.dumps([], ensure_ascii=False) df = pd.read_excel(str(excel_file), sheet_name="benchmark_compare_result") - right = ( - int((df["compareResult"] == "RIGHT").sum()) - if "compareResult" in df.columns - else 0 - ) - wrong = ( - int((df["compareResult"] == "WRONG").sum()) - if "compareResult" in df.columns - else 0 - ) - failed = ( - int((df["compareResult"] == "FAILED").sum()) - if "compareResult" in df.columns - else 0 - ) - exception = ( - int((df["compareResult"] == "EXCEPTION").sum()) - if "compareResult" in df.columns - else 0 - ) + if "compareResult" not in df.columns: + logger.warning("compareResult column missing in excel") + return json.dumps([], ensure_ascii=False) + + # ensure llmCode column exists + if "llmCode" not in df.columns: + df["llmCode"] = None + + summaries = [] + for llm_code, group in df.groupby("llmCode"): + right = int((group["compareResult"] == "RIGHT").sum()) + wrong = int((group["compareResult"] == "WRONG").sum()) + failed = int((group["compareResult"] == "FAILED").sum()) + exception = int((group["compareResult"] == "EXCEPTION").sum()) + summaries.append( + { + "llmCode": None if pd.isna(llm_code) else str(llm_code), + "right": right, + "wrong": wrong, + "failed": failed, + "exception": exception, + } + ) - result = dict(right=right, wrong=wrong, failed=failed, exception=exception) logger.info( - f"[summary] summary computed from Excel for round={round_id}," - f" output_path={output_path} -> {result}" + f"[summary] computed per llmCode for round={round_id}, output_path={output_path} -> {summaries}" ) - return json.dumps(result, ensure_ascii=False) + return json.dumps(summaries, ensure_ascii=False) except Exception as e: logger.error(f"summary compute error from excel: {e}", exc_info=True) - result = dict(right=0, wrong=0, failed=0, exception=0) - return json.dumps(result, ensure_ascii=False) + return json.dumps([], ensure_ascii=False) def get_input_stream(self, location: str): """Get input stream from location diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py index da749511b..3cdff3f4a 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py @@ -6,6 +6,7 @@ from dbgpt.util.benchmarks import StorageUtil from dbgpt_serve.evaluate.service.fetchdata.benchmark_data_manager import ( get_benchmark_manager, ) +from dbgpt_serve.evaluate.db.benchmark_db import BenchmarkResultDao from .data_compare_service import DataCompareService from .file_parse_service import FileParseService @@ -197,6 +198,26 @@ class UserInputExecuteService: config.benchmark_mode_type == BenchmarkModeTypeEnum.EXECUTE, llm_count, ) + try: + summary_json = self.file_service.summary_and_write_multi_round_benchmark_result( + location, round_id + ) + import json as _json + + results = 
_json.loads(summary_json) if summary_json else [] + dao = BenchmarkResultDao() + for item in results: + llm_code = item.get("llmCode") + right = int(item.get("right", 0)) + wrong = int(item.get("wrong", 0)) + failed = int(item.get("failed", 0)) + exception = int(item.get("exception", 0)) + dao.upsert_summary(round_id, location, llm_code, right, wrong, failed, exception, task_serial_no=config.evaluate_code) + except Exception as e: + logger.error( + f"[execute_llm_compare_result] failed to compute summary from excel or write it to db: {e}", + exc_info=True, + ) def _convert_query_result_to_column_format( self, result: List[Dict]
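
For reference, the accuracy and execRate figures returned per summary row by the new GET /benchmark/result/{serial_no} endpoint reduce to two ratios over the four compare counts. The standalone sketch below mirrors those formulas; the sample counts are invented, not real benchmark output.

# Standalone sketch of the per-row metrics computed in get_compare_run_detail.
# The counts passed below are illustrative values only.

def summary_metrics(right: int, wrong: int, failed: int, exception: int) -> dict:
    # max(..., 1) guards against division by zero for an empty summary row.
    denom_exec = max(right + wrong + failed + exception, 1)
    return {
        # Fraction of all attempted cases whose result matched the standard answer.
        "accuracy": right / denom_exec,
        # Fraction of cases that executed at all (right or wrong), i.e. neither
        # failed to run nor raised an exception.
        "execRate": (right + wrong) / denom_exec,
    }


if __name__ == "__main__":
    print(summary_metrics(right=42, wrong=8, failed=3, exception=1))
    # -> {'accuracy': 0.7777..., 'execRate': 0.9259...}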
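
A possible client-side usage of the two endpoints added above is sketched next. The base URL (including the evaluate serve route prefix) and the Authorization header format are assumptions, not taken from this diff; adjust them to match how the router and check_api_key are actually mounted in your deployment.

# Hypothetical client for the new benchmark endpoints. BASE_URL and the
# auth header scheme are placeholders (assumptions), not defined by this patch.
import requests

BASE_URL = "http://localhost:5670/api/v2/serve/evaluate"  # assumed route prefix
HEADERS = {"Authorization": "Bearer YOUR_API_KEY"}  # assumed auth scheme


def list_compare_tasks(limit: int = 50, offset: int = 0) -> list:
    resp = requests.get(
        f"{BASE_URL}/benchmark/list_compare_tasks",
        params={"limit": limit, "offset": offset},
        headers=HEADERS,
        timeout=30,
    )
    resp.raise_for_status()
    # Result.succ(...) typically serializes the payload under "data".
    return resp.json().get("data", [])


def get_task_detail(serial_no: str) -> dict:
    resp = requests.get(
        f"{BASE_URL}/benchmark/result/{serial_no}", headers=HEADERS, timeout=30
    )
    resp.raise_for_status()
    return resp.json().get("data", {})


if __name__ == "__main__":
    for task in list_compare_tasks(limit=5):
        detail = get_task_detail(task["serialNo"])
        for s in detail.get("summaries", []):
            print(task["serialNo"], s.get("llmCode"), round(s.get("accuracy", 0.0), 4))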
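
The task listing relies on grouping summaries by task_serial_no and ordering tasks by their most recent gmt_created. The self-contained sketch below reproduces that query pattern against a simplified stand-in table on an in-memory SQLite database; it is not the real benchmark_summary model.

# Toy model and query illustrating BenchmarkResultDao.list_tasks:
# group by task_serial_no, order tasks by max(gmt_created) descending.
from datetime import datetime, timedelta

from sqlalchemy import (
    Column, DateTime, Integer, String, UniqueConstraint, create_engine, desc, func,
)
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class ToySummary(Base):
    __tablename__ = "toy_benchmark_summary"
    # One row per (round_id, output_path, llm_code), as in the new constraint.
    __table_args__ = (UniqueConstraint("round_id", "output_path", "llm_code"),)

    id = Column(Integer, primary_key=True, autoincrement=True)
    round_id = Column(Integer, nullable=False)
    output_path = Column(String(512), nullable=False)
    task_serial_no = Column(String(255), nullable=True)
    llm_code = Column(String(255), nullable=True)
    right = Column(Integer, default=0)
    wrong = Column(Integer, default=0)
    gmt_created = Column(DateTime, default=datetime.now)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

with Session() as session:
    now = datetime.now()
    session.add_all([
        ToySummary(round_id=1, output_path="a.xlsx", task_serial_no="task-A",
                   llm_code="model-1", right=10, wrong=2,
                   gmt_created=now - timedelta(hours=2)),
        ToySummary(round_id=1, output_path="a.xlsx", task_serial_no="task-A",
                   llm_code="model-2", right=8, wrong=4,
                   gmt_created=now - timedelta(hours=2)),
        ToySummary(round_id=2, output_path="b.xlsx", task_serial_no="task-B",
                   llm_code="model-1", right=11, wrong=1,
                   gmt_created=now - timedelta(hours=1)),
    ])
    session.commit()

    rows = (
        session.query(
            ToySummary.task_serial_no,
            func.max(ToySummary.gmt_created).label("last_time"),
        )
        .filter(ToySummary.task_serial_no.isnot(None))
        .group_by(ToySummary.task_serial_no)
        .order_by(desc("last_time"))
        .all()
    )
    print([r[0] for r in rows])  # ['task-B', 'task-A'] (newest task first)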
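
The per-llmCode counting in summary_and_write_multi_round_benchmark_result can be reproduced on a toy DataFrame as below. Note one subtlety: pandas groupby drops NaN keys by default, so rows with a missing llmCode are excluded unless dropna=False is passed; this sketch keeps them explicitly, while the patched method groups on the column directly.

# Toy reproduction of the per-llmCode summary counting. The DataFrame stands in
# for the 'benchmark_compare_result' sheet read from the round Excel file.
import json

import pandas as pd

df = pd.DataFrame(
    {
        "llmCode": ["model-1", "model-1", "model-1", "model-2", "model-2", None],
        "compareResult": ["RIGHT", "WRONG", "RIGHT", "RIGHT", "EXCEPTION", "FAILED"],
    }
)

summaries = []
# dropna=False keeps rows whose llmCode is missing as their own group.
for llm_code, group in df.groupby("llmCode", dropna=False):
    summaries.append(
        {
            "llmCode": None if pd.isna(llm_code) else str(llm_code),
            "right": int((group["compareResult"] == "RIGHT").sum()),
            "wrong": int((group["compareResult"] == "WRONG").sum()),
            "failed": int((group["compareResult"] == "FAILED").sum()),
            "exception": int((group["compareResult"] == "EXCEPTION").sum()),
        }
    )

print(json.dumps(summaries, ensure_ascii=False, indent=2))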
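
Finally, summary_and_write_multi_round_benchmark_result used to return a single dict and now returns a JSON list of per-llmCode dicts. If any other caller may still see the old shape (for example during a rolling upgrade), a small normalizer such as the hypothetical helper below keeps consumption uniform; it is a suggestion, not part of this patch.

# Hypothetical helper that accepts either the old dict-shaped summary JSON or
# the new list-of-dicts shape and returns a uniform list of per-llmCode counts.
import json
from typing import List, Optional


def normalize_summary_json(summary_json: Optional[str]) -> List[dict]:
    if not summary_json:
        return []
    parsed = json.loads(summary_json)
    if isinstance(parsed, dict):
        # Old single-summary shape: wrap it as one entry without an llmCode.
        parsed = [{"llmCode": None, **parsed}]
    out = []
    for item in parsed:
        out.append(
            {
                "llmCode": item.get("llmCode"),
                "right": int(item.get("right", 0)),
                "wrong": int(item.get("wrong", 0)),
                "failed": int(item.get("failed", 0)),
                "exception": int(item.get("exception", 0)),
            }
        )
    return out


print(normalize_summary_json('[{"llmCode": "model-1", "right": 3, "wrong": 1}]'))
print(normalize_summary_json('{"right": 5, "wrong": 0, "failed": 1, "exception": 0}'))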