diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/api/endpoints.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/api/endpoints.py index 401addc4c..9826487e3 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/api/endpoints.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/api/endpoints.py @@ -204,68 +204,47 @@ async def evaluation( ) -@router.get("/benchmark/list_results", dependencies=[Depends(check_api_key)]) -async def list_compare_runs(limit: int = 50, offset: int = 0): +@router.get("/benchmark/result/{serial_no}", dependencies=[Depends(check_api_key)]) +async def get_compare_run_detail(serial_no: str, limit: int = 200, offset: int = 0): dao = BenchmarkResultDao() - rows = dao.list_summaries(limit=limit, offset=offset) - result = [] - for s in rows: - result.append( + summaries = dao.list_summaries_by_task(serial_no, limit=10000, offset=0) + if not summaries: + return Result.succ( + {"serialNo": serial_no, "summaries": [], "metrics": {}, "cotTokens": {"total": 0, "byModel": {}}}) + + detail_list = [] + total_counts = {"right": 0, "wrong": 0, "failed": 0, "exception": 0} + round_ids = set() + for s in summaries: + r, w, f, e = s.right, s.wrong, s.failed, s.exception + denom_exec = max(r + w + f + e, 1) + accuracy = r / denom_exec + exec_rate = (r + w) / denom_exec + total_counts["right"] += r + total_counts["wrong"] += w + total_counts["failed"] += f + total_counts["exception"] += e + round_ids.add(s.round_id) + detail_list.append( { - "id": s.id, "roundId": s.round_id, + "llmCode": getattr(s, "llm_code", None), + "right": r, + "wrong": w, + "failed": f, + "exception": e, + "accuracy": accuracy, + "execRate": exec_rate, "outputPath": s.output_path, - "right": s.right, - "wrong": s.wrong, - "failed": s.failed, - "exception": s.exception, - "gmtCreated": s.gmt_created.isoformat() if s.gmt_created else None, } ) - return Result.succ(result) - -@router.get("/benchmark/result/{summary_id}", dependencies=[Depends(check_api_key)]) -async def get_compare_run_detail(summary_id: int, limit: int = 200, offset: int = 0): - dao = BenchmarkResultDao() - s = dao.get_summary_by_id(summary_id) - if not s: - raise HTTPException(status_code=404, detail="compare run not found") - compares = dao.list_compare_by_round_and_path( - s.round_id, s.output_path, limit=limit, offset=offset + return Result.succ( + { + "serialNo": serial_no, + "summaries": detail_list, + } ) - detail = { - "id": s.id, - "roundId": s.round_id, - "outputPath": s.output_path, - "summary": { - "right": s.right, - "wrong": s.wrong, - "failed": s.failed, - "exception": s.exception, - }, - "items": [ - { - "id": r.id, - "serialNo": r.serial_no, - "analysisModelId": r.analysis_model_id, - "question": r.question, - "prompt": r.prompt, - "standardAnswerSql": r.standard_answer_sql, - "llmOutput": r.llm_output, - "executeResult": json.loads(r.execute_result) - if r.execute_result - else None, - "errorMsg": r.error_msg, - "compareResult": r.compare_result, - "isExecute": r.is_execute, - "llmCount": r.llm_count, - "gmtCreated": r.gmt_created.isoformat() if r.gmt_created else None, - } - for r in compares - ], - } - return Result.succ(detail) @router.post("/execute_benchmark_task", dependencies=[Depends(check_api_key)]) @@ -396,6 +375,32 @@ async def download_benchmark_result( raise HTTPException(status_code=404, detail=str(e)) +@router.get("/benchmark/list_compare_tasks", dependencies=[Depends(check_api_key)]) +async def list_benchmark_tasks(limit: int = 50, offset: int = 0): + dao = BenchmarkResultDao() + tasks = 
dao.list_tasks(limit=limit, offset=offset) + result = [] + for task_id in tasks: + summaries = dao.list_summaries_by_task(task_id, limit=10000, offset=0) + result.append( + { + "serialNo": task_id, + "summaries": [ + { + "roundId": s.round_id, + "llmCode": getattr(s, "llm_code", None), + "right": s.right, + "wrong": s.wrong, + "failed": s.failed, + "exception": s.exception, + "outputPath": s.output_path, + } + for s in summaries + ], + } + ) + return Result.succ(result) + def init_endpoints(system_app: SystemApp, config: ServeConfig) -> None: """Initialize the endpoints""" global global_system_app diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py index 6d2b50c68..d18da5497 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py @@ -13,6 +13,7 @@ from sqlalchemy import ( Text, UniqueConstraint, desc, + func, ) from dbgpt.storage.metadata import BaseDao, Model @@ -81,12 +82,12 @@ class BenchmarkCompareEntity(Model): class BenchmarkSummaryEntity(Model): """Summary result for one round and one output path. - Counts of RIGHT/WRONG/FAILED/EXCEPTION. + Counts of RIGHT/WRONG/FAILED/EXCEPTION, per llm_code. """ __tablename__ = "benchmark_summary" __table_args__ = ( - UniqueConstraint("round_id", "output_path", name="uk_round_output"), + UniqueConstraint("round_id", "output_path", "llm_code", name="uk_round_output_llm"), ) id = Column( @@ -96,6 +97,8 @@ class BenchmarkSummaryEntity(Model): output_path = Column( String(512), nullable=False, comment="Original output file path" ) + task_serial_no = Column(String(255), nullable=True, comment="Task serial number (unique id per submitted task)") + llm_code = Column(String(255), nullable=True, comment="LLM code for this summary") right = Column(Integer, default=0, comment="RIGHT count") wrong = Column(Integer, default=0, comment="WRONG count") @@ -111,6 +114,7 @@ class BenchmarkSummaryEntity(Model): ) Index("idx_bm_sum_round", "round_id") + Index("idx_bm_sum_task", "task_serial_no") class BenchmarkResultDao(BaseDao): @@ -212,6 +216,53 @@ class BenchmarkResultDao(BaseDao): session.commit() return summary.id + def upsert_summary( + self, + round_id: int, + output_path: str, + llm_code: Optional[str], + right: int, + wrong: int, + failed: int, + exception: int, + task_serial_no: Optional[str] = None, + ) -> int: + """Upsert summary counts directly into DB (per llm_code), with task serial no.""" + with self.session() as session: + existing = ( + session.query(BenchmarkSummaryEntity) + .filter( + BenchmarkSummaryEntity.round_id == round_id, + BenchmarkSummaryEntity.output_path == output_path, + BenchmarkSummaryEntity.llm_code == llm_code, + ) + .first() + ) + if existing: + existing.right = right + existing.wrong = wrong + existing.failed = failed + existing.exception = exception + if task_serial_no is not None: + existing.task_serial_no = task_serial_no + existing.gmt_modified = datetime.now() + session.commit() + return existing.id + else: + summary = BenchmarkSummaryEntity( + round_id=round_id, + output_path=output_path, + llm_code=llm_code, + right=right, + wrong=wrong, + failed=failed, + exception=exception, + task_serial_no=task_serial_no, + ) + session.add(summary) + session.commit() + return summary.id + # Basic query helpers def list_compare_by_round(self, round_id: int, limit: int = 100, offset: int = 0): with self.session(commit=False) as session: @@ -237,7 +288,33 
@@ class BenchmarkResultDao(BaseDao): .first() ) - # New helpers for listing summaries and detail by id + def list_summaries_by_round(self, round_id: int, limit: int = 100, offset: int = 0): + with self.session(commit=False) as session: + return ( + session.query(BenchmarkSummaryEntity) + .filter(BenchmarkSummaryEntity.round_id == round_id) + .order_by(desc(BenchmarkSummaryEntity.id)) + .limit(limit) + .offset(offset) + .all() + ) + + def list_rounds(self, limit: int = 100, offset: int = 0): + with self.session(commit=False) as session: + rows = ( + session.query( + BenchmarkSummaryEntity.round_id, + func.max(BenchmarkSummaryEntity.gmt_created).label("last_time"), + ) + .group_by(BenchmarkSummaryEntity.round_id) + .order_by(desc("last_time")) + .limit(limit) + .offset(offset) + .all() + ) + # return only round ids in order + return [r[0] for r in rows] + def list_summaries(self, limit: int = 100, offset: int = 0): with self.session(commit=False) as session: return ( @@ -256,17 +333,32 @@ class BenchmarkResultDao(BaseDao): .first() ) - def list_compare_by_round_and_path( - self, round_id: int, output_path: str, limit: int = 200, offset: int = 0 - ): + def list_tasks(self, limit: int = 100, offset: int = 0) -> List[str]: + """List submitted task ids (task_serial_no), ordered by latest summary time.""" with self.session(commit=False) as session: - return ( - session.query(BenchmarkCompareEntity) - .filter( - BenchmarkCompareEntity.round_id == round_id, - BenchmarkCompareEntity.output_path == output_path, + rows = ( + session.query( + BenchmarkSummaryEntity.task_serial_no, + func.max(BenchmarkSummaryEntity.gmt_created).label("last_time"), ) - .order_by(desc(BenchmarkCompareEntity.id)) + .filter(BenchmarkSummaryEntity.task_serial_no.isnot(None)) + .group_by(BenchmarkSummaryEntity.task_serial_no) + .order_by(desc("last_time")) + .limit(limit) + .offset(offset) + .all() + ) + return [r[0] for r in rows] + + def list_summaries_by_task( + self, task_serial_no: str, limit: int = 1000, offset: int = 0 + ): + """List summaries for a given task (may include multiple rounds).""" + with self.session(commit=False) as session: + return ( + session.query(BenchmarkSummaryEntity) + .filter(BenchmarkSummaryEntity.task_serial_no == task_serial_no) + .order_by(desc(BenchmarkSummaryEntity.id)) .limit(limit) .offset(offset) .all() diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py index c4b6e494e..35d6a8095 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py @@ -78,57 +78,54 @@ class FileParseService(ABC): def summary_and_write_multi_round_benchmark_result( self, output_path: str, round_id: int ) -> str: - """Compute summary from the Excel file and return JSON string. + """Compute summary from the Excel file grouped by llmCode and return JSON list. - It will read the '_round{round_id}.xlsx' file and sheet - 'benchmark_compare_result', then count the compareResult column - (RIGHT/WRONG/FAILED/EXCEPTION) to build summary. + It reads the '_round{round_id}.xlsx' file and sheet + 'benchmark_compare_result', then for each llmCode counts the compareResult column + (RIGHT/WRONG/FAILED/EXCEPTION) to build summary list. 
""" try: base_name = Path(output_path).stem extension = Path(output_path).suffix if extension.lower() not in [".xlsx", ".xls"]: extension = ".xlsx" - excel_file = ( - Path(output_path).parent / f"{base_name}_round{round_id}{extension}" - ) + excel_file = Path(output_path).parent / f"{base_name}_round{round_id}{extension}" if not excel_file.exists(): logger.warning(f"summary excel not found: {excel_file}") - result = dict(right=0, wrong=0, failed=0, exception=0) - return json.dumps(result, ensure_ascii=False) + return json.dumps([], ensure_ascii=False) df = pd.read_excel(str(excel_file), sheet_name="benchmark_compare_result") - right = ( - int((df["compareResult"] == "RIGHT").sum()) - if "compareResult" in df.columns - else 0 - ) - wrong = ( - int((df["compareResult"] == "WRONG").sum()) - if "compareResult" in df.columns - else 0 - ) - failed = ( - int((df["compareResult"] == "FAILED").sum()) - if "compareResult" in df.columns - else 0 - ) - exception = ( - int((df["compareResult"] == "EXCEPTION").sum()) - if "compareResult" in df.columns - else 0 - ) + if "compareResult" not in df.columns: + logger.warning("compareResult column missing in excel") + return json.dumps([], ensure_ascii=False) + + # ensure llmCode column exists + if "llmCode" not in df.columns: + df["llmCode"] = None + + summaries = [] + for llm_code, group in df.groupby("llmCode"): + right = int((group["compareResult"] == "RIGHT").sum()) + wrong = int((group["compareResult"] == "WRONG").sum()) + failed = int((group["compareResult"] == "FAILED").sum()) + exception = int((group["compareResult"] == "EXCEPTION").sum()) + summaries.append( + { + "llmCode": None if pd.isna(llm_code) else str(llm_code), + "right": right, + "wrong": wrong, + "failed": failed, + "exception": exception, + } + ) - result = dict(right=right, wrong=wrong, failed=failed, exception=exception) logger.info( - f"[summary] summary computed from Excel for round={round_id}," - f" output_path={output_path} -> {result}" + f"[summary] computed per llmCode for round={round_id}, output_path={output_path} -> {summaries}" ) - return json.dumps(result, ensure_ascii=False) + return json.dumps(summaries, ensure_ascii=False) except Exception as e: logger.error(f"summary compute error from excel: {e}", exc_info=True) - result = dict(right=0, wrong=0, failed=0, exception=0) - return json.dumps(result, ensure_ascii=False) + return json.dumps([], ensure_ascii=False) def get_input_stream(self, location: str): """Get input stream from location diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py index da749511b..3cdff3f4a 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py @@ -6,6 +6,7 @@ from dbgpt.util.benchmarks import StorageUtil from dbgpt_serve.evaluate.service.fetchdata.benchmark_data_manager import ( get_benchmark_manager, ) +from dbgpt_serve.evaluate.db.benchmark_db import BenchmarkResultDao from .data_compare_service import DataCompareService from .file_parse_service import FileParseService @@ -197,6 +198,26 @@ class UserInputExecuteService: config.benchmark_mode_type == BenchmarkModeTypeEnum.EXECUTE, llm_count, ) + try: + summary_json = self.file_service.summary_and_write_multi_round_benchmark_result( + location, round_id + ) + import json as _json + + results = 
_json.loads(summary_json) if summary_json else [] + dao = BenchmarkResultDao() + for item in results: + llm_code = item.get("llmCode") + right = int(item.get("right", 0)) + wrong = int(item.get("wrong", 0)) + failed = int(item.get("failed", 0)) + exception = int(item.get("exception", 0)) + dao.upsert_summary(round_id, location, llm_code, right, wrong, failed, exception, task_serial_no=config.evaluate_code) + except Exception as e: + logger.error( + f"[execute_llm_compare_result] failed to compute summary from excel or write it to db: {e}", + exc_info=True, + ) def _convert_query_result_to_column_format( self, result: List[Dict]
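
For reference, the accuracy and execRate figures returned per summary row by the new GET /benchmark/result/{serial_no} endpoint reduce to two ratios over the four compare counts. The standalone sketch below mirrors those formulas; the sample counts are invented, not real benchmark output.

# Standalone sketch of the per-row metrics computed in get_compare_run_detail.
# The counts passed below are illustrative values only.

def summary_metrics(right: int, wrong: int, failed: int, exception: int) -> dict:
    # max(..., 1) guards against division by zero for an empty summary row.
    denom_exec = max(right + wrong + failed + exception, 1)
    return {
        # Fraction of all attempted cases whose result matched the standard answer.
        "accuracy": right / denom_exec,
        # Fraction of cases that executed at all (right or wrong), i.e. neither
        # failed to run nor raised an exception.
        "execRate": (right + wrong) / denom_exec,
    }


if __name__ == "__main__":
    print(summary_metrics(right=42, wrong=8, failed=3, exception=1))
    # -> {'accuracy': 0.7777..., 'execRate': 0.9259...}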
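
A possible client-side usage of the two endpoints added above is sketched next. The base URL (including the evaluate serve route prefix) and the Authorization header format are assumptions, not taken from this diff; adjust them to match how the router and check_api_key are actually mounted in your deployment.

# Hypothetical client for the new benchmark endpoints. BASE_URL and the
# auth header scheme are placeholders (assumptions), not defined by this patch.
import requests

BASE_URL = "http://localhost:5670/api/v2/serve/evaluate"  # assumed route prefix
HEADERS = {"Authorization": "Bearer YOUR_API_KEY"}  # assumed auth scheme


def list_compare_tasks(limit: int = 50, offset: int = 0) -> list:
    resp = requests.get(
        f"{BASE_URL}/benchmark/list_compare_tasks",
        params={"limit": limit, "offset": offset},
        headers=HEADERS,
        timeout=30,
    )
    resp.raise_for_status()
    # Result.succ(...) typically serializes the payload under "data".
    return resp.json().get("data", [])


def get_task_detail(serial_no: str) -> dict:
    resp = requests.get(
        f"{BASE_URL}/benchmark/result/{serial_no}", headers=HEADERS, timeout=30
    )
    resp.raise_for_status()
    return resp.json().get("data", {})


if __name__ == "__main__":
    for task in list_compare_tasks(limit=5):
        detail = get_task_detail(task["serialNo"])
        for s in detail.get("summaries", []):
            print(task["serialNo"], s.get("llmCode"), round(s.get("accuracy", 0.0), 4))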
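
The task listing relies on grouping summaries by task_serial_no and ordering tasks by their most recent gmt_created. The self-contained sketch below reproduces that query pattern against a simplified stand-in table on an in-memory SQLite database; it is not the real benchmark_summary model.

# Toy model and query illustrating BenchmarkResultDao.list_tasks:
# group by task_serial_no, order tasks by max(gmt_created) descending.
from datetime import datetime, timedelta

from sqlalchemy import (
    Column, DateTime, Integer, String, UniqueConstraint, create_engine, desc, func,
)
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class ToySummary(Base):
    __tablename__ = "toy_benchmark_summary"
    # One row per (round_id, output_path, llm_code), as in the new constraint.
    __table_args__ = (UniqueConstraint("round_id", "output_path", "llm_code"),)

    id = Column(Integer, primary_key=True, autoincrement=True)
    round_id = Column(Integer, nullable=False)
    output_path = Column(String(512), nullable=False)
    task_serial_no = Column(String(255), nullable=True)
    llm_code = Column(String(255), nullable=True)
    right = Column(Integer, default=0)
    wrong = Column(Integer, default=0)
    gmt_created = Column(DateTime, default=datetime.now)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

with Session() as session:
    now = datetime.now()
    session.add_all([
        ToySummary(round_id=1, output_path="a.xlsx", task_serial_no="task-A",
                   llm_code="model-1", right=10, wrong=2,
                   gmt_created=now - timedelta(hours=2)),
        ToySummary(round_id=1, output_path="a.xlsx", task_serial_no="task-A",
                   llm_code="model-2", right=8, wrong=4,
                   gmt_created=now - timedelta(hours=2)),
        ToySummary(round_id=2, output_path="b.xlsx", task_serial_no="task-B",
                   llm_code="model-1", right=11, wrong=1,
                   gmt_created=now - timedelta(hours=1)),
    ])
    session.commit()

    rows = (
        session.query(
            ToySummary.task_serial_no,
            func.max(ToySummary.gmt_created).label("last_time"),
        )
        .filter(ToySummary.task_serial_no.isnot(None))
        .group_by(ToySummary.task_serial_no)
        .order_by(desc("last_time"))
        .all()
    )
    print([r[0] for r in rows])  # ['task-B', 'task-A'] (newest task first)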
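
The per-llmCode counting in summary_and_write_multi_round_benchmark_result can be reproduced on a toy DataFrame as below. Note one subtlety: pandas groupby drops NaN keys by default, so rows with a missing llmCode are excluded unless dropna=False is passed; this sketch keeps them explicitly, while the patched method groups on the column directly.

# Toy reproduction of the per-llmCode summary counting. The DataFrame stands in
# for the 'benchmark_compare_result' sheet read from the round Excel file.
import json

import pandas as pd

df = pd.DataFrame(
    {
        "llmCode": ["model-1", "model-1", "model-1", "model-2", "model-2", None],
        "compareResult": ["RIGHT", "WRONG", "RIGHT", "RIGHT", "EXCEPTION", "FAILED"],
    }
)

summaries = []
# dropna=False keeps rows whose llmCode is missing as their own group.
for llm_code, group in df.groupby("llmCode", dropna=False):
    summaries.append(
        {
            "llmCode": None if pd.isna(llm_code) else str(llm_code),
            "right": int((group["compareResult"] == "RIGHT").sum()),
            "wrong": int((group["compareResult"] == "WRONG").sum()),
            "failed": int((group["compareResult"] == "FAILED").sum()),
            "exception": int((group["compareResult"] == "EXCEPTION").sum()),
        }
    )

print(json.dumps(summaries, ensure_ascii=False, indent=2))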
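
Finally, summary_and_write_multi_round_benchmark_result used to return a single dict and now returns a JSON list of per-llmCode dicts. If any other caller may still see the old shape (for example during a rolling upgrade), a small normalizer such as the hypothetical helper below keeps consumption uniform; it is a suggestion, not part of this patch.

# Hypothetical helper that accepts either the old dict-shaped summary JSON or
# the new list-of-dicts shape and returns a uniform list of per-llmCode counts.
import json
from typing import List, Optional


def normalize_summary_json(summary_json: Optional[str]) -> List[dict]:
    if not summary_json:
        return []
    parsed = json.loads(summary_json)
    if isinstance(parsed, dict):
        # Old single-summary shape: wrap it as one entry without an llmCode.
        parsed = [{"llmCode": None, **parsed}]
    out = []
    for item in parsed:
        out.append(
            {
                "llmCode": item.get("llmCode"),
                "right": int(item.get("right", 0)),
                "wrong": int(item.get("wrong", 0)),
                "failed": int(item.get("failed", 0)),
                "exception": int(item.get("exception", 0)),
            }
        )
    return out


print(normalize_summary_json('[{"llmCode": "model-1", "right": 3, "wrong": 1}]'))
print(normalize_summary_json('{"right": 5, "wrong": 0, "failed": 1, "exception": 0}'))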