diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py index 88b004ae1..f0c53b7b3 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py @@ -84,6 +84,7 @@ class FileParseService(ABC): "selfDefineTags", "prompt", "standardAnswerSql", + "standardAnswer", "llmCode", "llmOutput", "executeResult", @@ -121,6 +122,9 @@ class FileParseService(ABC): cm.selfDefineTags, cm.prompt, cm.standardAnswerSql, + json.dumps(cm.standardAnswer, ensure_ascii=False) + if cm.standardAnswer is not None + else "", cm.llmCode, cm.llmOutput, json.dumps(cm.executeResult, ensure_ascii=False) diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py index a4308edb4..e75de3f6a 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py @@ -117,6 +117,7 @@ class RoundAnswerConfirmModel: selfDefineTags: Optional[str] prompt: Optional[str] standardAnswerSql: Optional[str] = None + standardAnswer: Optional[Dict[str, List[str]]] = None strategyConfig: Optional[DataCompareStrategyConfig] = None llmOutput: Optional[str] = None executeResult: Optional[Dict[str, List[str]]] = None diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py index 72934e567..2a0a3c4eb 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py @@ -128,14 +128,24 @@ 
class UserInputExecuteService: strategy_cfg = None standard_sql = None + standard_answer = None if left is not None: standard_sql = left.llmOutput if config.benchmark_mode_type == BenchmarkModeTypeEnum.EXECUTE: strategy_cfg = left.strategyConfig + # Prefer the left side's execute result as the standard answer; if absent, fall back to the first item of the strategy config's standard_result + if left.executeResult is not None: + standard_answer = left.executeResult + elif left.strategyConfig and left.strategyConfig.standard_result: + try: + standard_answer = left.strategyConfig.standard_result[0] + except Exception: + standard_answer = None else: standard_result_list = [] if left.executeResult: standard_result_list.append(left.executeResult) + standard_answer = left.executeResult strategy_cfg = DataCompareStrategyConfig( strategy="EXACT_MATCH", order_by=True, @@ -173,6 +183,7 @@ class UserInputExecuteService: selfDefineTags=inp.self_define_tags, prompt=inp.prompt, standardAnswerSql=standard_sql, + standardAnswer=standard_answer, strategyConfig=strategy_cfg, llmOutput=right.llmOutput if right else None, executeResult=right.executeResult if right else None,