From 4de41d1a25035c09be91c30444a7125b0dfb6d4b Mon Sep 17 00:00:00 2001 From: yaoyifan-yyf Date: Fri, 26 Sep 2025 17:47:11 +0800 Subject: [PATCH] feat: benchmark post_dispatch service --- .../evaluate/service/benchmark/__init__.py | 0 .../service/benchmark/benchmark_service.py | 0 .../service/benchmark/data/input_round1.jsonl | 4 + .../benchmark/data/output_execute_model.jsonl | 5 + .../output_execute_model.round1.compare.jsonl | 4 + .../output_execute_model.round1.summary.json | 6 + .../benchmark/data/output_round1_modelA.jsonl | 4 + .../benchmark/data/output_round1_modelB.jsonl | 4 + .../output_round1_modelB.round1.compare.jsonl | 4 + .../output_round1_modelB.round1.summary.json | 6 + .../benchmark/data/standard_answers.xlsx | Bin 0 -> 11844 bytes .../service/benchmark/data_compare_service.py | 144 ++++++++++++++++++ .../service/benchmark/file_parse_service.py | 118 ++++++++++++++ .../evaluate/service/benchmark/models.py | 116 ++++++++++++++ .../evaluate/service/benchmark/run_demo.py | 65 ++++++++ .../benchmark/user_input_execute_service.py | 108 +++++++++++++ .../fetchdata/benchmark_data_manager.py | 4 +- .../benchmark_meta_data/table_mapping.json | 0 18 files changed, 590 insertions(+), 2 deletions(-) delete mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/__init__.py delete mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/input_round1.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.compare.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.summary.json create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelA.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.compare.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.summary.json create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/standard_answers.xlsx create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data_compare_service.py create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/run_demo.py create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py rename table_mapping.json => pilot/benchmark_meta_data/table_mapping.json (100%) diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/__init__.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py deleted file mode 100644 index e69de29bb..000000000 
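The data/ fixtures below drive run_demo.py: input_round1.jsonl holds the benchmark questions, output_round1_modelA.jsonl and output_round1_modelB.jsonl are the two models' answers for BUILD mode, and output_execute_model.jsonl is a single model's run that EXECUTE mode scores against standard_answers.xlsx. A quick way to sanity-check the fixtures locally (a sketch, not part of the patch; run it from the benchmark/ directory):

```python
import json

def load_jsonl(path):
    # Skip blank lines, mirroring FileParseService.parse_input_sets.
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

inputs = load_jsonl("data/input_round1.jsonl")
outputs = load_jsonl("data/output_execute_model.jsonl")
assert len({r["serialNo"] for r in inputs}) == len(inputs)  # serialNo is the join key
print(len(inputs), "inputs /", len(outputs), "model outputs")  # 4 / 5 for these fixtures
```

Note that output_execute_model.jsonl deliberately carries a serialNo 5 with no matching input; _execute_llm_compare_result iterates over the inputs, so the orphan record is ignored.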
diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/input_round1.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/input_round1.jsonl new file mode 100644 index 000000000..b36082f78 --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/input_round1.jsonl @@ -0,0 +1,4 @@ +{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","selfDefineTags":"KAGGLE_DS_1,CTE1","prompt":"...","knowledge":""} +{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","selfDefineTags":"KAGGLE_DS_1,CTE1","prompt":"...","knowledge":""} +{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","selfDefineTags":"TEST","prompt":"...","knowledge":""} +{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","selfDefineTags":"TEST_JSON","prompt":"...","knowledge":""} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.jsonl new file mode 100644 index 000000000..a0e3126e4 --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.jsonl @@ -0,0 +1,5 @@ +{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","llmOutput":"with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;","executeResult":{"性别":["Female","Male"],"平均年龄":["27.73","27.84"]},"errorMsg":null} +{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","llmOutput":"with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;","executeResult":{"objective":["Capital Appreciation","Growth","Income"],"政府债券总量":["117","54","15"]},"errorMsg":null} +{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","llmOutput":"select 1","executeResult":{"colA":["x","y"]},"errorMsg":null} +{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","llmOutput":"{\"check\":\"ok\"}","executeResult":null,"errorMsg":null} +{"serialNo":5,"analysisModelId":"D2025050900161503000025249569","question":"缺少匹配标准的case","llmOutput":"select * from t","executeResult":null,"errorMsg":"execution error"} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.compare.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.compare.jsonl new file mode 100644 index 000000000..8041ece4e --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.compare.jsonl @@ -0,0 +1,4 @@ +{"serialNo": 1, "analysisModelId": "D2025050900161503000025249569", "question": "各性别的平均年龄是多少,并按年龄顺序显示结果?", "selfDefineTags": "KAGGLE_DS_1,CTE1", "prompt": "...", "standardAnswerSql": "with converted_data as (\n select \n gender,\n cast(age as int) as age\n from \n ant_icube_dev.di_finance_data\n where \n age rlike '^[0-9]+$'\n)\nselect\n gender as `性别`,\n avg(age) as `平均年龄`\nfrom \n converted_data\ngroup by \n gender\norder by \n 
`平均年龄`;", "llmOutput": "with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;", "executeResult": {"性别": ["Female", "Male"], "平均年龄": ["27.73", "27.84"]}, "errorMsg": null, "compareResult": "RIGHT", "isExecute": true, "llmCount": 2} +{"serialNo": 2, "analysisModelId": "D2025050900161503000025249569", "question": "不同投资目标下政府债券的总量是多少,并按目标名称排序?", "selfDefineTags": "KAGGLE_DS_1,CTE1", "prompt": "...", "standardAnswerSql": "with gov_bonds_data as (\n select\n objective,\n cast(government_bonds as bigint) as gov_bond_value\n from\n ant_icube_dev.di_finance_data\n where\n government_bonds is not null\n and government_bonds rlike '^[0-9]+$'\n)\nselect\n objective as `objective`,\n sum(gov_bond_value) as `政府债券总量`\nfrom\n gov_bonds_data\ngroup by\n `objective`\norder by\n `objective`;", "llmOutput": "with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;", "executeResult": {"objective": ["Capital Appreciation", "Growth", "Income"], "政府债券总量": ["117", "54", "15"]}, "errorMsg": null, "compareResult": "RIGHT", "isExecute": true, "llmCount": 2} +{"serialNo": 3, "analysisModelId": "D2025050900161503000025249569", "question": "用于触发双模型结果数不相等的case", "selfDefineTags": "TEST", "prompt": "...", "standardAnswerSql": null, "llmOutput": "select 1", "executeResult": {"colA": ["x", "y"]}, "errorMsg": null, "compareResult": "FAILED", "isExecute": true, "llmCount": 2} +{"serialNo": 4, "analysisModelId": "D2025050900161503000025249569", "question": "用于JSON对比策略的case", "selfDefineTags": "TEST_JSON", "prompt": "...", "standardAnswerSql": null, "llmOutput": "{\"check\":\"ok\"}", "executeResult": null, "errorMsg": null, "compareResult": "FAILED", "isExecute": true, "llmCount": 2} diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.summary.json b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.summary.json new file mode 100644 index 000000000..05ac3319b --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.summary.json @@ -0,0 +1,6 @@ +{ + "right": 2, + "wrong": 0, + "failed": 2, + "exception": 0 +} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelA.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelA.jsonl new file mode 100644 index 000000000..9a69b4cbc --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelA.jsonl @@ -0,0 +1,4 @@ +{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","llmOutput":"with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;","executeResult":{"性别":["Female","Male"],"平均年龄":["27.73","27.84"]},"errorMsg":null} +{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","llmOutput":"with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;","executeResult":{"objective":["Capital Appreciation","Growth","Income"],"政府债券总量":["117","54","15"]},"errorMsg":null} 
+{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","llmOutput":"select 1","executeResult":{"colA":["x","y"]},"errorMsg":null} +{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","llmOutput":"{\"check\":\"ok\"}","executeResult":null,"errorMsg":null} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.jsonl new file mode 100644 index 000000000..5589104ec --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.jsonl @@ -0,0 +1,4 @@ +{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","llmOutput":"with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;","executeResult":{"性别":["Female","Male"],"平均年龄":["27.73","27.84"]},"errorMsg":null} +{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","llmOutput":"with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;","executeResult":{"objective":["Capital Appreciation","Growth","Income"],"政府债券总量":["117","54","15"]},"errorMsg":null} +{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","llmOutput":"select 1","executeResult":{"colB":["x","z","w"]},"errorMsg":null} +{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","llmOutput":"{\"check\":\"ok\"}","executeResult":null,"errorMsg":null} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.compare.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.compare.jsonl new file mode 100644 index 000000000..5482d825c --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.compare.jsonl @@ -0,0 +1,4 @@ +{"serialNo": 1, "analysisModelId": "D2025050900161503000025249569", "question": "各性别的平均年龄是多少,并按年龄顺序显示结果?", "selfDefineTags": "KAGGLE_DS_1,CTE1", "prompt": "...", "standardAnswerSql": "with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;", "llmOutput": "with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;", "executeResult": {"性别": ["Female", "Male"], "平均年龄": ["27.73", "27.84"]}, "errorMsg": null, "compareResult": "EXCEPTION", "isExecute": false, "llmCount": 2} +{"serialNo": 2, "analysisModelId": "D2025050900161503000025249569", "question": "不同投资目标下政府债券的总量是多少,并按目标名称排序?", "selfDefineTags": "KAGGLE_DS_1,CTE1", "prompt": "...", "standardAnswerSql": "with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;", "llmOutput": "with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;", "executeResult": {"objective": ["Capital Appreciation", "Growth", "Income"], "政府债券总量": ["117", "54", "15"]}, "errorMsg": null, "compareResult": 
"EXCEPTION", "isExecute": false, "llmCount": 2} +{"serialNo": 3, "analysisModelId": "D2025050900161503000025249569", "question": "用于触发双模型结果数不相等的case", "selfDefineTags": "TEST", "prompt": "...", "standardAnswerSql": "select 1", "llmOutput": "select 1", "executeResult": {"colB": ["x", "z", "w"]}, "errorMsg": null, "compareResult": "EXCEPTION", "isExecute": false, "llmCount": 2} +{"serialNo": 4, "analysisModelId": "D2025050900161503000025249569", "question": "用于JSON对比策略的case", "selfDefineTags": "TEST_JSON", "prompt": "...", "standardAnswerSql": "{\"check\":\"ok\"}", "llmOutput": "{\"check\":\"ok\"}", "executeResult": null, "errorMsg": null, "compareResult": "RIGHT", "isExecute": false, "llmCount": 2} diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.summary.json b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.summary.json new file mode 100644 index 000000000..03a2ef66d --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.summary.json @@ -0,0 +1,6 @@ +{ + "right": 1, + "wrong": 0, + "failed": 0, + "exception": 3 +} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/standard_answers.xlsx b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/standard_answers.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..52411e9e9c503be6a09405eaad073cfd31f448d7 GIT binary patch literal 11844 zcmeHtg;!i#@^$0x?(Q`11h?Q82ogLH+}$Aw?(Xgo+$FfX6Wq1YK;z-Z8~NUw%zS^r z%;~l6S-oyot=s3+u6=IRR*{E-#s`k5YSln%_$@8Eg>2m;(ufPA__%B|8iiA<6UN&^8+qX{=yFj_sT2XlJ)4)MY zMpfaizW9MM6WuHeiM|AOA982CB%*j~cLT3egOz>tA9HRSMsp)L)5B3{=1 zK@d$R9zmR=j`n#DHjyYhfq_A!Svnxap{ZAmOL9w0u7c1U2UpmCCv$mF6Jukb1-VS8 zgnwwIds9hs9fNPAbQXPQ8f(tdg27Aw=MGR7I-JsL71ve2P=!KEoxxbG#ql9o{<{-> zGv~aa3FJCkP)rBv+QbHI+jm}4MJ~eH(y;3?oq-rDrcUn<-kzQGQQ_Ar53M{n<_19B zEYSdpz?mh0idA?$?l%tGNuNFuoP34Riyrc|N181x;=T={kF8nJF&~xiDQ%cH3kS0# zmHBxY1AU8!hMW1iluu2Fy={x8;dY^=?q0CL7)Px?yTs9lyC|rAf=@Sxjs(3xQ+v9f z&n}aW{5XEW0st>BPym&Gfn|dR8`bq|tSP*r4(Sz^29BoIPOL0{T>l5h|6&aO<*!%7 zDJu7}AqSnmeF`4BonMPV7ngSvd)r3w&evad6|EsMpPFc`la3hu9pPsv8NUwSr;(Ml zH<8Cf6yR;Hif~M9L8?Z#Z$Zfq4lZzvw2n#A4i#H{7_Rd-^Wan&IS+c*j%dd6=8|0b z;SDP3*-Ob9^huWYga}xL#6h@1sR23zirVYOca;$H;_9d0f~uQ%bB^OD)BP5cijGi3 zLiiOxGpRU3PR5^pRC*6tQ{3DWYN(s@TT~loISNpE7<{zu`YD;-iS_IaRLU4qqvXQ8 zV-B@%8h7t^71f)wh0>|4Y_Qb20m&~VwvD+)R)|#1hRs=H?=jXnYV}Yj2l+{;WgC| zu25}jEPj|O$6VFNkkLiBYq_AqC#(}e zNXjDCzzN+1PVPd8=A#_Z{lbcX_rMOEpT!3Sf28WRMNI4^VcHUoPi*UTJnx48Wz7XS zb6meIagi0e4{A<*m^RW(02x#1Me>Zx>y#=awrb93D$R)3a!tXp?aG7A#hKHl^XQEd z4JUg91eS;I70ek%5AzZ@(H+syf+_{MNK6rW;N^!U6={fm>I9-T`TA7u3xFIZ#I}|b zeA?)tKlecT1XNAU^cBB9B}i;(L^sXJe-Gq~UP!14k655LKK*b7>*M{nVj%M3etQxu zvhL@1yT(c;@^o=Nf;RYlB1ngB1#_87j(LRKU)H@*t%_f6EFVxVCN9p z(MidUTsDuc1isp#`b_bybv|}-Lq-)(bgv%76z+MWdQSv3% z2zztspdI&pXf)$D;GQT9Ptnpg)c~DEE0^~$2ggY?_Acf6Qg~Tkt0vF~8IEcf?c~i< z{f{*k2|S_GB}}_DuL=_N&;ptKNoySVx+GS*utwRf?aak}nP>4=V)-3l$>YsWep)ENLvH(&^=(jyBvho zpTtQ%?Y`Y7r$AIY@(fCCiVJRCix`1a zM_o_0R^GWYR_?tVo&9l?)kqir<7CubdBPt zD-OmFXI4s(#t9nya6Q8cEd{6V^#&1RtDe?^JS&u4kjMG>pzAF+>NXVCO*d0X7$&qQ zmi4(HjNSOpaVU7>({H4KGB8+}pyQMBQzXz0`y zF3Slv7Y<9ODUy*_1{kg1^?&%H4*C{Y-_%5&#$dMh(ju_}hRqK2Ae;V4fFSz@#RD*X zI7h*B*DVExziNLnQ}mdy>dt>a-XkuTCoSU^@(yw)4<3>s^J9o}+RICA(>UL97JnRJ z=v=vC9=$#{+Z`U9|IZ<#D|pTO<&^+v5di@F*I4)m0XUhPnmRkN{%vCag9$R?743G& z(Sy#JZbZn>PzPKyB5{htW+Yd1=Uv?<@uN+!g+Gv!&6mCS)Z`Z=I)*X^a3No`bB?jK 
diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data_compare_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data_compare_service.py
new file mode 100644
index 000000000..24553b008
--- /dev/null
+++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data_compare_service.py
@@ -0,0 +1,144 @@
+from typing import Dict, List, Optional
+from models import DataCompareResult, DataCompareResultEnum, DataCompareStrategyConfig, AnswerExecuteModel
+from copy import deepcopy
+import hashlib
+import json
+from decimal import Decimal, ROUND_HALF_UP
+
+def md5_list(values: List[str]) -> str:
+    s = ",".join([v if v is not None else "" for v in values])
+    return hashlib.md5(s.encode("utf-8")).hexdigest()
+
+ "0"*scale), rounding=ROUND_HALF_UP))) + except: + new_col.append(vs) + out[k] = new_col + return out + +def sort_columns_by_key(table: Dict[str, List[str]], sort_key: str) -> Dict[str, List[str]]: + if sort_key not in table: + raise ValueError(f"base col not exist: {sort_key}") + base = table[sort_key] + row_count = len(base) + for k, col in table.items(): + if len(col) != row_count: + raise ValueError(f"col length diff: {k}") + indices = list(range(row_count)) + indices.sort(key=lambda i: "" if base[i] is None else str(base[i])) + sorted_table = {} + for k in table.keys(): + sorted_table[k] = [table[k][i] for i in indices] + return sorted_table + +class DataCompareService: + def compare(self, standard_model: AnswerExecuteModel, target_result: Optional[Dict[str, List[str]]]) -> DataCompareResult: + if target_result is None: + return DataCompareResult.failed("targetResult is null") + cfg: DataCompareStrategyConfig = standard_model.strategyConfig or DataCompareStrategyConfig(strategy="EXACT_MATCH", order_by=True, standard_result=None) + if not cfg.standard_result: + return DataCompareResult.failed("leftResult is null") + + for std in cfg.standard_result: + if not isinstance(std, dict): + continue + std_fmt = accurate_decimal(deepcopy(std), 2) + tgt_fmt = accurate_decimal(deepcopy(target_result), 2) + if cfg.order_by: + res = self._compare_ordered(std_fmt, cfg, tgt_fmt) + else: + res = self._compare_unordered(std_fmt, cfg, tgt_fmt) + if res.compare_result == DataCompareResultEnum.RIGHT: + return res + return DataCompareResult.wrong("compareResult wrong!") + + def _compare_ordered(self, std: Dict[str, List[str]], cfg: DataCompareStrategyConfig, tgt: Dict[str, List[str]]) -> DataCompareResult: + try: + std_md5 = set() + for col_vals in std.values(): + lst = ["" if v is None else str(v) for v in col_vals] + std_md5.add(md5_list(lst)) + + tgt_md5 = set() + for col_vals in tgt.values(): + lst = ["" if v is None else str(v) for v in col_vals] + tgt_md5.add(md5_list(lst)) + + tgt_size = len(tgt_md5) + inter = tgt_md5.intersection(std_md5) + + if tgt_size == len(inter) and tgt_size == len(std_md5): + return DataCompareResult.right("compareResult success!") + + if len(std_md5) == len(inter): + if cfg.strategy == "EXACT_MATCH": + return DataCompareResult.failed("compareResult failed!") + elif cfg.strategy == "CONTAIN_MATCH": + return DataCompareResult.right("compareResult success!") + return DataCompareResult.wrong("compareResult wrong!") + except Exception as e: + return DataCompareResult.exception(f"compareResult Exception! 
{e}") + + def _compare_unordered(self, std: Dict[str, List[str]], cfg: DataCompareStrategyConfig, tgt: Dict[str, List[str]]) -> DataCompareResult: + try: + tgt_md5 = [] + tgt_cols = [] + for k, col_vals in tgt.items(): + lst = ["" if v is None else str(v) for v in col_vals] + lst.sort() + tgt_md5.append(md5_list(lst)) + tgt_cols.append(k) + + for std_key, std_vals in std.items(): + std_list = ["" if v is None else str(v) for v in std_vals] + std_list.sort() + std_md5 = md5_list(std_list) + if std_md5 not in tgt_md5: + return DataCompareResult.wrong("compareResult wrong!") + + idx = tgt_md5.index(std_md5) + tgt_key = tgt_cols[idx] + + std_sorted = sort_columns_by_key(std, std_key) + tgt_sorted = sort_columns_by_key(tgt, tgt_key) + + ordered_cfg = DataCompareStrategyConfig( + strategy=cfg.strategy, + order_by=True, + standard_result=cfg.standard_result + ) + res = self._compare_ordered(std_sorted, ordered_cfg, tgt_sorted) + if res.compare_result == DataCompareResultEnum.RIGHT: + return res + return DataCompareResult.wrong("compareResult wrong!") + except Exception as e: + return DataCompareResult.exception(f"compareResult Exception! {e}") + + def compare_json_by_config(self, standard_answer: str, answer: str, compare_config: Dict[str, str]) -> DataCompareResult: + try: + if not standard_answer or not answer: + return DataCompareResult.failed("standardAnswer or answer is null") + ans = json.loads(answer) + for k, strat in compare_config.items(): + if k not in ans: + return DataCompareResult.wrong("key missing") + if strat in ("FULL_TEXT", "ARRAY"): + if str(ans[k]) != "ok": + return DataCompareResult.wrong("value mismatch") + elif strat == "DAL": + return DataCompareResult.failed("DAL compare not supported in mock") + else: + return DataCompareResult.failed(f"unknown strategy {strat}") + return DataCompareResult.right("json compare success") + except Exception as e: + return DataCompareResult.exception(f"compareJsonByConfig Exception! 
{e}") \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py new file mode 100644 index 000000000..c0dd0ad9b --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py @@ -0,0 +1,118 @@ +import json +from typing import List +from models import BaseInputModel, AnswerExecuteModel, RoundAnswerConfirmModel, DataCompareResultEnum, DataCompareStrategyConfig +import pandas as pd +import os + +class FileParseService: + def parse_input_sets(self, path: str) -> List[BaseInputModel]: + data = [] + with open(path, "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): continue + obj = json.loads(line) + data.append(BaseInputModel( + serialNo=obj["serialNo"], + analysisModelId=obj["analysisModelId"], + question=obj["question"], + selfDefineTags=obj.get("selfDefineTags"), + prompt=obj.get("prompt"), + knowledge=obj.get("knowledge"), + )) + return data + + def parse_llm_outputs(self, path: str) -> List[AnswerExecuteModel]: + data = [] + with open(path, "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): continue + obj = json.loads(line) + data.append(AnswerExecuteModel.from_dict(obj)) + return data + + def write_data_compare_result(self, path: str, round_id: int, confirm_models: List[RoundAnswerConfirmModel], is_execute: bool, llm_count: int): + if not path.endswith(".jsonl"): + raise ValueError(f"output_file_path must end with .jsonl, got {path}") + out_path = path.replace(".jsonl", f".round{round_id}.compare.jsonl") + with open(out_path, "w", encoding="utf-8") as f: + for cm in confirm_models: + row = dict( + serialNo=cm.serialNo, + analysisModelId=cm.analysisModelId, + question=cm.question, + selfDefineTags=cm.selfDefineTags, + prompt=cm.prompt, + standardAnswerSql=cm.standardAnswerSql, + llmOutput=cm.llmOutput, + executeResult=cm.executeResult, + errorMsg=cm.errorMsg, + compareResult=cm.compareResult.value if cm.compareResult else None, + isExecute=is_execute, + llmCount=llm_count + ) + f.write(json.dumps(row, ensure_ascii=False) + "\n") + print(f"[write_data_compare_result] compare written to: {out_path}") + + def summary_and_write_multi_round_benchmark_result(self, output_path: str, round_id: int) -> str: + if not output_path.endswith(".jsonl"): + raise ValueError(f"output_file_path must end with .jsonl, got {output_path}") + compare_path = output_path.replace(".jsonl", f".round{round_id}.compare.jsonl") + right, wrong, failed, exception = 0, 0, 0, 0 + if os.path.exists(compare_path): + with open(compare_path, "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): continue + obj = json.loads(line) + cr = obj.get("compareResult") + if cr == DataCompareResultEnum.RIGHT.value: right += 1 + elif cr == DataCompareResultEnum.WRONG.value: wrong += 1 + elif cr == DataCompareResultEnum.FAILED.value: failed += 1 + elif cr == DataCompareResultEnum.EXCEPTION.value: exception += 1 + else: + print(f"[summary] compare file not found: {compare_path}") + summary_path = output_path.replace(".jsonl", f".round{round_id}.summary.json") + result = dict(right=right, wrong=wrong, failed=failed, exception=exception) + with open(summary_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + print(f"[summary] summary written to: {summary_path} -> {result}") + return json.dumps(result, ensure_ascii=False) + + def 
+    def parse_standard_benchmark_sets(self, standard_excel_path: str) -> List[AnswerExecuteModel]:
+        df = pd.read_excel(standard_excel_path, sheet_name="Sheet1")
+        outputs: List[AnswerExecuteModel] = []
+        for _, row in df.iterrows():
+            try:
+                serial_no = int(row["编号"])
+            except Exception:
+                continue
+            question = row.get("用户问题")
+            analysis_model_id = row.get("数据集ID")
+            llm_output = None if pd.isna(row.get("标准答案SQL")) else str(row.get("标准答案SQL"))
+            order_by = True
+            if not pd.isna(row.get("是否排序")):
+                try:
+                    order_by = bool(int(row.get("是否排序")))
+                except Exception:
+                    order_by = True
+
+            std_result = None
+            if not pd.isna(row.get("标准结果")):
+                try:
+                    std_result = json.loads(row.get("标准结果"))
+                except Exception:
+                    std_result = None
+
+            strategy_config = DataCompareStrategyConfig(
+                strategy="CONTAIN_MATCH",
+                order_by=order_by,
+                standard_result=[std_result] if std_result is not None else None  # wrap the single standard table in a list
+            )
+            outputs.append(AnswerExecuteModel(
+                serialNo=serial_no,
+                analysisModelId=analysis_model_id,
+                question=question,
+                llmOutput=llm_output,
+                executeResult=std_result,
+                strategyConfig=strategy_config
+            ))
+        return outputs
\ No newline at end of file
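FileParseService derives every per-round artifact name from output_file_path by suffix substitution, which is where the committed *.round1.compare.jsonl and *.round1.summary.json fixture names come from:

```python
out = "data/output_round1_modelB.jsonl"
round_id = 1
print(out.replace(".jsonl", f".round{round_id}.compare.jsonl"))
# data/output_round1_modelB.round1.compare.jsonl
print(out.replace(".jsonl", f".round{round_id}.summary.json"))
# data/output_round1_modelB.round1.summary.json
```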
diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py
new file mode 100644
index 000000000..b606ca5d5
--- /dev/null
+++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py
@@ -0,0 +1,116 @@
+# dbgpt_serve/evaluate/service/benchmark/models.py
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+class BenchmarkModeTypeEnum(str, Enum):
+    BUILD = "BUILD"
+    EXECUTE = "EXECUTE"
+
+@dataclass
+class DataCompareStrategyConfig:
+    strategy: str  # "EXACT_MATCH" | "CONTAIN_MATCH"
+    order_by: bool = True
+    standard_result: Optional[List[Dict[str, List[str]]]] = None  # a list of candidate result tables (list[dict])
+
+class DataCompareResultEnum(str, Enum):
+    RIGHT = "RIGHT"
+    WRONG = "WRONG"
+    FAILED = "FAILED"
+    EXCEPTION = "EXCEPTION"
+
+@dataclass
+class DataCompareResult:
+    compare_result: DataCompareResultEnum
+    msg: str = ""
+
+    @staticmethod
+    def right(msg=""): return DataCompareResult(DataCompareResultEnum.RIGHT, msg)
+    @staticmethod
+    def wrong(msg=""): return DataCompareResult(DataCompareResultEnum.WRONG, msg)
+    @staticmethod
+    def failed(msg=""): return DataCompareResult(DataCompareResultEnum.FAILED, msg)
+    @staticmethod
+    def exception(msg=""): return DataCompareResult(DataCompareResultEnum.EXCEPTION, msg)
+
+@dataclass
+class BaseInputModel:
+    serialNo: int
+    analysisModelId: str
+    question: str
+    selfDefineTags: Optional[str] = None
+    prompt: Optional[str] = None
+    knowledge: Optional[str] = None
+
+@dataclass
+class AnswerExecuteModel:
+    serialNo: int
+    analysisModelId: str
+    question: str
+    llmOutput: Optional[str]
+    executeResult: Optional[Dict[str, List[str]]]
+    errorMsg: Optional[str] = None
+    strategyConfig: Optional[DataCompareStrategyConfig] = None
+    cotTokens: Optional[Any] = None
+
+    @staticmethod
+    def from_dict(d: Dict[str, Any]) -> "AnswerExecuteModel":
+        cfg = d.get("strategyConfig")
+        strategy_config = None
+        if cfg:
+            std_list = cfg.get("standard_result")
+            strategy_config = DataCompareStrategyConfig(
+                strategy=cfg.get("strategy"),
+                order_by=cfg.get("order_by", True),
+                standard_result=std_list if isinstance(std_list, list) else None
+            )
+        return AnswerExecuteModel(
+            serialNo=d["serialNo"],
+            analysisModelId=d["analysisModelId"],
+            question=d["question"],
+            llmOutput=d.get("llmOutput"),
+            executeResult=d.get("executeResult"),
+            errorMsg=d.get("errorMsg"),
+            strategyConfig=strategy_config,
+            cotTokens=d.get("cotTokens"),
+        )
+
+    def to_dict(self) -> Dict[str, Any]:
+        cfg = None
+        if self.strategyConfig:
+            cfg = dict(
+                strategy=self.strategyConfig.strategy,
+                order_by=self.strategyConfig.order_by,
+                standard_result=self.strategyConfig.standard_result
+            )
+        return dict(
+            serialNo=self.serialNo,
+            analysisModelId=self.analysisModelId,
+            question=self.question,
+            llmOutput=self.llmOutput,
+            executeResult=self.executeResult,
+            errorMsg=self.errorMsg,
+            strategyConfig=cfg,
+            cotTokens=self.cotTokens
+        )
+
+@dataclass
+class RoundAnswerConfirmModel:
+    serialNo: int
+    analysisModelId: str
+    question: str
+    selfDefineTags: Optional[str]
+    prompt: Optional[str]
+    standardAnswerSql: Optional[str] = None
+    strategyConfig: Optional[DataCompareStrategyConfig] = None
+    llmOutput: Optional[str] = None
+    executeResult: Optional[Dict[str, List[str]]] = None
+    errorMsg: Optional[str] = None
+    compareResult: Optional[DataCompareResultEnum] = None
+
+@dataclass
+class BenchmarkExecuteConfig:
+    benchmarkModeType: BenchmarkModeTypeEnum
+    compareResultEnable: bool
+    standardFilePath: Optional[str] = None
+    compareConfig: Optional[Dict[str, str]] = None
\ No newline at end of file
diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/run_demo.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/run_demo.py
new file mode 100644
index 000000000..f598a69b2
--- /dev/null
+++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/run_demo.py
@@ -0,0 +1,65 @@
+from file_parse_service import FileParseService
+from data_compare_service import DataCompareService
+from user_input_execute_service import UserInputExecuteService
+from models import BenchmarkExecuteConfig, BenchmarkModeTypeEnum
+
+def run_build_mode():
+    fps = FileParseService()
+    dcs = DataCompareService()
+    svc = UserInputExecuteService(fps, dcs)
+
+    inputs = fps.parse_input_sets("data/input_round1.jsonl")
+    left = fps.parse_llm_outputs("data/output_round1_modelA.jsonl")
+    right = fps.parse_llm_outputs("data/output_round1_modelB.jsonl")
+
+    config = BenchmarkExecuteConfig(
+        benchmarkModeType=BenchmarkModeTypeEnum.BUILD,
+        compareResultEnable=True,
+        standardFilePath=None,
+        compareConfig={"check":"FULL_TEXT"}
+    )
+
+    svc.post_dispatch(
+        round_id=1,
+        config=config,
+        inputs=inputs,
+        left_outputs=left,
+        right_outputs=right,
+        input_file_path="data/input_round1.jsonl",
+        output_file_path="data/output_round1_modelB.jsonl"
+    )
+
+    fps.summary_and_write_multi_round_benchmark_result("data/output_round1_modelB.jsonl", 1)
+    print("BUILD compare path:", "data/output_round1_modelB.round1.compare.jsonl")
+
+def run_execute_mode():
+    fps = FileParseService()
+    dcs = DataCompareService()
+    svc = UserInputExecuteService(fps, dcs)
+
+    inputs = fps.parse_input_sets("data/input_round1.jsonl")
+    right = fps.parse_llm_outputs("data/output_execute_model.jsonl")
+
+    config = BenchmarkExecuteConfig(
+        benchmarkModeType=BenchmarkModeTypeEnum.EXECUTE,
+        compareResultEnable=True,
+        standardFilePath="data/standard_answers.xlsx",
+        compareConfig=None
+    )
+
+    svc.post_dispatch(
+        round_id=1,
+        config=config,
+        inputs=inputs,
+        left_outputs=[],
+        right_outputs=right,
+        input_file_path="data/input_round1.jsonl",
+        output_file_path="data/output_execute_model.jsonl"
+    )
+
+    fps.summary_and_write_multi_round_benchmark_result("data/output_execute_model.jsonl", 1)
+    print("EXECUTE compare path:", "data/output_execute_model.round1.compare.jsonl")
+
+if __name__ == "__main__":
+    run_build_mode()
+    run_execute_mode()
\ No newline at end of file
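After both demo runs, the regenerated summaries should match the committed fixtures; a quick check (paths from run_demo.py, expected counts from the committed files):

```python
import json

for path in (
    "data/output_round1_modelB.round1.summary.json",  # BUILD: modelA vs modelB
    "data/output_execute_model.round1.summary.json",  # EXECUTE: vs standard_answers.xlsx
):
    with open(path, encoding="utf-8") as f:
        print(path, json.load(f))
# BUILD   -> {"right": 1, "wrong": 0, "failed": 0, "exception": 3}
# EXECUTE -> {"right": 2, "wrong": 0, "failed": 2, "exception": 0}
```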
"data/output_execute_model.round1.compare.jsonl") + +if __name__ == "__main__": + run_build_mode() + run_execute_mode() \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py new file mode 100644 index 000000000..1bba0034d --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py @@ -0,0 +1,108 @@ +# app/services/user_input_execute_service.py +from typing import List +from models import ( + BaseInputModel, AnswerExecuteModel, RoundAnswerConfirmModel, + BenchmarkExecuteConfig, BenchmarkModeTypeEnum, DataCompareResultEnum, DataCompareStrategyConfig +) +from file_parse_service import FileParseService +from data_compare_service import DataCompareService + +class UserInputExecuteService: + def __init__(self, file_service: FileParseService, compare_service: DataCompareService): + self.file_service = file_service + self.compare_service = compare_service + + def post_dispatch( + self, + round_id: int, + config: BenchmarkExecuteConfig, + inputs: List[BaseInputModel], + left_outputs: List[AnswerExecuteModel], + right_outputs: List[AnswerExecuteModel], + input_file_path: str, + output_file_path: str + ): + try: + if config.benchmarkModeType == BenchmarkModeTypeEnum.BUILD and config.compareResultEnable: + if left_outputs and right_outputs: + self._execute_llm_compare_result(output_file_path, round_id, inputs, left_outputs, right_outputs, config) + elif config.benchmarkModeType == BenchmarkModeTypeEnum.EXECUTE and config.compareResultEnable: + if config.standardFilePath and right_outputs: + standard_sets = self.file_service.parse_standard_benchmark_sets(config.standardFilePath) + self._execute_llm_compare_result(output_file_path, 1, inputs, standard_sets, right_outputs, config) + except Exception as e: + print(f"[post_dispatch] compare error: {e}") + + def _execute_llm_compare_result( + self, + location: str, + round_id: int, + inputs: List[BaseInputModel], + left_answers: List[AnswerExecuteModel], + right_answers: List[AnswerExecuteModel], + config: BenchmarkExecuteConfig + ): + left_map = {a.serialNo: a for a in left_answers} + right_map = {a.serialNo: a for a in right_answers} + confirm_list: List[RoundAnswerConfirmModel] = [] + + for inp in inputs: + left = left_map.get(inp.serialNo) + right = right_map.get(inp.serialNo) + + if left is None and right is None: + continue + + strategy_cfg = None + standard_sql = None + if left is not None: + standard_sql = left.llmOutput + if config.benchmarkModeType == BenchmarkModeTypeEnum.EXECUTE: + strategy_cfg = left.strategyConfig + else: + standard_result_list = [] + if left.executeResult: + standard_result_list.append(left.executeResult) + strategy_cfg = DataCompareStrategyConfig( + strategy="EXACT_MATCH", + order_by=True, + standard_result=standard_result_list if standard_result_list else None + ) + + if right is not None: + if config.compareConfig and isinstance(config.compareConfig, dict): + res = self.compare_service.compare_json_by_config( + left.llmOutput if left else "", right.llmOutput or "", config.compareConfig + ) + compare_result = res.compare_result + else: + if strategy_cfg is None: + compare_result = DataCompareResultEnum.FAILED + else: + res = self.compare_service.compare( + left if left else AnswerExecuteModel( + serialNo=inp.serialNo, + analysisModelId=inp.analysisModelId, + question=inp.question, + 
diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py
index 05c109a30..a7c6bd867 100644
--- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py
+++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py
@@ -26,8 +26,8 @@ class BenchmarkDataConfig(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     cache_dir: str = "cache"
-    db_path: str = "benchmark_data.db"
-    table_mapping_file: Optional[str] = None
+    db_path: str = "pilot/benchmark_meta_data/benchmark_data.db"
+    table_mapping_file: str = "pilot/benchmark_meta_data/table_mapping.json"
     cache_expiry_days: int = 1
 
diff --git a/table_mapping.json b/pilot/benchmark_meta_data/table_mapping.json
similarity index 100%
rename from table_mapping.json
rename to pilot/benchmark_meta_data/table_mapping.json
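The relocated defaults assume the process starts from the repository root, where pilot/benchmark_meta_data/ lives; deployments that keep the metadata elsewhere can still override the pydantic defaults explicitly (a sketch, assuming only the import path shown in the diff):

```python
from dbgpt_serve.evaluate.service.fetchdata.benchmark_data_manager import (
    BenchmarkDataConfig,
)

cfg = BenchmarkDataConfig(
    db_path="/data/benchmark/benchmark_data.db",
    table_mapping_file="/data/benchmark/table_mapping.json",
)
```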