From 4de41d1a25035c09be91c30444a7125b0dfb6d4b Mon Sep 17 00:00:00 2001 From: yaoyifan-yyf Date: Fri, 26 Sep 2025 17:47:11 +0800 Subject: [PATCH] feat: benchmark post_dispatch service --- .../evaluate/service/benchmark/__init__.py | 0 .../service/benchmark/benchmark_service.py | 0 .../service/benchmark/data/input_round1.jsonl | 4 + .../benchmark/data/output_execute_model.jsonl | 5 + .../output_execute_model.round1.compare.jsonl | 4 + .../output_execute_model.round1.summary.json | 6 + .../benchmark/data/output_round1_modelA.jsonl | 4 + .../benchmark/data/output_round1_modelB.jsonl | 4 + .../output_round1_modelB.round1.compare.jsonl | 4 + .../output_round1_modelB.round1.summary.json | 6 + .../benchmark/data/standard_answers.xlsx | Bin 0 -> 11844 bytes .../service/benchmark/data_compare_service.py | 144 ++++++++++++++++++ .../service/benchmark/file_parse_service.py | 118 ++++++++++++++ .../evaluate/service/benchmark/models.py | 116 ++++++++++++++ .../evaluate/service/benchmark/run_demo.py | 65 ++++++++ .../benchmark/user_input_execute_service.py | 108 +++++++++++++ .../fetchdata/benchmark_data_manager.py | 4 +- .../benchmark_meta_data/table_mapping.json | 0 18 files changed, 590 insertions(+), 2 deletions(-) delete mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/__init__.py delete mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/input_round1.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.compare.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.summary.json create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelA.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.compare.jsonl create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.summary.json create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/standard_answers.xlsx create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data_compare_service.py create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/run_demo.py create mode 100644 packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py rename table_mapping.json => pilot/benchmark_meta_data/table_mapping.json (100%) diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/__init__.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py deleted file mode 100644 index e69de29bb..000000000 
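The data/ fixtures below drive run_demo.py: input_round1.jsonl holds the benchmark questions, output_round1_modelA.jsonl and output_round1_modelB.jsonl are the two models' answers for BUILD mode, and output_execute_model.jsonl is a single model's run that EXECUTE mode scores against standard_answers.xlsx. A quick way to sanity-check the fixtures locally (a sketch, not part of the patch; run it from the benchmark/ directory):

```python
import json

def load_jsonl(path):
    # Skip blank lines, mirroring FileParseService.parse_input_sets.
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

inputs = load_jsonl("data/input_round1.jsonl")
outputs = load_jsonl("data/output_execute_model.jsonl")
assert len({r["serialNo"] for r in inputs}) == len(inputs)  # serialNo is the join key
print(len(inputs), "inputs /", len(outputs), "model outputs")  # 4 / 5 for these fixtures
```

Note that output_execute_model.jsonl deliberately carries a serialNo 5 with no matching input; _execute_llm_compare_result iterates over the inputs, so the orphan record is ignored.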
diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/input_round1.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/input_round1.jsonl new file mode 100644 index 000000000..b36082f78 --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/input_round1.jsonl @@ -0,0 +1,4 @@ +{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","selfDefineTags":"KAGGLE_DS_1,CTE1","prompt":"...","knowledge":""} +{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","selfDefineTags":"KAGGLE_DS_1,CTE1","prompt":"...","knowledge":""} +{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","selfDefineTags":"TEST","prompt":"...","knowledge":""} +{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","selfDefineTags":"TEST_JSON","prompt":"...","knowledge":""} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.jsonl new file mode 100644 index 000000000..a0e3126e4 --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.jsonl @@ -0,0 +1,5 @@ +{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","llmOutput":"with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;","executeResult":{"性别":["Female","Male"],"平均年龄":["27.73","27.84"]},"errorMsg":null} +{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","llmOutput":"with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;","executeResult":{"objective":["Capital Appreciation","Growth","Income"],"政府债券总量":["117","54","15"]},"errorMsg":null} +{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","llmOutput":"select 1","executeResult":{"colA":["x","y"]},"errorMsg":null} +{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","llmOutput":"{\"check\":\"ok\"}","executeResult":null,"errorMsg":null} +{"serialNo":5,"analysisModelId":"D2025050900161503000025249569","question":"缺少匹配标准的case","llmOutput":"select * from t","executeResult":null,"errorMsg":"execution error"} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.compare.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.compare.jsonl new file mode 100644 index 000000000..8041ece4e --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.compare.jsonl @@ -0,0 +1,4 @@ +{"serialNo": 1, "analysisModelId": "D2025050900161503000025249569", "question": "各性别的平均年龄是多少,并按年龄顺序显示结果?", "selfDefineTags": "KAGGLE_DS_1,CTE1", "prompt": "...", "standardAnswerSql": "with converted_data as (\n select \n gender,\n cast(age as int) as age\n from \n ant_icube_dev.di_finance_data\n where \n age rlike '^[0-9]+$'\n)\nselect\n gender as `性别`,\n avg(age) as `平均年龄`\nfrom \n converted_data\ngroup by \n gender\norder by \n 
`平均年龄`;", "llmOutput": "with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;", "executeResult": {"性别": ["Female", "Male"], "平均年龄": ["27.73", "27.84"]}, "errorMsg": null, "compareResult": "RIGHT", "isExecute": true, "llmCount": 2} +{"serialNo": 2, "analysisModelId": "D2025050900161503000025249569", "question": "不同投资目标下政府债券的总量是多少,并按目标名称排序?", "selfDefineTags": "KAGGLE_DS_1,CTE1", "prompt": "...", "standardAnswerSql": "with gov_bonds_data as (\n select\n objective,\n cast(government_bonds as bigint) as gov_bond_value\n from\n ant_icube_dev.di_finance_data\n where\n government_bonds is not null\n and government_bonds rlike '^[0-9]+$'\n)\nselect\n objective as `objective`,\n sum(gov_bond_value) as `政府债券总量`\nfrom\n gov_bonds_data\ngroup by\n `objective`\norder by\n `objective`;", "llmOutput": "with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;", "executeResult": {"objective": ["Capital Appreciation", "Growth", "Income"], "政府债券总量": ["117", "54", "15"]}, "errorMsg": null, "compareResult": "RIGHT", "isExecute": true, "llmCount": 2} +{"serialNo": 3, "analysisModelId": "D2025050900161503000025249569", "question": "用于触发双模型结果数不相等的case", "selfDefineTags": "TEST", "prompt": "...", "standardAnswerSql": null, "llmOutput": "select 1", "executeResult": {"colA": ["x", "y"]}, "errorMsg": null, "compareResult": "FAILED", "isExecute": true, "llmCount": 2} +{"serialNo": 4, "analysisModelId": "D2025050900161503000025249569", "question": "用于JSON对比策略的case", "selfDefineTags": "TEST_JSON", "prompt": "...", "standardAnswerSql": null, "llmOutput": "{\"check\":\"ok\"}", "executeResult": null, "errorMsg": null, "compareResult": "FAILED", "isExecute": true, "llmCount": 2} diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.summary.json b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.summary.json new file mode 100644 index 000000000..05ac3319b --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_execute_model.round1.summary.json @@ -0,0 +1,6 @@ +{ + "right": 2, + "wrong": 0, + "failed": 2, + "exception": 0 +} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelA.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelA.jsonl new file mode 100644 index 000000000..9a69b4cbc --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelA.jsonl @@ -0,0 +1,4 @@ +{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","llmOutput":"with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;","executeResult":{"性别":["Female","Male"],"平均年龄":["27.73","27.84"]},"errorMsg":null} +{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","llmOutput":"with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;","executeResult":{"objective":["Capital Appreciation","Growth","Income"],"政府债券总量":["117","54","15"]},"errorMsg":null} 
+{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","llmOutput":"select 1","executeResult":{"colA":["x","y"]},"errorMsg":null} +{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","llmOutput":"{\"check\":\"ok\"}","executeResult":null,"errorMsg":null} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.jsonl new file mode 100644 index 000000000..5589104ec --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.jsonl @@ -0,0 +1,4 @@ +{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","llmOutput":"with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;","executeResult":{"性别":["Female","Male"],"平均年龄":["27.73","27.84"]},"errorMsg":null} +{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","llmOutput":"with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;","executeResult":{"objective":["Capital Appreciation","Growth","Income"],"政府债券总量":["117","54","15"]},"errorMsg":null} +{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","llmOutput":"select 1","executeResult":{"colB":["x","z","w"]},"errorMsg":null} +{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","llmOutput":"{\"check\":\"ok\"}","executeResult":null,"errorMsg":null} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.compare.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.compare.jsonl new file mode 100644 index 000000000..5482d825c --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.compare.jsonl @@ -0,0 +1,4 @@ +{"serialNo": 1, "analysisModelId": "D2025050900161503000025249569", "question": "各性别的平均年龄是多少,并按年龄顺序显示结果?", "selfDefineTags": "KAGGLE_DS_1,CTE1", "prompt": "...", "standardAnswerSql": "with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;", "llmOutput": "with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;", "executeResult": {"性别": ["Female", "Male"], "平均年龄": ["27.73", "27.84"]}, "errorMsg": null, "compareResult": "EXCEPTION", "isExecute": false, "llmCount": 2} +{"serialNo": 2, "analysisModelId": "D2025050900161503000025249569", "question": "不同投资目标下政府债券的总量是多少,并按目标名称排序?", "selfDefineTags": "KAGGLE_DS_1,CTE1", "prompt": "...", "standardAnswerSql": "with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;", "llmOutput": "with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;", "executeResult": {"objective": ["Capital Appreciation", "Growth", "Income"], "政府债券总量": ["117", "54", "15"]}, "errorMsg": null, "compareResult": 
"EXCEPTION", "isExecute": false, "llmCount": 2} +{"serialNo": 3, "analysisModelId": "D2025050900161503000025249569", "question": "用于触发双模型结果数不相等的case", "selfDefineTags": "TEST", "prompt": "...", "standardAnswerSql": "select 1", "llmOutput": "select 1", "executeResult": {"colB": ["x", "z", "w"]}, "errorMsg": null, "compareResult": "EXCEPTION", "isExecute": false, "llmCount": 2} +{"serialNo": 4, "analysisModelId": "D2025050900161503000025249569", "question": "用于JSON对比策略的case", "selfDefineTags": "TEST_JSON", "prompt": "...", "standardAnswerSql": "{\"check\":\"ok\"}", "llmOutput": "{\"check\":\"ok\"}", "executeResult": null, "errorMsg": null, "compareResult": "RIGHT", "isExecute": false, "llmCount": 2} diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.summary.json b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.summary.json new file mode 100644 index 000000000..03a2ef66d --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/output_round1_modelB.round1.summary.json @@ -0,0 +1,6 @@ +{ + "right": 1, + "wrong": 0, + "failed": 0, + "exception": 3 +} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/standard_answers.xlsx b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data/standard_answers.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..52411e9e9c503be6a09405eaad073cfd31f448d7 GIT binary patch literal 11844 zcmeHtg;!i#@^$0x?(Q`11h?Q82ogLH+}$Aw?(Xgo+$FfX6Wq1YK;z-Z8~NUw%zS^r z%;~l6S-oyot=s3+u6=IRR*{E-#s`k5YSln%_$@8Eg>2m;(ufPA__%B|8iiA<6UN&^8+qX{=yFj_sT2XlJ)4)MY zMpfaizW9MM6WuHeiM|AOA982CB%*j~cLT3egOz>tA9HRSMsp)L)5B3{=1 zK@d$R9zmR=j`n#DHjyYhfq_A!Svnxap{ZAmOL9w0u7c1U2UpmCCv$mF6Jukb1-VS8 zgnwwIds9hs9fNPAbQXPQ8f(tdg27Aw=MGR7I-JsL71ve2P=!KEoxxbG#ql9o{<{-> zGv~aa3FJCkP)rBv+QbHI+jm}4MJ~eH(y;3?oq-rDrcUn<-kzQGQQ_Ar53M{n<_19B zEYSdpz?mh0idA?$?l%tGNuNFuoP34Riyrc|N181x;=T={kF8nJF&~xiDQ%cH3kS0# zmHBxY1AU8!hMW1iluu2Fy={x8;dY^=?q0CL7)Px?yTs9lyC|rAf=@Sxjs(3xQ+v9f z&n}aW{5XEW0st>BPym&Gfn|dR8`bq|tSP*r4(Sz^29BoIPOL0{T>l5h|6&aO<*!%7 zDJu7}AqSnmeF`4BonMPV7ngSvd)r3w&evad6|EsMpPFc`la3hu9pPsv8NUwSr;(Ml zH<8Cf6yR;Hif~M9L8?Z#Z$Zfq4lZzvw2n#A4i#H{7_Rd-^Wan&IS+c*j%dd6=8|0b z;SDP3*-Ob9^huWYga}xL#6h@1sR23zirVYOca;$H;_9d0f~uQ%bB^OD)BP5cijGi3 zLiiOxGpRU3PR5^pRC*6tQ{3DWYN(s@TT~loISNpE7<{zu`YD;-iS_IaRLU4qqvXQ8 zV-B@%8h7t^71f)wh0>|4Y_Qb20m&~VwvD+)R)|#1hRs=H?=jXnYV}Yj2l+{;WgC| zu25}jEPj|O$6VFNkkLiBYq_AqC#(}e zNXjDCzzN+1PVPd8=A#_Z{lbcX_rMOEpT!3Sf28WRMNI4^VcHUoPi*UTJnx48Wz7XS zb6meIagi0e4{A<*m^RW(02x#1Me>Zx>y#=awrb93D$R)3a!tXp?aG7A#hKHl^XQEd z4JUg91eS;I70ek%5AzZ@(H+syf+_{MNK6rW;N^!U6={fm>I9-T`TA7u3xFIZ#I}|b zeA?)tKlecT1XNAU^cBB9B}i;(L^sXJe-Gq~UP!14k655LKK*b7>*M{nVj%M3etQxu zvhL@1yT(c;@^o=Nf;RYlB1ngB1#_87j(LRKU)H@*t%_f6EFVxVCN9p z(MidUTsDuc1isp#`b_bybv|}-Lq-)(bgv%76z+MWdQSv3% z2zztspdI&pXf)$D;GQT9Ptnpg)c~DEE0^~$2ggY?_Acf6Qg~Tkt0vF~8IEcf?c~i< z{f{*k2|S_GB}}_DuL=_N&;ptKNoySVx+GS*utwRf?aak}nP>4=V)-3l$>YsWep)ENLvH(&^=(jyBvho zpTtQ%?Y`Y7r$AIY@(fCCiVJRCix`1a zM_o_0R^GWYR_?tVo&9l?)kqir<7CubdBPt zD-OmFXI4s(#t9nya6Q8cEd{6V^#&1RtDe?^JS&u4kjMG>pzAF+>NXVCO*d0X7$&qQ zmi4(HjNSOpaVU7>({H4KGB8+}pyQMBQzXz0`y zF3Slv7Y<9ODUy*_1{kg1^?&%H4*C{Y-_%5&#$dMh(ju_}hRqK2Ae;V4fFSz@#RD*X zI7h*B*DVExziNLnQ}mdy>dt>a-XkuTCoSU^@(yw)4<3>s^J9o}+RICA(>UL97JnRJ z=v=vC9=$#{+Z`U9|IZ<#D|pTO<&^+v5di@F*I4)m0XUhPnmRkN{%vCag9$R?743G& z(Sy#JZbZn>PzPKyB5{htW+Yd1=Uv?<@uN+!g+Gv!&6mCS)Z`Z=I)*X^a3No`bB?jK 
diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data_compare_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data_compare_service.py
new file mode 100644
index 000000000..24553b008
--- /dev/null
+++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/data_compare_service.py
@@ -0,0 +1,144 @@
+from typing import Dict, List, Optional
+from models import DataCompareResult, DataCompareResultEnum, DataCompareStrategyConfig, AnswerExecuteModel
+from copy import deepcopy
+import hashlib
+import json
+from decimal import Decimal, ROUND_HALF_UP
+
+def md5_list(values: List[str]) -> str:
+    s = ",".join([v if v is not None else "" for v in values])
+    return hashlib.md5(s.encode("utf-8")).hexdigest()
+
+ "0"*scale), rounding=ROUND_HALF_UP))) + except: + new_col.append(vs) + out[k] = new_col + return out + +def sort_columns_by_key(table: Dict[str, List[str]], sort_key: str) -> Dict[str, List[str]]: + if sort_key not in table: + raise ValueError(f"base col not exist: {sort_key}") + base = table[sort_key] + row_count = len(base) + for k, col in table.items(): + if len(col) != row_count: + raise ValueError(f"col length diff: {k}") + indices = list(range(row_count)) + indices.sort(key=lambda i: "" if base[i] is None else str(base[i])) + sorted_table = {} + for k in table.keys(): + sorted_table[k] = [table[k][i] for i in indices] + return sorted_table + +class DataCompareService: + def compare(self, standard_model: AnswerExecuteModel, target_result: Optional[Dict[str, List[str]]]) -> DataCompareResult: + if target_result is None: + return DataCompareResult.failed("targetResult is null") + cfg: DataCompareStrategyConfig = standard_model.strategyConfig or DataCompareStrategyConfig(strategy="EXACT_MATCH", order_by=True, standard_result=None) + if not cfg.standard_result: + return DataCompareResult.failed("leftResult is null") + + for std in cfg.standard_result: + if not isinstance(std, dict): + continue + std_fmt = accurate_decimal(deepcopy(std), 2) + tgt_fmt = accurate_decimal(deepcopy(target_result), 2) + if cfg.order_by: + res = self._compare_ordered(std_fmt, cfg, tgt_fmt) + else: + res = self._compare_unordered(std_fmt, cfg, tgt_fmt) + if res.compare_result == DataCompareResultEnum.RIGHT: + return res + return DataCompareResult.wrong("compareResult wrong!") + + def _compare_ordered(self, std: Dict[str, List[str]], cfg: DataCompareStrategyConfig, tgt: Dict[str, List[str]]) -> DataCompareResult: + try: + std_md5 = set() + for col_vals in std.values(): + lst = ["" if v is None else str(v) for v in col_vals] + std_md5.add(md5_list(lst)) + + tgt_md5 = set() + for col_vals in tgt.values(): + lst = ["" if v is None else str(v) for v in col_vals] + tgt_md5.add(md5_list(lst)) + + tgt_size = len(tgt_md5) + inter = tgt_md5.intersection(std_md5) + + if tgt_size == len(inter) and tgt_size == len(std_md5): + return DataCompareResult.right("compareResult success!") + + if len(std_md5) == len(inter): + if cfg.strategy == "EXACT_MATCH": + return DataCompareResult.failed("compareResult failed!") + elif cfg.strategy == "CONTAIN_MATCH": + return DataCompareResult.right("compareResult success!") + return DataCompareResult.wrong("compareResult wrong!") + except Exception as e: + return DataCompareResult.exception(f"compareResult Exception! 
{e}") + + def _compare_unordered(self, std: Dict[str, List[str]], cfg: DataCompareStrategyConfig, tgt: Dict[str, List[str]]) -> DataCompareResult: + try: + tgt_md5 = [] + tgt_cols = [] + for k, col_vals in tgt.items(): + lst = ["" if v is None else str(v) for v in col_vals] + lst.sort() + tgt_md5.append(md5_list(lst)) + tgt_cols.append(k) + + for std_key, std_vals in std.items(): + std_list = ["" if v is None else str(v) for v in std_vals] + std_list.sort() + std_md5 = md5_list(std_list) + if std_md5 not in tgt_md5: + return DataCompareResult.wrong("compareResult wrong!") + + idx = tgt_md5.index(std_md5) + tgt_key = tgt_cols[idx] + + std_sorted = sort_columns_by_key(std, std_key) + tgt_sorted = sort_columns_by_key(tgt, tgt_key) + + ordered_cfg = DataCompareStrategyConfig( + strategy=cfg.strategy, + order_by=True, + standard_result=cfg.standard_result + ) + res = self._compare_ordered(std_sorted, ordered_cfg, tgt_sorted) + if res.compare_result == DataCompareResultEnum.RIGHT: + return res + return DataCompareResult.wrong("compareResult wrong!") + except Exception as e: + return DataCompareResult.exception(f"compareResult Exception! {e}") + + def compare_json_by_config(self, standard_answer: str, answer: str, compare_config: Dict[str, str]) -> DataCompareResult: + try: + if not standard_answer or not answer: + return DataCompareResult.failed("standardAnswer or answer is null") + ans = json.loads(answer) + for k, strat in compare_config.items(): + if k not in ans: + return DataCompareResult.wrong("key missing") + if strat in ("FULL_TEXT", "ARRAY"): + if str(ans[k]) != "ok": + return DataCompareResult.wrong("value mismatch") + elif strat == "DAL": + return DataCompareResult.failed("DAL compare not supported in mock") + else: + return DataCompareResult.failed(f"unknown strategy {strat}") + return DataCompareResult.right("json compare success") + except Exception as e: + return DataCompareResult.exception(f"compareJsonByConfig Exception! 
{e}") \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py new file mode 100644 index 000000000..c0dd0ad9b --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/file_parse_service.py @@ -0,0 +1,118 @@ +import json +from typing import List +from models import BaseInputModel, AnswerExecuteModel, RoundAnswerConfirmModel, DataCompareResultEnum, DataCompareStrategyConfig +import pandas as pd +import os + +class FileParseService: + def parse_input_sets(self, path: str) -> List[BaseInputModel]: + data = [] + with open(path, "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): continue + obj = json.loads(line) + data.append(BaseInputModel( + serialNo=obj["serialNo"], + analysisModelId=obj["analysisModelId"], + question=obj["question"], + selfDefineTags=obj.get("selfDefineTags"), + prompt=obj.get("prompt"), + knowledge=obj.get("knowledge"), + )) + return data + + def parse_llm_outputs(self, path: str) -> List[AnswerExecuteModel]: + data = [] + with open(path, "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): continue + obj = json.loads(line) + data.append(AnswerExecuteModel.from_dict(obj)) + return data + + def write_data_compare_result(self, path: str, round_id: int, confirm_models: List[RoundAnswerConfirmModel], is_execute: bool, llm_count: int): + if not path.endswith(".jsonl"): + raise ValueError(f"output_file_path must end with .jsonl, got {path}") + out_path = path.replace(".jsonl", f".round{round_id}.compare.jsonl") + with open(out_path, "w", encoding="utf-8") as f: + for cm in confirm_models: + row = dict( + serialNo=cm.serialNo, + analysisModelId=cm.analysisModelId, + question=cm.question, + selfDefineTags=cm.selfDefineTags, + prompt=cm.prompt, + standardAnswerSql=cm.standardAnswerSql, + llmOutput=cm.llmOutput, + executeResult=cm.executeResult, + errorMsg=cm.errorMsg, + compareResult=cm.compareResult.value if cm.compareResult else None, + isExecute=is_execute, + llmCount=llm_count + ) + f.write(json.dumps(row, ensure_ascii=False) + "\n") + print(f"[write_data_compare_result] compare written to: {out_path}") + + def summary_and_write_multi_round_benchmark_result(self, output_path: str, round_id: int) -> str: + if not output_path.endswith(".jsonl"): + raise ValueError(f"output_file_path must end with .jsonl, got {output_path}") + compare_path = output_path.replace(".jsonl", f".round{round_id}.compare.jsonl") + right, wrong, failed, exception = 0, 0, 0, 0 + if os.path.exists(compare_path): + with open(compare_path, "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): continue + obj = json.loads(line) + cr = obj.get("compareResult") + if cr == DataCompareResultEnum.RIGHT.value: right += 1 + elif cr == DataCompareResultEnum.WRONG.value: wrong += 1 + elif cr == DataCompareResultEnum.FAILED.value: failed += 1 + elif cr == DataCompareResultEnum.EXCEPTION.value: exception += 1 + else: + print(f"[summary] compare file not found: {compare_path}") + summary_path = output_path.replace(".jsonl", f".round{round_id}.summary.json") + result = dict(right=right, wrong=wrong, failed=failed, exception=exception) + with open(summary_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + print(f"[summary] summary written to: {summary_path} -> {result}") + return json.dumps(result, ensure_ascii=False) + + def 
+    def parse_standard_benchmark_sets(self, standard_excel_path: str) -> List[AnswerExecuteModel]:
+        df = pd.read_excel(standard_excel_path, sheet_name="Sheet1")
+        outputs: List[AnswerExecuteModel] = []
+        for _, row in df.iterrows():
+            try:
+                serial_no = int(row["编号"])
+            except Exception:
+                continue
+            question = row.get("用户问题")
+            analysis_model_id = row.get("数据集ID")
+            llm_output = None if pd.isna(row.get("标准答案SQL")) else str(row.get("标准答案SQL"))
+            order_by = True
+            if not pd.isna(row.get("是否排序")):
+                try:
+                    order_by = bool(int(row.get("是否排序")))
+                except Exception:
+                    order_by = True
+
+            std_result = None
+            if not pd.isna(row.get("标准结果")):
+                try:
+                    std_result = json.loads(row.get("标准结果"))
+                except Exception:
+                    std_result = None
+
+            strategy_config = DataCompareStrategyConfig(
+                strategy="CONTAIN_MATCH",
+                order_by=order_by,
+                standard_result=[std_result] if std_result is not None else None  # wrap the single standard table in a list
+            )
+            outputs.append(AnswerExecuteModel(
+                serialNo=serial_no,
+                analysisModelId=analysis_model_id,
+                question=question,
+                llmOutput=llm_output,
+                executeResult=std_result,
+                strategyConfig=strategy_config
+            ))
+        return outputs
\ No newline at end of file
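FileParseService derives every per-round artifact name from output_file_path by suffix substitution, which is where the committed *.round1.compare.jsonl and *.round1.summary.json fixture names come from:

```python
out = "data/output_round1_modelB.jsonl"
round_id = 1
print(out.replace(".jsonl", f".round{round_id}.compare.jsonl"))
# data/output_round1_modelB.round1.compare.jsonl
print(out.replace(".jsonl", f".round{round_id}.summary.json"))
# data/output_round1_modelB.round1.summary.json
```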
diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py
new file mode 100644
index 000000000..b606ca5d5
--- /dev/null
+++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/models.py
@@ -0,0 +1,116 @@
+# dbgpt_serve/evaluate/service/benchmark/models.py
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+class BenchmarkModeTypeEnum(str, Enum):
+    BUILD = "BUILD"
+    EXECUTE = "EXECUTE"
+
+@dataclass
+class DataCompareStrategyConfig:
+    strategy: str  # "EXACT_MATCH" | "CONTAIN_MATCH"
+    order_by: bool = True
+    standard_result: Optional[List[Dict[str, List[str]]]] = None  # a list of candidate result tables (list[dict])
+
+class DataCompareResultEnum(str, Enum):
+    RIGHT = "RIGHT"
+    WRONG = "WRONG"
+    FAILED = "FAILED"
+    EXCEPTION = "EXCEPTION"
+
+@dataclass
+class DataCompareResult:
+    compare_result: DataCompareResultEnum
+    msg: str = ""
+
+    @staticmethod
+    def right(msg=""): return DataCompareResult(DataCompareResultEnum.RIGHT, msg)
+    @staticmethod
+    def wrong(msg=""): return DataCompareResult(DataCompareResultEnum.WRONG, msg)
+    @staticmethod
+    def failed(msg=""): return DataCompareResult(DataCompareResultEnum.FAILED, msg)
+    @staticmethod
+    def exception(msg=""): return DataCompareResult(DataCompareResultEnum.EXCEPTION, msg)
+
+@dataclass
+class BaseInputModel:
+    serialNo: int
+    analysisModelId: str
+    question: str
+    selfDefineTags: Optional[str] = None
+    prompt: Optional[str] = None
+    knowledge: Optional[str] = None
+
+@dataclass
+class AnswerExecuteModel:
+    serialNo: int
+    analysisModelId: str
+    question: str
+    llmOutput: Optional[str]
+    executeResult: Optional[Dict[str, List[str]]]
+    errorMsg: Optional[str] = None
+    strategyConfig: Optional[DataCompareStrategyConfig] = None
+    cotTokens: Optional[Any] = None
+
+    @staticmethod
+    def from_dict(d: Dict[str, Any]) -> "AnswerExecuteModel":
+        cfg = d.get("strategyConfig")
+        strategy_config = None
+        if cfg:
+            std_list = cfg.get("standard_result")
+            strategy_config = DataCompareStrategyConfig(
+                strategy=cfg.get("strategy"),
+                order_by=cfg.get("order_by", True),
+                standard_result=std_list if isinstance(std_list, list) else None
+            )
+        return AnswerExecuteModel(
+            serialNo=d["serialNo"],
+            analysisModelId=d["analysisModelId"],
+            question=d["question"],
+            llmOutput=d.get("llmOutput"),
+            executeResult=d.get("executeResult"),
+            errorMsg=d.get("errorMsg"),
+            strategyConfig=strategy_config,
+            cotTokens=d.get("cotTokens"),
+        )
+
+    def to_dict(self) -> Dict[str, Any]:
+        cfg = None
+        if self.strategyConfig:
+            cfg = dict(
+                strategy=self.strategyConfig.strategy,
+                order_by=self.strategyConfig.order_by,
+                standard_result=self.strategyConfig.standard_result
+            )
+        return dict(
+            serialNo=self.serialNo,
+            analysisModelId=self.analysisModelId,
+            question=self.question,
+            llmOutput=self.llmOutput,
+            executeResult=self.executeResult,
+            errorMsg=self.errorMsg,
+            strategyConfig=cfg,
+            cotTokens=self.cotTokens
+        )
+
+@dataclass
+class RoundAnswerConfirmModel:
+    serialNo: int
+    analysisModelId: str
+    question: str
+    selfDefineTags: Optional[str]
+    prompt: Optional[str]
+    standardAnswerSql: Optional[str] = None
+    strategyConfig: Optional[DataCompareStrategyConfig] = None
+    llmOutput: Optional[str] = None
+    executeResult: Optional[Dict[str, List[str]]] = None
+    errorMsg: Optional[str] = None
+    compareResult: Optional[DataCompareResultEnum] = None
+
+@dataclass
+class BenchmarkExecuteConfig:
+    benchmarkModeType: BenchmarkModeTypeEnum
+    compareResultEnable: bool
+    standardFilePath: Optional[str] = None
+    compareConfig: Optional[Dict[str, str]] = None
\ No newline at end of file
diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/run_demo.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/run_demo.py
new file mode 100644
index 000000000..f598a69b2
--- /dev/null
+++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/run_demo.py
@@ -0,0 +1,65 @@
+from file_parse_service import FileParseService
+from data_compare_service import DataCompareService
+from user_input_execute_service import UserInputExecuteService
+from models import BenchmarkExecuteConfig, BenchmarkModeTypeEnum
+
+def run_build_mode():
+    fps = FileParseService()
+    dcs = DataCompareService()
+    svc = UserInputExecuteService(fps, dcs)
+
+    inputs = fps.parse_input_sets("data/input_round1.jsonl")
+    left = fps.parse_llm_outputs("data/output_round1_modelA.jsonl")
+    right = fps.parse_llm_outputs("data/output_round1_modelB.jsonl")
+
+    config = BenchmarkExecuteConfig(
+        benchmarkModeType=BenchmarkModeTypeEnum.BUILD,
+        compareResultEnable=True,
+        standardFilePath=None,
+        compareConfig={"check":"FULL_TEXT"}
+    )
+
+    svc.post_dispatch(
+        round_id=1,
+        config=config,
+        inputs=inputs,
+        left_outputs=left,
+        right_outputs=right,
+        input_file_path="data/input_round1.jsonl",
+        output_file_path="data/output_round1_modelB.jsonl"
+    )
+
+    fps.summary_and_write_multi_round_benchmark_result("data/output_round1_modelB.jsonl", 1)
+    print("BUILD compare path:", "data/output_round1_modelB.round1.compare.jsonl")
+
+def run_execute_mode():
+    fps = FileParseService()
+    dcs = DataCompareService()
+    svc = UserInputExecuteService(fps, dcs)
+
+    inputs = fps.parse_input_sets("data/input_round1.jsonl")
+    right = fps.parse_llm_outputs("data/output_execute_model.jsonl")
+
+    config = BenchmarkExecuteConfig(
+        benchmarkModeType=BenchmarkModeTypeEnum.EXECUTE,
+        compareResultEnable=True,
+        standardFilePath="data/standard_answers.xlsx",
+        compareConfig=None
+    )
+
+    svc.post_dispatch(
+        round_id=1,
+        config=config,
+        inputs=inputs,
+        left_outputs=[],
+        right_outputs=right,
+        input_file_path="data/input_round1.jsonl",
+        output_file_path="data/output_execute_model.jsonl"
+    )
+
+    fps.summary_and_write_multi_round_benchmark_result("data/output_execute_model.jsonl", 1)
+    print("EXECUTE compare path:", "data/output_execute_model.round1.compare.jsonl")
+
+if __name__ == "__main__":
+    run_build_mode()
+    run_execute_mode()
\ No newline at end of file
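After both demo runs, the regenerated summaries should match the committed fixtures; a quick check (paths from run_demo.py, expected counts from the committed files):

```python
import json

for path in (
    "data/output_round1_modelB.round1.summary.json",  # BUILD: modelA vs modelB
    "data/output_execute_model.round1.summary.json",  # EXECUTE: vs standard_answers.xlsx
):
    with open(path, encoding="utf-8") as f:
        print(path, json.load(f))
# BUILD   -> {"right": 1, "wrong": 0, "failed": 0, "exception": 3}
# EXECUTE -> {"right": 2, "wrong": 0, "failed": 2, "exception": 0}
```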
"data/output_execute_model.round1.compare.jsonl") + +if __name__ == "__main__": + run_build_mode() + run_execute_mode() \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py new file mode 100644 index 000000000..1bba0034d --- /dev/null +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py @@ -0,0 +1,108 @@ +# app/services/user_input_execute_service.py +from typing import List +from models import ( + BaseInputModel, AnswerExecuteModel, RoundAnswerConfirmModel, + BenchmarkExecuteConfig, BenchmarkModeTypeEnum, DataCompareResultEnum, DataCompareStrategyConfig +) +from file_parse_service import FileParseService +from data_compare_service import DataCompareService + +class UserInputExecuteService: + def __init__(self, file_service: FileParseService, compare_service: DataCompareService): + self.file_service = file_service + self.compare_service = compare_service + + def post_dispatch( + self, + round_id: int, + config: BenchmarkExecuteConfig, + inputs: List[BaseInputModel], + left_outputs: List[AnswerExecuteModel], + right_outputs: List[AnswerExecuteModel], + input_file_path: str, + output_file_path: str + ): + try: + if config.benchmarkModeType == BenchmarkModeTypeEnum.BUILD and config.compareResultEnable: + if left_outputs and right_outputs: + self._execute_llm_compare_result(output_file_path, round_id, inputs, left_outputs, right_outputs, config) + elif config.benchmarkModeType == BenchmarkModeTypeEnum.EXECUTE and config.compareResultEnable: + if config.standardFilePath and right_outputs: + standard_sets = self.file_service.parse_standard_benchmark_sets(config.standardFilePath) + self._execute_llm_compare_result(output_file_path, 1, inputs, standard_sets, right_outputs, config) + except Exception as e: + print(f"[post_dispatch] compare error: {e}") + + def _execute_llm_compare_result( + self, + location: str, + round_id: int, + inputs: List[BaseInputModel], + left_answers: List[AnswerExecuteModel], + right_answers: List[AnswerExecuteModel], + config: BenchmarkExecuteConfig + ): + left_map = {a.serialNo: a for a in left_answers} + right_map = {a.serialNo: a for a in right_answers} + confirm_list: List[RoundAnswerConfirmModel] = [] + + for inp in inputs: + left = left_map.get(inp.serialNo) + right = right_map.get(inp.serialNo) + + if left is None and right is None: + continue + + strategy_cfg = None + standard_sql = None + if left is not None: + standard_sql = left.llmOutput + if config.benchmarkModeType == BenchmarkModeTypeEnum.EXECUTE: + strategy_cfg = left.strategyConfig + else: + standard_result_list = [] + if left.executeResult: + standard_result_list.append(left.executeResult) + strategy_cfg = DataCompareStrategyConfig( + strategy="EXACT_MATCH", + order_by=True, + standard_result=standard_result_list if standard_result_list else None + ) + + if right is not None: + if config.compareConfig and isinstance(config.compareConfig, dict): + res = self.compare_service.compare_json_by_config( + left.llmOutput if left else "", right.llmOutput or "", config.compareConfig + ) + compare_result = res.compare_result + else: + if strategy_cfg is None: + compare_result = DataCompareResultEnum.FAILED + else: + res = self.compare_service.compare( + left if left else AnswerExecuteModel( + serialNo=inp.serialNo, + analysisModelId=inp.analysisModelId, + question=inp.question, + 
diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py
index 05c109a30..a7c6bd867 100644
--- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py
+++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py
@@ -26,8 +26,8 @@ class BenchmarkDataConfig(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     cache_dir: str = "cache"
-    db_path: str = "benchmark_data.db"
-    table_mapping_file: Optional[str] = None
+    db_path: str = "pilot/benchmark_meta_data/benchmark_data.db"
+    table_mapping_file: str = "pilot/benchmark_meta_data/table_mapping.json"
     cache_expiry_days: int = 1
 
diff --git a/table_mapping.json b/pilot/benchmark_meta_data/table_mapping.json
similarity index 100%
rename from table_mapping.json
rename to pilot/benchmark_meta_data/table_mapping.json
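The relocated defaults assume the process starts from the repository root, where pilot/benchmark_meta_data/ lives; deployments that keep the metadata elsewhere can still override the pydantic defaults explicitly (a sketch, assuming only the import path shown in the diff):

```python
from dbgpt_serve.evaluate.service.fetchdata.benchmark_data_manager import (
    BenchmarkDataConfig,
)

cfg = BenchmarkDataConfig(
    db_path="/data/benchmark/benchmark_data.db",
    table_mapping_file="/data/benchmark/table_mapping.json",
)
```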