diff --git a/packages/dbgpt-app/src/dbgpt_app/initialization/db_model_initialization.py b/packages/dbgpt-app/src/dbgpt_app/initialization/db_model_initialization.py index 578cbb0f0..7958dd359 100644 --- a/packages/dbgpt-app/src/dbgpt_app/initialization/db_model_initialization.py +++ b/packages/dbgpt-app/src/dbgpt_app/initialization/db_model_initialization.py @@ -12,10 +12,7 @@ from dbgpt_serve.agent.app.recommend_question.recommend_question import ( from dbgpt_serve.agent.hub.db.my_plugin_db import MyPluginEntity from dbgpt_serve.agent.hub.db.plugin_hub_db import PluginHubEntity from dbgpt_serve.datasource.manages.connect_config_db import ConnectConfigEntity -from dbgpt_serve.evaluate.db.benchmark_db import ( - BenchmarkCompareEntity, - BenchmarkSummaryEntity, -) +from dbgpt_serve.evaluate.db.benchmark_db import BenchmarkSummaryEntity from dbgpt_serve.file.models.models import ServeEntity as FileServeEntity from dbgpt_serve.flow.models.models import ServeEntity as FlowServeEntity from dbgpt_serve.flow.models.models import VariablesEntity as FlowVariableEntity @@ -40,6 +37,5 @@ _MODELS = [ FlowServeEntity, RecommendQuestionEntity, FlowVariableEntity, - BenchmarkCompareEntity, BenchmarkSummaryEntity, ] diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/__init__.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/__init__.py index 7f1db962d..1a01d04c4 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/__init__.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/__init__.py @@ -1,11 +1,9 @@ from .benchmark_db import ( - BenchmarkCompareEntity, BenchmarkResultDao, BenchmarkSummaryEntity, ) __all__ = [ - "BenchmarkCompareEntity", "BenchmarkSummaryEntity", "BenchmarkResultDao", ] diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py index 752807111..691374ad5 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py +++ 
b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/db/benchmark_db.py @@ -3,13 +3,11 @@ from datetime import datetime from typing import List, Optional from sqlalchemy import ( - Boolean, Column, DateTime, Index, Integer, String, - Text, UniqueConstraint, desc, func, @@ -20,64 +18,6 @@ from dbgpt.storage.metadata import BaseDao, Model logger = logging.getLogger(__name__) -class BenchmarkCompareEntity(Model): - """Single compare record for one input serialNo in one round. - - Fields match the JSON lines produced by FileParseService.write_data_compare_result. - """ - - __tablename__ = "benchmark_compare" - __table_args__ = ( - UniqueConstraint( - "round_id", "serial_no", "output_path", name="uk_round_serial_output" - ), - ) - - id = Column( - Integer, primary_key=True, autoincrement=True, comment="autoincrement id" - ) - # Round and mode - round_id = Column(Integer, nullable=False, comment="Benchmark round id") - mode = Column(String(16), nullable=False, comment="BUILD or EXECUTE") - - # Input & outputs - serial_no = Column(Integer, nullable=False, comment="Input serial number") - analysis_model_id = Column(String(255), nullable=False, comment="Analysis model id") - question = Column(Text, nullable=False, comment="User question") - self_define_tags = Column(String(255), nullable=True, comment="Self define tags") - prompt = Column(Text, nullable=True, comment="Prompt text") - - standard_answer_sql = Column(Text, nullable=True, comment="Standard answer SQL") - llm_output = Column(Text, nullable=True, comment="LLM output text or JSON") - execute_result = Column( - Text, nullable=True, comment="Execution result JSON (serialized)" - ) - error_msg = Column(Text, nullable=True, comment="Error message") - - compare_result = Column( - String(16), nullable=True, comment="RIGHT/WRONG/FAILED/EXCEPTION" - ) - is_execute = Column(Boolean, default=False, comment="Whether this is EXECUTE mode") - llm_count = Column(Integer, default=0, comment="Number of LLM outputs compared") - - # 
Source path for traceability (original output jsonl file path) - output_path = Column( - String(512), nullable=False, comment="Original output file path" - ) - - gmt_created = Column(DateTime, default=datetime.now, comment="Record creation time") - gmt_modified = Column( - DateTime, - default=datetime.now, - onupdate=datetime.now, - comment="Record update time", - ) - - Index("idx_bm_comp_round", "round_id") - Index("idx_bm_comp_mode", "mode") - Index("idx_bm_comp_serial", "serial_no") - - class BenchmarkSummaryEntity(Model): """Summary result for one round and one output path. @@ -171,18 +111,6 @@ class BenchmarkResultDao(BaseDao): session.add(summary) session.commit() - # Basic query helpers - def list_compare_by_round(self, round_id: int, limit: int = 100, offset: int = 0): - with self.session(commit=False) as session: - return ( - session.query(BenchmarkCompareEntity) - .filter(BenchmarkCompareEntity.round_id == round_id) - .order_by(desc(BenchmarkCompareEntity.id)) - .limit(limit) - .offset(offset) - .all() - ) - def get_summary( self, round_id: int, output_path: str ) -> Optional[BenchmarkSummaryEntity]: diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py index decb0b4d4..ae0757c7d 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/benchmark_service.py @@ -57,7 +57,7 @@ BENCHMARK_SERVICE_COMPONENT_NAME = "dbgpt_serve_evaluate_benchmark_service" STANDARD_BENCHMARK_FILE_PATH = os.path.join( BENCHMARK_DATA_ROOT_PATH, - "2025_07_27_public_500_standard_benchmark_question_list_v2.xlsx", + "2025_07_27_public_500_standard_benchmark_question_list.xlsx", ) BENCHMARK_OUTPUT_RESULT_PATH = os.path.join(BENCHMARK_DATA_ROOT_PATH, "result") diff --git 
a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/input_round1.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/input_round1.jsonl deleted file mode 100644 index b36082f78..000000000 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/input_round1.jsonl +++ /dev/null @@ -1,4 +0,0 @@ -{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","selfDefineTags":"KAGGLE_DS_1,CTE1","prompt":"...","knowledge":""} -{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","selfDefineTags":"KAGGLE_DS_1,CTE1","prompt":"...","knowledge":""} -{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","selfDefineTags":"TEST","prompt":"...","knowledge":""} -{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","selfDefineTags":"TEST_JSON","prompt":"...","knowledge":""} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/output_execute_model.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/output_execute_model.jsonl deleted file mode 100644 index a0e3126e4..000000000 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/output_execute_model.jsonl +++ /dev/null @@ -1,5 +0,0 @@ -{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","llmOutput":"with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;","executeResult":{"性别":["Female","Male"],"平均年龄":["27.73","27.84"]},"errorMsg":null} -{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","llmOutput":"with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from 
gov_bonds_data group by `objective` order by `objective`;","executeResult":{"objective":["Capital Appreciation","Growth","Income"],"政府债券总量":["117","54","15"]},"errorMsg":null} -{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","llmOutput":"select 1","executeResult":{"colA":["x","y"]},"errorMsg":null} -{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","llmOutput":"{\"check\":\"ok\"}","executeResult":null,"errorMsg":null} -{"serialNo":5,"analysisModelId":"D2025050900161503000025249569","question":"缺少匹配标准的case","llmOutput":"select * from t","executeResult":null,"errorMsg":"execution error"} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/output_round1_modelA.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/output_round1_modelA.jsonl deleted file mode 100644 index 9a69b4cbc..000000000 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/output_round1_modelA.jsonl +++ /dev/null @@ -1,4 +0,0 @@ -{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","llmOutput":"with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;","executeResult":{"性别":["Female","Male"],"平均年龄":["27.73","27.84"]},"errorMsg":null} -{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","llmOutput":"with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;","executeResult":{"objective":["Capital Appreciation","Growth","Income"],"政府债券总量":["117","54","15"]},"errorMsg":null} -{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","llmOutput":"select 
1","executeResult":{"colA":["x","y"]},"errorMsg":null} -{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","llmOutput":"{\"check\":\"ok\"}","executeResult":null,"errorMsg":null} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/output_round1_modelB.jsonl b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/output_round1_modelB.jsonl deleted file mode 100644 index 5589104ec..000000000 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/output_round1_modelB.jsonl +++ /dev/null @@ -1,4 +0,0 @@ -{"serialNo":1,"analysisModelId":"D2025050900161503000025249569","question":"各性别的平均年龄是多少,并按年龄顺序显示结果?","llmOutput":"with converted_data as (...)\nselect gender as `性别`, avg(age) as `平均年龄` from converted_data group by gender order by `平均年龄`;","executeResult":{"性别":["Female","Male"],"平均年龄":["27.73","27.84"]},"errorMsg":null} -{"serialNo":2,"analysisModelId":"D2025050900161503000025249569","question":"不同投资目标下政府债券的总量是多少,并按目标名称排序?","llmOutput":"with gov_bonds_data as (...)\nselect objective as `objective`, sum(gov_bond_value) as `政府债券总量` from gov_bonds_data group by `objective` order by `objective`;","executeResult":{"objective":["Capital Appreciation","Growth","Income"],"政府债券总量":["117","54","15"]},"errorMsg":null} -{"serialNo":3,"analysisModelId":"D2025050900161503000025249569","question":"用于触发双模型结果数不相等的case","llmOutput":"select 1","executeResult":{"colB":["x","z","w"]},"errorMsg":null} -{"serialNo":4,"analysisModelId":"D2025050900161503000025249569","question":"用于JSON对比策略的case","llmOutput":"{\"check\":\"ok\"}","executeResult":null,"errorMsg":null} \ No newline at end of file diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/standard_answers.xlsx b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/standard_answers.xlsx deleted file mode 100644 index 52411e9e9..000000000 
Binary files a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/template/standard_answers.xlsx and /dev/null differ diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py index 29add532a..bff082c91 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/benchmark/user_input_execute_service.py @@ -6,7 +6,8 @@ from typing import Dict, List, Optional, Union from dbgpt.util.benchmarks import StorageUtil from dbgpt_serve.evaluate.db.benchmark_db import BenchmarkResultDao from dbgpt_serve.evaluate.service.fetchdata.benchmark_data_manager import ( - get_benchmark_manager, + BENCHMARK_DEFAULT_DB_SCHEMA, + get_benchmark_manager, ) from .data_compare_service import DataCompareService @@ -26,8 +26,6 @@ from .models import ( RoundAnswerConfirmModel, ) -BENCHMARK_DEFAULT_DB_SCHEMA = "ant_icube_dev." - logger = logging.getLogger(__name__) diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py index 5712f96d0..ab08d675b 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py @@ -24,6 +24,7 @@ from dbgpt_ext.datasource.rdbms.conn_sqlite import SQLiteConnector logger = logging.getLogger(__name__) +BENCHMARK_DEFAULT_DB_SCHEMA = "ant_icube_dev." 
class BenchmarkDataConfig(BaseModel): """Configuration for Benchmark Data Manager""" @@ -31,7 +32,9 @@ class BenchmarkDataConfig(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) cache_dir: str = "cache" - db_path: str = os.path.join(BENCHMARK_DATA_ROOT_PATH, "ant_icube_dev.db") + db_path: str = os.path.join( + BENCHMARK_DATA_ROOT_PATH, BENCHMARK_DEFAULT_DB_SCHEMA.rstrip(".") + ".db" + ) table_mapping_file: str = os.path.join( BENCHMARK_DATA_ROOT_PATH, "table_mapping.json" ) diff --git a/pilot/benchmark_meta_data/2025_07_27_public_500_standard_benchmark_question_list.xlsx b/pilot/benchmark_meta_data/2025_07_27_public_500_standard_benchmark_question_list.xlsx index c5df65201..b5c7c2ba3 100644 Binary files a/pilot/benchmark_meta_data/2025_07_27_public_500_standard_benchmark_question_list.xlsx and b/pilot/benchmark_meta_data/2025_07_27_public_500_standard_benchmark_question_list.xlsx differ diff --git a/pilot/benchmark_meta_data/2025_07_27_public_500_standard_benchmark_question_list_v2.xlsx b/pilot/benchmark_meta_data/2025_07_27_public_500_standard_benchmark_question_list_v2.xlsx deleted file mode 100644 index b5c7c2ba3..000000000 Binary files a/pilot/benchmark_meta_data/2025_07_27_public_500_standard_benchmark_question_list_v2.xlsx and /dev/null differ