From eae01b67402fe0827fc9cfb843a45c9e6c6f3b14 Mon Sep 17 00:00:00 2001 From: Yuanchen <70520919+chengeharrison@users.noreply.github.com> Date: Fri, 22 Dec 2023 14:52:50 +0800 Subject: [PATCH] Improve logic for selecting metrics (#5196) Co-authored-by: Xu --- .../dataset_evaluator/dataset_evaluator.py | 15 +++++------ .../evaluate/dataset_evaluator/metrics.py | 20 +++++++-------- .../examples/dataset_evaluation/inference.py | 25 ++++++++++++++++--- .../examples/gpt_evaluation/inference.py | 25 ++++++++++++++++--- 4 files changed, 62 insertions(+), 23 deletions(-) diff --git a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py index 8a9873a28..37dbac3cf 100644 --- a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py +++ b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py @@ -1,5 +1,5 @@ import os -from typing import Dict, List +from typing import Dict, List, Union import colossal_eval.evaluate.dataset_evaluator.metrics as metric_helper import numpy as np @@ -279,7 +279,9 @@ class DatasetEvaluator(object): return self.evaluation_results - def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name: str, metrics: List[str]): + def get_evaluation_results( + self, data: Dict[str, Union[str, Dict]], dataset_name: str, model_name: str, metrics: List[str] + ): """ Evaluate inference data on the given metrics. @@ -290,10 +292,11 @@ class DatasetEvaluator(object): metrics: Metrics used to evaluate. """ - self.data = data + self.data = data["inference_results"] self.dataset_name = dataset_name + self.dataset_class = data["dataset_class"] self.model_name = model_name - self.categories = list(data.keys()) + self.categories = list(self.data.keys()) self.metrics = metrics self.judgements = {} @@ -313,9 +316,7 @@ class DatasetEvaluator(object): for metric in self.metrics: # Train and reference split use same metric as test split. - self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_name.split("_")[0]][ - metric - ] + self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_class][metric] if "ALL" in self.suggested_categories[metric]: self.suggested_categories[metric] = self.categories self.metric_total_length[metric] = self.total_length diff --git a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py index 030059b14..d734eabdb 100644 --- a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py +++ b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py @@ -25,7 +25,7 @@ metrics4subcategory = { "per_byte_ppl_score": ["ALL"], }, # The commented are non 4-choice questions. 
- "agieval": { + "AGIEvalDataset": { "combined_single_choice_accuracy": [ # "lsat-ar", # "lsat-lr", @@ -103,14 +103,14 @@ metrics4subcategory = { ], "ppl_score": ["ALL"], }, - "cmmlu": { + "CMMLUDataset": { "first_token_accuracy": ["ALL"], "single_choice_accuracy": ["ALL"], "perplexity": ["ALL"], "ppl_score_over_choices": ["ALL"], "ppl_score": ["ALL"], }, - "gaokaobench": { + "GaoKaoBenchDataset": { "combined_single_choice_accuracy": [ "English MCQs", "Biology MCQs", @@ -170,7 +170,7 @@ metrics4subcategory = { "ppl_score_over_choices": ["ALL"], "ppl_score": ["ALL"], }, - "longbench": { + "LongBenchDataset": { "f1_score": ["hotpotqa", "2wikimqa", "musique", "narrativeqa", "qasper", "multifieldqa_en", "triviaqa"], "f1_zh_score": ["multifieldqa_zh"], "rouge_score": ["gov_report", "qmsum", "multi_news", "samsum"], @@ -183,7 +183,7 @@ metrics4subcategory = { "perplexity": ["ALL"], "ppl_score": ["ALL"], }, - "mmlu": { + "MMLUDataset": { "first_token_accuracy": ["ALL"], "single_choice_accuracy": ["ALL"], "accuracy": ["ALL"], @@ -191,11 +191,11 @@ metrics4subcategory = { "ppl_score_over_choices": ["ALL"], "ppl_score": ["ALL"], }, - "mtbench": {"mtbench_single_judge": ["ALL"]}, - "cvalues": {"first_token_accuracy": ["ALL"]}, - "safetybench_zh": {"first_token_accuracy": ["ALL"]}, - "safetybench_en": {"first_token_accuracy": ["ALL"]}, - "gsm": { + "MTBenchDataset": {"mtbench_single_judge": ["ALL"]}, + "CValuesDataset": {"first_token_accuracy": ["ALL"]}, + "SafetyBenchZHDataset": {"first_token_accuracy": ["ALL"]}, + "SafetyBenchENDataset": {"first_token_accuracy": ["ALL"]}, + "GSMDataset": { "loss_over_all_tokens": ["ALL"], "gsm_accuracy": ["ALL"], }, diff --git a/applications/ColossalEval/examples/dataset_evaluation/inference.py b/applications/ColossalEval/examples/dataset_evaluation/inference.py index 3f066e721..5b09f9de8 100644 --- a/applications/ColossalEval/examples/dataset_evaluation/inference.py +++ b/applications/ColossalEval/examples/dataset_evaluation/inference.py @@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig logger = get_dist_logger() -def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None: +def rm_and_merge( + dp_size: int, + save_path: str, + model_names: List[str], + dataset_names: Dict[str, List], + dataset_classes: Dict[str, List], +) -> None: """ Remove inference result per rank and merge them into one file. @@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n save_path: The folder for storing inference results. model_names: Names of models for inference. dataset_names: Names of dataset for inference. + dataset_classes: Dataset class for different inference results. We need to save dataset class to smooth the evaluation process. 
""" for model_name in model_names: for dataset_name, categories in dataset_names.items(): + all_answers_with_dataset_class = {} + all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name] + all_answers = {} for category in categories: all_answers[category] = {"data": []} @@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n all_answers[category] = answers + all_answers_with_dataset_class["inference_results"] = all_answers + logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.") - utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json")) + utils.jdump( + all_answers_with_dataset_class, + os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"), + ) logger.info(f"Save inference results of model {model_name} for all dataset.") logger.info(f"Save inference results of all models for all dataset.") @@ -98,6 +113,7 @@ def main(args): ) inference_data = {} + dataset_classes = {} debug_args = {} few_shot_args = {} multiturn_args = {} @@ -128,6 +144,7 @@ def main(args): continue + dataset_classes[dataset_name] = dataset_parameter["dataset_class"] dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}") if not issubclass(dataset_class, dataset.BaseDataset): raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.") @@ -149,12 +166,14 @@ def main(args): debug_args[new_dataset_name] = dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["train"] + dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"] if load_reference and "reference" in dataset_.dataset: new_dataset_name = f"{dataset_name}_reference" debug_args[new_dataset_name] = dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["reference"] + dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"] if rank == 0: logger.info(f"Dataset for inference are: {list(inference_data.keys())}") @@ -225,7 +244,7 @@ def main(args): if rank == 0: model_names = [model_parameter["name"] for model_parameter in model_parameters] dataset_names = {key: list(inference_data[key].keys()) for key in inference_data} - rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names) + rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes) if __name__ == "__main__": diff --git a/applications/ColossalEval/examples/gpt_evaluation/inference.py b/applications/ColossalEval/examples/gpt_evaluation/inference.py index 3f066e721..5b09f9de8 100644 --- a/applications/ColossalEval/examples/gpt_evaluation/inference.py +++ b/applications/ColossalEval/examples/gpt_evaluation/inference.py @@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig logger = get_dist_logger() -def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None: +def rm_and_merge( + dp_size: int, + save_path: str, + model_names: List[str], + dataset_names: Dict[str, List], + dataset_classes: Dict[str, List], +) -> None: """ Remove inference result per rank and merge them into one file. @@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n save_path: The folder for storing inference results. model_names: Names of models for inference. 
dataset_names: Names of dataset for inference. + dataset_classes: Dataset class for different inference results. We need to save dataset class to smooth the evaluation process. """ for model_name in model_names: for dataset_name, categories in dataset_names.items(): + all_answers_with_dataset_class = {} + all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name] + all_answers = {} for category in categories: all_answers[category] = {"data": []} @@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n all_answers[category] = answers + all_answers_with_dataset_class["inference_results"] = all_answers + logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.") - utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json")) + utils.jdump( + all_answers_with_dataset_class, + os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"), + ) logger.info(f"Save inference results of model {model_name} for all dataset.") logger.info(f"Save inference results of all models for all dataset.") @@ -98,6 +113,7 @@ def main(args): ) inference_data = {} + dataset_classes = {} debug_args = {} few_shot_args = {} multiturn_args = {} @@ -128,6 +144,7 @@ def main(args): continue + dataset_classes[dataset_name] = dataset_parameter["dataset_class"] dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}") if not issubclass(dataset_class, dataset.BaseDataset): raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.") @@ -149,12 +166,14 @@ def main(args): debug_args[new_dataset_name] = dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["train"] + dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"] if load_reference and "reference" in dataset_.dataset: new_dataset_name = f"{dataset_name}_reference" debug_args[new_dataset_name] = dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["reference"] + dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"] if rank == 0: logger.info(f"Dataset for inference are: {list(inference_data.keys())}") @@ -225,7 +244,7 @@ def main(args): if rank == 0: model_names = [model_parameter["name"] for model_parameter in model_parameters] dataset_names = {key: list(inference_data[key].keys()) for key in inference_data} - rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names) + rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes) if __name__ == "__main__":
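
Note (not part of the patch): the sketch below illustrates the data flow this change introduces. rm_and_merge now writes {dataset_name}_inference_results.json as a wrapper holding both "dataset_class" and "inference_results", and DatasetEvaluator.get_evaluation_results selects per-metric categories by that class name instead of parsing dataset_name.split("_")[0]. The toy metrics4subcategory table and the helper suggested_categories are hypothetical stand-ins that only mirror the structure used in metrics.py and dataset_evaluator.py; they are a minimal sketch, not the library's API.

from typing import Dict, List, Union

# Toy table keyed by dataset class name (e.g. "CMMLUDataset"), no longer by
# name prefixes such as "cmmlu"/"cmmlu_train"; mirrors metric_helper.metrics4subcategory.
metrics4subcategory: Dict[str, Dict[str, List[str]]] = {
    "CMMLUDataset": {
        "first_token_accuracy": ["ALL"],
        "single_choice_accuracy": ["ALL"],
    },
}

def suggested_categories(data: Dict[str, Union[str, Dict]], metrics: List[str]) -> Dict[str, List[str]]:
    """Hypothetical helper mimicking how get_evaluation_results resolves categories per metric."""
    inference_results = data["inference_results"]  # category -> {"data": [...]}
    dataset_class = data["dataset_class"]          # saved by rm_and_merge at inference time
    categories = list(inference_results.keys())
    resolved = {}
    for metric in metrics:
        wanted = metrics4subcategory[dataset_class][metric]
        # "ALL" means: evaluate the metric on every category found in the inference file.
        resolved[metric] = categories if "ALL" in wanted else wanted
    return resolved

if __name__ == "__main__":
    # Shape of {dataset_name}_inference_results.json after this patch (toy data).
    merged = {
        "dataset_class": "CMMLUDataset",
        "inference_results": {"agronomy": {"data": []}, "anatomy": {"data": []}},
    }
    print(suggested_categories(merged, ["first_token_accuracy"]))
    # -> {'first_token_accuracy': ['agronomy', 'anatomy']}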