[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786)

* Add ColossalEval

* Delete evaluate in Chat

---------

Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Authored by Yuanchen on 2023-09-24 23:14:11 +08:00 and committed by GitHub.
parent 74aa7d964a
commit ce777853ae
60 changed files with 5314 additions and 2497 deletions

View File

@@ -0,0 +1,58 @@
{
"model": [
{
"name": "model1"
},
{
"name": "model2"
}
],
"dataset": [
{
"name": "mmlu",
"metrics": [
"first_token_accuracy",
"single_choice_accuracy",
"perplexity",
"ppl_score",
"ppl_score_over_choices"
]
},
{
"name": "cmmlu",
"metrics": [
"first_token_accuracy",
"single_choice_accuracy",
"perplexity",
"ppl_score",
"ppl_score_over_choices"
]
},
{
"name": "agieval",
"metrics": [
"first_token_accuracy",
"single_choice_accuracy",
"multi_choice_accuracy",
"math_equivalence",
"perplexity",
"ppl_score_over_choices",
"ppl_score"
]
},
{
"name": "gaokaobench",
"metrics": [
"first_token_accuracy",
"single_choice_accuracy",
"multi_choice_accuracy",
"math_equivalence",
"rouge_score",
"rouge_zh_score",
"perplexity",
"ppl_score_over_choices",
"ppl_score"
]
}
]
}
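The config above only pairs each dataset with the metrics to report for it; the names under "model" must match the names used when running inference. As a quick sanity check, the structure can be walked with nothing but the standard json module. The snippet below is a minimal sketch under the assumption that the JSON is saved as config_evaluation.json (a hypothetical file name); the real pipeline reads the same structure through colossal_eval.utils.jload in eval_dataset.py further down.

# Minimal sketch: list every (dataset, metric, model) combination declared in the
# evaluation config. Assumes the JSON above is saved as config_evaluation.json.
import json

with open("config_evaluation.json", "r", encoding="utf-8") as f:
    config = json.load(f)

for dataset in config["dataset"]:
    for metric in dataset["metrics"]:
        for model in config["model"]:
            print(f'{dataset["name"]:<12} {metric:<24} {model["name"]}')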

View File

@@ -0,0 +1,84 @@
{
"model": [
{
"name": "model name",
"model_class": "HuggingFaceCausalLM",
"parameters": {
"path": "path to model",
"model_max_length": 4096,
"tokenizer_path": "",
"tokenizer_kwargs": {
"trust_remote_code": true
},
"peft_path": null,
"model_kwargs": {
"torch_dtype": "torch.float32",
"trust_remote_code": true
},
"prompt_template": "plain",
"batch_size": 4
}
},
{
"name": "model2 name",
"model_class": "HuggingFaceCausalLM",
"parameters": {
"path": "path to model2",
"model_max_length": 4096,
"tokenizer_path": "",
"tokenizer_kwargs": {
"trust_remote_code": true
},
"peft_path": null,
"model_kwargs": {
"torch_dtype": "torch.float32",
"trust_remote_code": true
},
"prompt_template": "plain",
"batch_size": 4
}
}
],
"dataset": [
{
"name": "agieval",
"dataset_class": "AGIEvalDataset",
"debug": false,
"few_shot": false,
"path": "path to original dataset (folder)",
"save_path": "path to save converted dataset (e.g. inference_data/agieval.json)"
},
{
"name": "ceval",
"dataset_class": "CEvalDataset",
"debug": false,
"few_shot": true,
"path": "path to original dataset (folder)",
"save_path": "path to save converted dataset (e.g. inference_data/ceval.json)"
},
{
"name": "cmmlu",
"dataset_class": "CMMLUDataset",
"debug": false,
"few_shot": true,
"path": "path to original dataset (folder)",
"save_path": "path to save converted dataset (e.g. inference_data/cmmlu.json)"
},
{
"name": "gaokaobench",
"dataset_class": "GaoKaoBenchDataset",
"debug": false,
"few_shot": false,
"path": "path to original dataset (folder)",
"save_path": "path to save converted dataset (e.g. inference_data/gaokaobench.json)"
},
{
"name": "mmlu",
"dataset_class": "MMLUDataset",
"debug": false,
"few_shot": true,
"path": "path to original dataset (folder)",
"save_path": "path to save converted dataset (e.g. inference_data/mmlu.json)"
}
]
}
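Note that "torch_dtype" inside model_kwargs is a plain string such as "torch.float32"; how it is interpreted is up to the HuggingFaceCausalLM wrapper. The snippet below is only a hedged sketch of one way such a string could be resolved to a real torch dtype before loading the model; the helper name resolve_torch_dtype is hypothetical and is not claimed to be what ColossalEval actually does internally.

# Hypothetical helper (illustration only, not the ColossalEval implementation):
# map the "torch_dtype" string from model_kwargs to an actual torch dtype.
import torch

def resolve_torch_dtype(name: str) -> torch.dtype:
    attr = name.removeprefix("torch.")  # "torch.float32" -> "float32"
    dtype = getattr(torch, attr, None)
    if not isinstance(dtype, torch.dtype):
        raise ValueError(f"Unrecognized torch dtype string: {name!r}")
    return dtype

print(resolve_torch_dtype("torch.float32"))  # torch.float32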

View File

@@ -0,0 +1,73 @@
import argparse
import os
import tabulate
from colossal_eval.evaluate.dataset_evaluator import DatasetEvaluator
from colossal_eval.utils import jdump, jload
def main(args):
config = jload(args.config)
evaluation_results = {dataset["name"]: {} for dataset in config["dataset"]}
evaluation_results_table = {dataset["name"]: {} for dataset in config["dataset"]}
evaluator = DatasetEvaluator()
for dataset_parameter in config["dataset"]:
dataset_name = dataset_parameter["name"]
metrics = dataset_parameter["metrics"]
results_metric_model = {metric: {model["name"]: None for model in config["model"]} for metric in metrics}
for model in config["model"]:
model_name = model["name"]
data = jload(
os.path.join(args.inference_results_path, model_name, f"{dataset_name}_inference_results.json")
)
results = evaluator.get_evaluation_results(data, dataset_name, model_name, metrics)
for metric, score in results.items():
results_metric_model[metric][model_name] = score["ALL"]
evaluation_results[dataset_name][model_name] = results
evaluation_results_table[dataset_name] = results_metric_model
table = []
header = ["dataset", "metric"] + [model["name"] for model in config["model"]]
table.append(header)
for dataset_parameter in config["dataset"]:
dataset_name = dataset_parameter["name"]
metrics = dataset_parameter["metrics"]
for metric, model_results in evaluation_results_table[dataset_name].items():
row = [dataset_name]
for model, score in model_results.items():
if len(row) == 1:
row.extend([metric, "{:.02f}".format(score)])
else:
row.append("{:.02f}".format(score))
table.append(row)
table = tabulate.tabulate(table, headers="firstrow")
print(table)
os.makedirs(args.evaluation_results_save_path, exist_ok=True)
with open(os.path.join(args.evaluation_results_save_path, "evaluation_results_table.txt"), "w") as file:
file.write(table)
jdump(evaluation_results, os.path.join(args.evaluation_results_save_path, "evaluation_results.json"))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ColossalEval evaluation process.")
parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
parser.add_argument("--inference_results_path", type=str, default=None, help="path to inference results")
parser.add_argument(
"--evaluation_results_save_path", type=str, default=None, help="path to save evaluation results"
)
args = parser.parse_args()
main(args)

View File

@@ -0,0 +1,4 @@
python eval_dataset.py \
--config "path to config file" \
--inference_results_path "path to inference results" \
--evaluation_results_save_path "path to save evaluation results"
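For orientation, eval_dataset.py prints (and writes to evaluation_results_table.txt) a tabulate table whose header row is ["dataset", "metric"] followed by one column per model, with scores formatted to two decimals. The rows below are purely illustrative numbers, not real results, and the exact spacing depends on tabulate's default format:

dataset    metric                    model1    model2
---------  ------------------------  --------  --------
mmlu       first_token_accuracy      45.23     51.07
mmlu       single_choice_accuracy    44.80     50.65
cmmlu      first_token_accuracy      38.12     47.95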

View File

@@ -0,0 +1,171 @@
import argparse
import copy
import os
from typing import Dict, List
import torch
import torch.distributed as dist
from colossal_eval import dataset, models, utils
import colossalai
from colossalai.logging import get_dist_logger
logger = get_dist_logger()
def rm_and_merge(world_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
"""
    Remove the per-rank inference results and merge them into one file.
    Args:
        world_size: Number of processes used for inference.
        save_path: The folder storing the inference results.
        model_names: Names of the models used for inference.
        dataset_names: Names of the datasets used for inference.
"""
for model_name in model_names:
for dataset_name, categories in dataset_names.items():
all_answers = {}
for category in categories:
all_answers[category] = {"data": []}
answers = {"data": []}
for r in range(world_size):
directory = os.path.join(
save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
)
if not os.path.exists(directory):
raise Exception(
f"Directory {directory} not found. There may be an error during inference time."
)
else:
rank_answers = utils.jload(directory)
answers["data"].extend(rank_answers["data"])
answers["inference_kwargs"] = rank_answers["inference_kwargs"]
for r in range(world_size):
try:
directory = os.path.join(
save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
)
os.remove(directory)
except Exception as e:
print(e)
all_answers[category] = answers
logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
logger.info(f"Save inference results of model {model_name} for all dataset.")
logger.info(f"Save inference results of all models for all dataset.")
def main(args):
colossalai.launch_from_torch(config={}, seed=42)
world_size = dist.get_world_size()
rank = dist.get_rank()
inference_data = {}
debug_args = {}
few_shot_args = {}
config = utils.jload(args.config)
model_parameters = config["model"]
dataset_parameters = config["dataset"]
for dataset_parameter in dataset_parameters:
path = dataset_parameter["path"]
save_path = dataset_parameter["save_path"]
dataset_name = dataset_parameter["name"]
debug_args[dataset_name] = dataset_parameter["debug"]
few_shot_args[dataset_name] = dataset_parameter["few_shot"]
if not args.load_dataset:
if os.path.exists(save_path):
dataset_ = utils.jload(save_path)
inference_data[dataset_name] = dataset_["test"]
else:
raise Exception(
"Can't find the converted dataset. You may set load_dataset True to store the dataset first."
)
continue
dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
if not issubclass(dataset_class, dataset.BaseDataset):
raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
dataset_ = dataset_class(path, logger, dataset_parameter["few_shot"])
dataset_.save(save_path)
inference_data[dataset_name] = dataset_.dataset["test"]
for model_parameter in model_parameters:
model_name = model_parameter["name"]
model_class = eval(f"models.{model_parameter['model_class']}")
        parameters = model_parameter["parameters"]
        parameters.update({"logger": logger})
        parameters.update({"prompt_template": utils.prompt_templates[parameters["prompt_template"]]})
        if not issubclass(model_class, models.BaseModel):
            raise ValueError(f"Model class {model_parameter['model_class']} is not a subclass of BaseModel.")
        model_ = model_class(**parameters)
for dataset_name, split_data in inference_data.items():
start = 0
for category, category_data in split_data.items():
if few_shot_args[dataset_name] and category_data["inference_kwargs"].get("few_shot_data", None) is None:
raise Exception(f"Dataset {dataset_name} doesn't have few-shot data for category {category}!")
answers_to_dump = copy.deepcopy(category_data)
partition_size = len(category_data["data"]) // world_size
redundant = len(category_data["data"]) % world_size
# Ensure that the amount of data for inference is as consistent as possible across different processes.
lengths = [partition_size for _ in range(world_size)]
for j in range(redundant):
lengths[(j + start) % world_size] += 1
start = (start + redundant) % world_size
questions = category_data["data"][sum(lengths[0:rank]) : sum(lengths[0:rank]) + lengths[rank]]
answers_per_rank = model_.inference(
questions, inference_kwargs=category_data["inference_kwargs"], debug=debug_args[dataset_name]
)
answers_to_dump["data"] = answers_per_rank
utils.jdump(
answers_to_dump,
os.path.join(
args.inference_save_path,
model_name,
f"{dataset_name}_{category}_inference_results_rank{rank}.json",
),
)
logger.info(f"Rank {rank} peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB")
del model_
torch.cuda.empty_cache()
dist.barrier()
if rank == 0:
model_names = [model_parameter["name"] for model_parameter in model_parameters]
dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
rm_and_merge(world_size, args.inference_save_path, model_names, dataset_names)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ColossalEval inference process.")
parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
parser.add_argument("--load_dataset", default=False, action="store_true")
parser.add_argument("--inference_save_path", type=str, default=None, help="path to save inference results")
args = parser.parse_args()
main(args)
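One detail of main worth spelling out: each category's samples are split as evenly as possible across ranks, and the leftover samples are rotated through the ranks via start so that no single rank keeps absorbing the remainder category after category. The standalone sketch below mirrors that arithmetic with a small worked example; it is not an additional API, just the same logic extracted for clarity.

# Sketch of the per-rank partitioning used in main: 10 samples over 4 ranks with
# start = 0 gives lengths [3, 3, 2, 2]; the updated start rotates the remainder.
def partition_lengths(num_samples: int, world_size: int, start: int):
    partition_size = num_samples // world_size
    redundant = num_samples % world_size
    lengths = [partition_size] * world_size
    for j in range(redundant):
        lengths[(j + start) % world_size] += 1
    return lengths, (start + redundant) % world_size

lengths, start = partition_lengths(10, 4, 0)
print(lengths, start)  # [3, 3, 2, 2] 2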

View File

@@ -0,0 +1,4 @@
torchrun --nproc_per_node=1 inference.py \
--config "path to config file" \
--load_dataset \
--inference_save_path "path to save inference results"
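Because inference.py shards every category across dist.get_world_size() ranks and rank 0 merges the per-rank JSON files afterwards, the same command scales to multiple GPUs on one node simply by raising --nproc_per_node; the value 4 below is only an example.

torchrun --nproc_per_node=4 inference.py \
    --config "path to config file" \
    --load_dataset \
    --inference_save_path "path to save inference results"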

View File

@@ -0,0 +1,44 @@
{
"language": "en",
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"reasonableness"
]
},
"chat": {
"GPT": [
"language organization",
"naturalness",
"engagingness",
"fidelity"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
]
}
}
}

View File

@@ -0,0 +1,33 @@
{
"model": [
{
"name": "model name",
"model_class": "HuggingFaceCausalLM",
"parameters": {
"path": "path to model",
"model_max_length": 4096,
"tokenizer_path": "",
"tokenizer_kwargs": {
"trust_remote_code": true
},
"peft_path": null,
"model_kwargs": {
"torch_dtype": "torch.float32",
"trust_remote_code": true
},
"prompt_template": "plain",
"batch_size": 4
}
}
],
"dataset": [
{
"name": "colossal",
"dataset_class": "ColossalDataset",
"debug": false,
"few_shot": false,
"path": "../../configs/gpt_evaluation/data/eval_en_examples.json",
"save_path": "path to save converted dataset (inference_data/colossal.json)"
}
]
}

View File

@@ -0,0 +1,139 @@
import argparse
import os
import openai
from colossal_eval.evaluate.evaluator import Evaluator
from colossal_eval.utils import jload
def main(args):
assert len(args.answer_file_list) == len(
args.model_name_list
), "The number of answer files and model names should be equal!"
# load config
config = jload(args.config_file)
if config["language"] in ["cn", "en"]:
# get metric settings for all categories
metrics_per_category = {}
for category in config["category"].keys():
metrics_all = {}
for metric_type, metrics in config["category"][category].items():
metrics_all[metric_type] = metrics
metrics_per_category[category] = metrics_all
battle_prompt = None
if args.battle_prompt_file:
battle_prompt = jload(args.battle_prompt_file)
gpt_evaluation_prompt = None
if args.gpt_evaluation_prompt_file:
gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)
if len(args.model_name_list) == 2 and not battle_prompt:
raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")
if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
raise Exception(
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!"
)
if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
raise Exception(
"GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
)
# initialize evaluator
evaluator = Evaluator(
metrics_per_category,
battle_prompt,
gpt_evaluation_prompt,
args.gpt_model,
config["language"],
args.gpt_with_reference,
)
if len(args.model_name_list) == 2:
answers_1 = jload(args.answer_file_list[0])
answers_2 = jload(args.answer_file_list[1])
answers1 = []
for category, value in answers_1.items():
answers1.extend(value["data"])
answers2 = []
for category, value in answers_2.items():
answers2.extend(value["data"])
assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"
evaluator.battle(answers1=answers1, answers2=answers2)
evaluator.save(args.save_path, args.model_name_list)
elif len(args.model_name_list) == 1:
targets = jload(args.target_file)
answers = jload(args.answer_file_list[0])
references = []
for category, value in targets["test"].items():
references.extend(value["data"])
predictions = []
for category, value in answers.items():
predictions.extend(value["data"])
assert len(references) == len(
predictions
), "The number of target answers and model answers should be equal!"
evaluator.evaluate(
answers=predictions, targets=references, save_path=args.save_path, model_name=args.model_name_list[0]
)
evaluator.save(args.save_path, args.model_name_list)
else:
raise ValueError("Unsupported number of answer files and model names!")
else:
raise ValueError(f'Unsupported language {config["language"]}!')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")
parser.add_argument(
"--config_file", type=str, default=None, required=True, help="path to the file of target results"
)
parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")
parser.add_argument(
"--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for gpt evaluation"
)
parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")
parser.add_argument(
"--answer_file_list",
type=str,
nargs="+",
default=[],
required=True,
help="path to the answer files of at most 2 models",
)
parser.add_argument(
"--model_name_list", type=str, nargs="+", default=[], required=True, help="the names of at most 2 models"
)
parser.add_argument(
"--gpt_model",
default="gpt-3.5-turbo-16k",
choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4"],
help="which GPT model to use for evaluation",
)
parser.add_argument(
"--gpt_with_reference",
default=False,
action="store_true",
help="whether to include reference answer in gpt evaluation",
)
parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")
parser.add_argument("--openai_key", type=str, default=None, required=True, help="Your openai key")
args = parser.parse_args()
if args.openai_key is not None:
os.environ["OPENAI_API_KEY"] = args.openai_key
openai.api_key = os.getenv("OPENAI_API_KEY")
main(args)

View File

@@ -0,0 +1,9 @@
python eval.py \
--config_file "path to the config file" \
--battle_prompt_file "path to the prompt file for battle" \
--gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
--target_file "path to the target answer file" \
--answer_file_list "path to the answer files of at most 2 models" \
--model_name_list "the names of at most 2 models" \
--save_path "path to save results" \
--openai_key "your openai key" \

View File

@@ -0,0 +1,171 @@
import argparse
import copy
import os
from typing import Dict, List
import torch
import torch.distributed as dist
from colossal_eval import dataset, models, utils
import colossalai
from colossalai.logging import get_dist_logger
logger = get_dist_logger()
def rm_and_merge(world_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
"""
    Remove the per-rank inference results and merge them into one file.
    Args:
        world_size: Number of processes used for inference.
        save_path: The folder storing the inference results.
        model_names: Names of the models used for inference.
        dataset_names: Names of the datasets used for inference.
"""
for model_name in model_names:
for dataset_name, categories in dataset_names.items():
all_answers = {}
for category in categories:
all_answers[category] = {"data": []}
answers = {"data": []}
for r in range(world_size):
directory = os.path.join(
save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
)
if not os.path.exists(directory):
raise Exception(
f"Directory {directory} not found. There may be an error during inference time."
)
else:
rank_answers = utils.jload(directory)
answers["data"].extend(rank_answers["data"])
answers["inference_kwargs"] = rank_answers["inference_kwargs"]
for r in range(world_size):
try:
directory = os.path.join(
save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
)
os.remove(directory)
except Exception as e:
print(e)
all_answers[category] = answers
logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
logger.info(f"Save inference results of model {model_name} for all dataset.")
logger.info(f"Save inference results of all models for all dataset.")
def main(args):
colossalai.launch_from_torch(config={}, seed=42)
world_size = dist.get_world_size()
rank = dist.get_rank()
inference_data = {}
debug_args = {}
few_shot_args = {}
config = utils.jload(args.config)
model_parameters = config["model"]
dataset_parameters = config["dataset"]
for dataset_parameter in dataset_parameters:
path = dataset_parameter["path"]
save_path = dataset_parameter["save_path"]
dataset_name = dataset_parameter["name"]
debug_args[dataset_name] = dataset_parameter["debug"]
few_shot_args[dataset_name] = dataset_parameter["few_shot"]
if not args.load_dataset:
if os.path.exists(save_path):
dataset_ = utils.jload(save_path)
inference_data[dataset_name] = dataset_["test"]
else:
raise Exception(
"Can't find the converted dataset. You may set load_dataset True to store the dataset first."
)
continue
dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
if not issubclass(dataset_class, dataset.BaseDataset):
raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
dataset_ = dataset_class(path, logger, dataset_parameter["few_shot"])
dataset_.save(save_path)
inference_data[dataset_name] = dataset_.dataset["test"]
for model_parameter in model_parameters:
model_name = model_parameter["name"]
model_class = eval(f"models.{model_parameter['model_class']}")
        parameters = model_parameter["parameters"]
        parameters.update({"logger": logger})
        parameters.update({"prompt_template": utils.prompt_templates[parameters["prompt_template"]]})
        if not issubclass(model_class, models.BaseModel):
            raise ValueError(f"Model class {model_parameter['model_class']} is not a subclass of BaseModel.")
        model_ = model_class(**parameters)
for dataset_name, split_data in inference_data.items():
start = 0
for category, category_data in split_data.items():
if few_shot_args[dataset_name] and category_data["inference_kwargs"].get("few_shot_data", None) is None:
raise Exception(f"Dataset {dataset_name} doesn't have few-shot data for category {category}!")
answers_to_dump = copy.deepcopy(category_data)
partition_size = len(category_data["data"]) // world_size
redundant = len(category_data["data"]) % world_size
# Ensure that the amount of data for inference is as consistent as possible across different processes.
lengths = [partition_size for _ in range(world_size)]
for j in range(redundant):
lengths[(j + start) % world_size] += 1
start = (start + redundant) % world_size
questions = category_data["data"][sum(lengths[0:rank]) : sum(lengths[0:rank]) + lengths[rank]]
answers_per_rank = model_.inference(
questions, inference_kwargs=category_data["inference_kwargs"], debug=debug_args[dataset_name]
)
answers_to_dump["data"] = answers_per_rank
utils.jdump(
answers_to_dump,
os.path.join(
args.inference_save_path,
model_name,
f"{dataset_name}_{category}_inference_results_rank{rank}.json",
),
)
logger.info(f"Rank {rank} peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB")
del model_
torch.cuda.empty_cache()
dist.barrier()
if rank == 0:
model_names = [model_parameter["name"] for model_parameter in model_parameters]
dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
rm_and_merge(world_size, args.inference_save_path, model_names, dataset_names)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ColossalEval inference process.")
parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
parser.add_argument("--load_dataset", default=False, action="store_true")
parser.add_argument("--inference_save_path", type=str, default=None, help="path to save inference results")
args = parser.parse_args()
main(args)

View File

@@ -0,0 +1,4 @@
torchrun --nproc_per_node=1 inference.py \
--config "path to config file" \
--load_dataset \
--inference_save_path "path to save inference results"