Mirror of https://github.com/hpcaitech/ColossalAI.git
[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786)
* Add ColossalEval
* Delete evaluate in Chat

Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
@@ -0,0 +1,58 @@
{
    "model": [
        {
            "name": "model1"
        },
        {
            "name": "model2"
        }
    ],
    "dataset": [
        {
            "name": "mmlu",
            "metrics": [
                "first_token_accuracy",
                "single_choice_accuracy",
                "perplexity",
                "ppl_score",
                "ppl_score_over_choices"
            ]
        },
        {
            "name": "cmmlu",
            "metrics": [
                "first_token_accuracy",
                "single_choice_accuracy",
                "perplexity",
                "ppl_score",
                "ppl_score_over_choices"
            ]
        },
        {
            "name": "agieval",
            "metrics": [
                "first_token_accuracy",
                "single_choice_accuracy",
                "multi_choice_accuracy",
                "math_equivalence",
                "perplexity",
                "ppl_score_over_choices",
                "ppl_score"
            ]
        },
        {
            "name": "gaokaobench",
            "metrics": [
                "first_token_accuracy",
                "single_choice_accuracy",
                "multi_choice_accuracy",
                "math_equivalence",
                "rouge_score",
                "rouge_zh_score",
                "perplexity",
                "ppl_score_over_choices",
                "ppl_score"
            ]
        }
    ]
}
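For a quick sanity check before a long evaluation run, a minimal standalone sketch (standard library only; the file name evaluation_config.json is a placeholder, not a path used by the pipeline) that loads a config like the one above and prints the metrics requested per dataset:

import json

# Placeholder path; point this at your own evaluation config.
CONFIG_PATH = "evaluation_config.json"

with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

# Judging from the template above, every model entry needs a name and
# every dataset entry needs a name plus a list of metrics.
for model in config["model"]:
    assert "name" in model, "each model entry needs a 'name'"

for dataset in config["dataset"]:
    assert "name" in dataset and "metrics" in dataset, "each dataset entry needs 'name' and 'metrics'"
    print(f"{dataset['name']}: {', '.join(dataset['metrics'])}")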
@@ -0,0 +1,84 @@
{
    "model": [
        {
            "name": "model name",
            "model_class": "HuggingFaceCausalLM",
            "parameters": {
                "path": "path to model",
                "model_max_length": 4096,
                "tokenizer_path": "",
                "tokenizer_kwargs": {
                    "trust_remote_code": true
                },
                "peft_path": null,
                "model_kwargs": {
                    "torch_dtype": "torch.float32",
                    "trust_remote_code": true
                },
                "prompt_template": "plain",
                "batch_size": 4
            }
        },
        {
            "name": "model2 name",
            "model_class": "HuggingFaceCausalLM",
            "parameters": {
                "path": "path to model2",
                "model_max_length": 4096,
                "tokenizer_path": "",
                "tokenizer_kwargs": {
                    "trust_remote_code": true
                },
                "peft_path": null,
                "model_kwargs": {
                    "torch_dtype": "torch.float32",
                    "trust_remote_code": true
                },
                "prompt_template": "plain",
                "batch_size": 4
            }
        }
    ],
    "dataset": [
        {
            "name": "agieval",
            "dataset_class": "AGIEvalDataset",
            "debug": false,
            "few_shot": false,
            "path": "path to original dataset (folder)",
            "save_path": "path to save converted dataset (e.g. inference_data/agieval.json)"
        },
        {
            "name": "ceval",
            "dataset_class": "CEvalDataset",
            "debug": false,
            "few_shot": true,
            "path": "path to original dataset (folder)",
            "save_path": "path to save converted dataset (e.g. inference_data/ceval.json)"
        },
        {
            "name": "cmmlu",
            "dataset_class": "CMMLUDataset",
            "debug": false,
            "few_shot": true,
            "path": "path to original dataset (folder)",
            "save_path": "path to save converted dataset (e.g. inference_data/cmmlu.json)"
        },
        {
            "name": "gaokaobench",
            "dataset_class": "GaoKaoBenchDataset",
            "debug": false,
            "few_shot": false,
            "path": "path to original dataset (folder)",
            "save_path": "path to save converted dataset (e.g. inference_data/gaokaobench.json)"
        },
        {
            "name": "mmlu",
            "dataset_class": "MMLUDataset",
            "debug": false,
            "few_shot": true,
            "path": "path to original dataset (folder)",
            "save_path": "path to save converted dataset (e.g. inference_data/mmlu.json)"
        }
    ]
}
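When benchmarking several checkpoints, writing this inference config by hand gets repetitive. Below is a small sketch that generates the model section programmatically; the checkpoint paths and the output file name are placeholders, and the field set simply mirrors the template above rather than anything mandated by ColossalEval:

import json

# Hypothetical checkpoint paths; replace with your own.
model_paths = {"model1": "/path/to/model1", "model2": "/path/to/model2"}

def model_entry(name: str, path: str) -> dict:
    # Field set mirrors the inference config template above.
    return {
        "name": name,
        "model_class": "HuggingFaceCausalLM",
        "parameters": {
            "path": path,
            "model_max_length": 4096,
            "tokenizer_path": "",
            "tokenizer_kwargs": {"trust_remote_code": True},
            "peft_path": None,
            "model_kwargs": {"torch_dtype": "torch.float32", "trust_remote_code": True},
            "prompt_template": "plain",
            "batch_size": 4,
        },
    }

config = {"model": [model_entry(n, p) for n, p in model_paths.items()], "dataset": []}

with open("inference_config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)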
@@ -0,0 +1,73 @@
import argparse
import os

import tabulate
from colossal_eval.evaluate.dataset_evaluator import DatasetEvaluator
from colossal_eval.utils import jdump, jload


def main(args):
    config = jload(args.config)

    evaluation_results = {dataset["name"]: {} for dataset in config["dataset"]}
    evaluation_results_table = {dataset["name"]: {} for dataset in config["dataset"]}
    evaluator = DatasetEvaluator()

    for dataset_parameter in config["dataset"]:
        dataset_name = dataset_parameter["name"]
        metrics = dataset_parameter["metrics"]
        results_metric_model = {metric: {model["name"]: None for model in config["model"]} for metric in metrics}
        for model in config["model"]:
            model_name = model["name"]

            data = jload(
                os.path.join(args.inference_results_path, model_name, f"{dataset_name}_inference_results.json")
            )
            results = evaluator.get_evaluation_results(data, dataset_name, model_name, metrics)

            for metric, score in results.items():
                results_metric_model[metric][model_name] = score["ALL"]

            evaluation_results[dataset_name][model_name] = results

        evaluation_results_table[dataset_name] = results_metric_model

    table = []
    header = ["dataset", "metric"] + [model["name"] for model in config["model"]]
    table.append(header)

    for dataset_parameter in config["dataset"]:
        dataset_name = dataset_parameter["name"]
        metrics = dataset_parameter["metrics"]

        for metric, model_results in evaluation_results_table[dataset_name].items():
            row = [dataset_name]
            for model, score in model_results.items():
                if len(row) == 1:
                    row.extend([metric, "{:.02f}".format(score)])
                else:
                    row.append("{:.02f}".format(score))

            table.append(row)

    table = tabulate.tabulate(table, headers="firstrow")
    print(table)

    os.makedirs(args.evaluation_results_save_path, exist_ok=True)

    with open(os.path.join(args.evaluation_results_save_path, "evaluation_results_table.txt"), "w") as file:
        file.write(table)

    jdump(evaluation_results, os.path.join(args.evaluation_results_save_path, "evaluation_results.json"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="ColossalEval evaluation process.")
    parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
    parser.add_argument("--inference_results_path", type=str, default=None, help="path to inference results")
    parser.add_argument(
        "--evaluation_results_save_path", type=str, default=None, help="path to save evaluation results"
    )
    args = parser.parse_args()

    main(args)
@@ -0,0 +1,4 @@
python eval_dataset.py \
    --config "path to config file" \
    --inference_results_path "path to inference results" \
    --evaluation_results_save_path "path to save evaluation results"
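Besides the printed table, eval_dataset.py dumps the raw scores to evaluation_results.json. Judging from the script above, the layout is dataset, then model, then metric, with an "ALL" aggregate per metric; a minimal sketch for reading one number back out (the results directory name is a placeholder for whatever was passed via --evaluation_results_save_path):

import json
import os

# Placeholder; the directory passed via --evaluation_results_save_path.
RESULTS_DIR = "evaluation_results"

with open(os.path.join(RESULTS_DIR, "evaluation_results.json"), "r", encoding="utf-8") as f:
    results = json.load(f)

# Structure inferred from eval_dataset.py: results[dataset][model][metric]["ALL"].
for dataset_name, per_model in results.items():
    for model_name, per_metric in per_model.items():
        for metric, scores in per_metric.items():
            print(f"{dataset_name:12s} {model_name:12s} {metric:25s} {scores['ALL']:.2f}")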
@@ -0,0 +1,171 @@
import argparse
import copy
import os
from typing import Dict, List

import torch
import torch.distributed as dist
from colossal_eval import dataset, models, utils

import colossalai
from colossalai.logging import get_dist_logger

logger = get_dist_logger()


def rm_and_merge(world_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
    """
    Remove the per-rank inference results and merge them into one file.

    Args:
        world_size: Number of processes for inference.
        save_path: The folder for storing inference results.
        model_names: Names of models for inference.
        dataset_names: Names of datasets for inference.

    """

    for model_name in model_names:
        for dataset_name, categories in dataset_names.items():
            all_answers = {}
            for category in categories:
                all_answers[category] = {"data": []}
                answers = {"data": []}

                for r in range(world_size):
                    file_path = os.path.join(
                        save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
                    )
                    if not os.path.exists(file_path):
                        raise Exception(
                            f"File {file_path} not found. There may have been an error during inference."
                        )
                    else:
                        rank_answers = utils.jload(file_path)
                        answers["data"].extend(rank_answers["data"])
                        answers["inference_kwargs"] = rank_answers["inference_kwargs"]

                for r in range(world_size):
                    try:
                        file_path = os.path.join(
                            save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
                        )
                        os.remove(file_path)
                    except Exception as e:
                        print(e)

                all_answers[category] = answers

            logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
            utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))

        logger.info(f"Save inference results of model {model_name} for all datasets.")
    logger.info("Save inference results of all models for all datasets.")


def main(args):
    colossalai.launch_from_torch(config={}, seed=42)
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    inference_data = {}
    debug_args = {}
    few_shot_args = {}

    config = utils.jload(args.config)

    model_parameters = config["model"]
    dataset_parameters = config["dataset"]

    for dataset_parameter in dataset_parameters:
        path = dataset_parameter["path"]
        save_path = dataset_parameter["save_path"]
        dataset_name = dataset_parameter["name"]
        debug_args[dataset_name] = dataset_parameter["debug"]
        few_shot_args[dataset_name] = dataset_parameter["few_shot"]

        if not args.load_dataset:
            if os.path.exists(save_path):
                dataset_ = utils.jload(save_path)
                inference_data[dataset_name] = dataset_["test"]
            else:
                raise Exception(
                    "Can't find the converted dataset. You may pass --load_dataset to convert and save the dataset first."
                )

            continue

        dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
        if not issubclass(dataset_class, dataset.BaseDataset):
            raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")

        dataset_ = dataset_class(path, logger, dataset_parameter["few_shot"])

        dataset_.save(save_path)
        inference_data[dataset_name] = dataset_.dataset["test"]

    for model_parameter in model_parameters:
        model_name = model_parameter["name"]
        model_class = eval(f"models.{model_parameter['model_class']}")
        parameters = model_parameter["parameters"]
        parameters.update({"logger": logger})
        parameters.update({"prompt_template": utils.prompt_templates[parameters["prompt_template"]]})

        if not issubclass(model_class, models.BaseModel):
            raise ValueError(f"Model class {model_parameter['model_class']} is not a subclass of BaseModel.")
        model_ = model_class(**parameters)

        for dataset_name, split_data in inference_data.items():
            start = 0
            for category, category_data in split_data.items():
                if few_shot_args[dataset_name] and category_data["inference_kwargs"].get("few_shot_data", None) is None:
                    raise Exception(f"Dataset {dataset_name} doesn't have few-shot data for category {category}!")

                answers_to_dump = copy.deepcopy(category_data)
                partition_size = len(category_data["data"]) // world_size
                redundant = len(category_data["data"]) % world_size

                # Ensure that the amount of data for inference is as consistent as possible across different processes.
                lengths = [partition_size for _ in range(world_size)]
                for j in range(redundant):
                    lengths[(j + start) % world_size] += 1

                start = (start + redundant) % world_size

                questions = category_data["data"][sum(lengths[0:rank]) : sum(lengths[0:rank]) + lengths[rank]]

                answers_per_rank = model_.inference(
                    questions, inference_kwargs=category_data["inference_kwargs"], debug=debug_args[dataset_name]
                )

                answers_to_dump["data"] = answers_per_rank

                utils.jdump(
                    answers_to_dump,
                    os.path.join(
                        args.inference_save_path,
                        model_name,
                        f"{dataset_name}_{category}_inference_results_rank{rank}.json",
                    ),
                )

        logger.info(f"Rank {rank} peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB")

        del model_
        torch.cuda.empty_cache()

    dist.barrier()
    if rank == 0:
        model_names = [model_parameter["name"] for model_parameter in model_parameters]
        dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
        rm_and_merge(world_size, args.inference_save_path, model_names, dataset_names)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="ColossalEval inference process.")
    parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
    parser.add_argument("--load_dataset", default=False, action="store_true")
    parser.add_argument("--inference_save_path", type=str, default=None, help="path to save inference results")
    args = parser.parse_args()

    main(args)
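The data-sharding arithmetic in inference.py (partition_size, redundant, and the rotating start offset) can be checked in isolation. A small standalone sketch with made-up sizes (world_size=4 and three categories of 10, 7 and 5 samples are illustrative assumptions, not values used by the pipeline):

# Standalone illustration of the per-rank partitioning used in inference.py.
# The category sizes and world_size below are made up for demonstration.
world_size = 4
category_sizes = [10, 7, 5]

start = 0
for size in category_sizes:
    partition_size = size // world_size
    redundant = size % world_size

    # Spread the remainder over ranks, rotating the starting rank per category
    # so no single rank keeps absorbing the extra samples.
    lengths = [partition_size] * world_size
    for j in range(redundant):
        lengths[(j + start) % world_size] += 1
    start = (start + redundant) % world_size

    print(size, lengths, "->", sum(lengths))
    # 10 [3, 3, 2, 2] -> 10
    # 7  [2, 1, 2, 2] -> 7
    # 5  [1, 2, 1, 1] -> 5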
@@ -0,0 +1,4 @@
torchrun --nproc_per_node=1 inference.py \
    --config "path to config file" \
    --load_dataset \
    --inference_save_path "path to save inference results"
@@ -0,0 +1,44 @@
{
    "language": "en",
    "category": {
        "brainstorming": {
            "GPT": [
                "language organization",
                "relevance",
                "creativity",
                "practicality",
                "reasonableness"
            ]
        },
        "chat": {
            "GPT": [
                "language organization",
                "naturalness",
                "engagingness",
                "fidelity"
            ]
        },
        "generation": {
            "GPT": [
                "language organization",
                "relevance",
                "diversity"
            ]
        },
        "open_qa": {
            "GPT": [
                "language organization",
                "relevance",
                "correctness"
            ]
        },
        "roleplay": {
            "GPT": [
                "language organization",
                "relevance",
                "fidelity",
                "creativity"
            ]
        }
    }
}
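eval.py below flattens this "category" section into a metrics_per_category dict before constructing the Evaluator. A minimal standalone sketch of that step (the file name config_en.json is a placeholder for a GPT-evaluation config like the one above):

import json

# Placeholder path; point at a GPT-evaluation config like the one above.
with open("config_en.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Mirrors the loop in eval.py: {category: {"GPT": [metric, ...]}}.
metrics_per_category = {
    category: dict(metric_types) for category, metric_types in config["category"].items()
}

for category, metric_types in metrics_per_category.items():
    print(category, "->", metric_types.get("GPT", []))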
@@ -0,0 +1,33 @@
{
    "model": [
        {
            "name": "model name",
            "model_class": "HuggingFaceCausalLM",
            "parameters": {
                "path": "path to model",
                "model_max_length": 4096,
                "tokenizer_path": "",
                "tokenizer_kwargs": {
                    "trust_remote_code": true
                },
                "peft_path": null,
                "model_kwargs": {
                    "torch_dtype": "torch.float32",
                    "trust_remote_code": true
                },
                "prompt_template": "plain",
                "batch_size": 4
            }
        }
    ],
    "dataset": [
        {
            "name": "colossal",
            "dataset_class": "ColossalDataset",
            "debug": false,
            "few_shot": false,
            "path": "../../configs/gpt_evaluation/data/eval_en_examples.json",
            "save_path": "path to save converted dataset (inference_data/colossal.json)"
        }
    ]
}
applications/ColossalEval/examples/gpt_evaluation/eval.py (new file, 139 lines)
@@ -0,0 +1,139 @@
import argparse
import os

import openai
from colossal_eval.evaluate.evaluator import Evaluator
from colossal_eval.utils import jload


def main(args):
    assert len(args.answer_file_list) == len(
        args.model_name_list
    ), "The number of answer files and model names should be equal!"

    # load config
    config = jload(args.config_file)

    if config["language"] in ["cn", "en"]:
        # get metric settings for all categories
        metrics_per_category = {}
        for category in config["category"].keys():
            metrics_all = {}
            for metric_type, metrics in config["category"][category].items():
                metrics_all[metric_type] = metrics
            metrics_per_category[category] = metrics_all

        battle_prompt = None
        if args.battle_prompt_file:
            battle_prompt = jload(args.battle_prompt_file)

        gpt_evaluation_prompt = None
        if args.gpt_evaluation_prompt_file:
            gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)

        if len(args.model_name_list) == 2 and not battle_prompt:
            raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")

        if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
            raise Exception(
                "No prompt file for GPT evaluation provided. Please specify the prompt file for GPT evaluation!"
            )

        if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
            raise Exception(
                "GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
            )

        # initialize evaluator
        evaluator = Evaluator(
            metrics_per_category,
            battle_prompt,
            gpt_evaluation_prompt,
            args.gpt_model,
            config["language"],
            args.gpt_with_reference,
        )
        if len(args.model_name_list) == 2:
            answers_1 = jload(args.answer_file_list[0])
            answers_2 = jload(args.answer_file_list[1])

            answers1 = []
            for category, value in answers_1.items():
                answers1.extend(value["data"])

            answers2 = []
            for category, value in answers_2.items():
                answers2.extend(value["data"])

            assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"

            evaluator.battle(answers1=answers1, answers2=answers2)
            evaluator.save(args.save_path, args.model_name_list)
        elif len(args.model_name_list) == 1:
            targets = jload(args.target_file)
            answers = jload(args.answer_file_list[0])

            references = []
            for category, value in targets["test"].items():
                references.extend(value["data"])

            predictions = []
            for category, value in answers.items():
                predictions.extend(value["data"])

            assert len(references) == len(
                predictions
            ), "The number of target answers and model answers should be equal!"

            evaluator.evaluate(
                answers=predictions, targets=references, save_path=args.save_path, model_name=args.model_name_list[0]
            )
            evaluator.save(args.save_path, args.model_name_list)
        else:
            raise ValueError("Unsupported number of answer files and model names!")
    else:
        raise ValueError(f'Unsupported language {config["language"]}!')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")
    parser.add_argument(
        "--config_file", type=str, default=None, required=True, help="path to the config file"
    )
    parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")
    parser.add_argument(
        "--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for GPT evaluation"
    )
    parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")
    parser.add_argument(
        "--answer_file_list",
        type=str,
        nargs="+",
        default=[],
        required=True,
        help="path to the answer files of at most 2 models",
    )
    parser.add_argument(
        "--model_name_list", type=str, nargs="+", default=[], required=True, help="the names of at most 2 models"
    )
    parser.add_argument(
        "--gpt_model",
        default="gpt-3.5-turbo-16k",
        choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4"],
        help="which GPT model to use for evaluation",
    )
    parser.add_argument(
        "--gpt_with_reference",
        default=False,
        action="store_true",
        help="whether to include the reference answer in GPT evaluation",
    )
    parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")
    parser.add_argument("--openai_key", type=str, default=None, required=True, help="your OpenAI API key")
    args = parser.parse_args()

    if args.openai_key is not None:
        os.environ["OPENAI_API_KEY"] = args.openai_key
        openai.api_key = os.getenv("OPENAI_API_KEY")

    main(args)
@@ -0,0 +1,9 @@
python eval.py \
    --config_file "path to the config file" \
    --battle_prompt_file "path to the prompt file for battle" \
    --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
    --target_file "path to the target answer file" \
    --answer_file_list "path to the answer files of at most 2 models" \
    --model_name_list "the names of at most 2 models" \
    --save_path "path to save results" \
    --openai_key "your openai key"
applications/ColossalEval/examples/gpt_evaluation/inference.py (new file, 171 lines)
@@ -0,0 +1,171 @@
import argparse
import copy
import os
from typing import Dict, List

import torch
import torch.distributed as dist
from colossal_eval import dataset, models, utils

import colossalai
from colossalai.logging import get_dist_logger

logger = get_dist_logger()


def rm_and_merge(world_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
    """
    Remove the per-rank inference results and merge them into one file.

    Args:
        world_size: Number of processes for inference.
        save_path: The folder for storing inference results.
        model_names: Names of models for inference.
        dataset_names: Names of datasets for inference.

    """

    for model_name in model_names:
        for dataset_name, categories in dataset_names.items():
            all_answers = {}
            for category in categories:
                all_answers[category] = {"data": []}
                answers = {"data": []}

                for r in range(world_size):
                    file_path = os.path.join(
                        save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
                    )
                    if not os.path.exists(file_path):
                        raise Exception(
                            f"File {file_path} not found. There may have been an error during inference."
                        )
                    else:
                        rank_answers = utils.jload(file_path)
                        answers["data"].extend(rank_answers["data"])
                        answers["inference_kwargs"] = rank_answers["inference_kwargs"]

                for r in range(world_size):
                    try:
                        file_path = os.path.join(
                            save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
                        )
                        os.remove(file_path)
                    except Exception as e:
                        print(e)

                all_answers[category] = answers

            logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
            utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))

        logger.info(f"Save inference results of model {model_name} for all datasets.")
    logger.info("Save inference results of all models for all datasets.")


def main(args):
    colossalai.launch_from_torch(config={}, seed=42)
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    inference_data = {}
    debug_args = {}
    few_shot_args = {}

    config = utils.jload(args.config)

    model_parameters = config["model"]
    dataset_parameters = config["dataset"]

    for dataset_parameter in dataset_parameters:
        path = dataset_parameter["path"]
        save_path = dataset_parameter["save_path"]
        dataset_name = dataset_parameter["name"]
        debug_args[dataset_name] = dataset_parameter["debug"]
        few_shot_args[dataset_name] = dataset_parameter["few_shot"]

        if not args.load_dataset:
            if os.path.exists(save_path):
                dataset_ = utils.jload(save_path)
                inference_data[dataset_name] = dataset_["test"]
            else:
                raise Exception(
                    "Can't find the converted dataset. You may pass --load_dataset to convert and save the dataset first."
                )

            continue

        dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
        if not issubclass(dataset_class, dataset.BaseDataset):
            raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")

        dataset_ = dataset_class(path, logger, dataset_parameter["few_shot"])

        dataset_.save(save_path)
        inference_data[dataset_name] = dataset_.dataset["test"]

    for model_parameter in model_parameters:
        model_name = model_parameter["name"]
        model_class = eval(f"models.{model_parameter['model_class']}")
        parameters = model_parameter["parameters"]
        parameters.update({"logger": logger})
        parameters.update({"prompt_template": utils.prompt_templates[parameters["prompt_template"]]})

        if not issubclass(model_class, models.BaseModel):
            raise ValueError(f"Model class {model_parameter['model_class']} is not a subclass of BaseModel.")
        model_ = model_class(**parameters)

        for dataset_name, split_data in inference_data.items():
            start = 0
            for category, category_data in split_data.items():
                if few_shot_args[dataset_name] and category_data["inference_kwargs"].get("few_shot_data", None) is None:
                    raise Exception(f"Dataset {dataset_name} doesn't have few-shot data for category {category}!")

                answers_to_dump = copy.deepcopy(category_data)
                partition_size = len(category_data["data"]) // world_size
                redundant = len(category_data["data"]) % world_size

                # Ensure that the amount of data for inference is as consistent as possible across different processes.
                lengths = [partition_size for _ in range(world_size)]
                for j in range(redundant):
                    lengths[(j + start) % world_size] += 1

                start = (start + redundant) % world_size

                questions = category_data["data"][sum(lengths[0:rank]) : sum(lengths[0:rank]) + lengths[rank]]

                answers_per_rank = model_.inference(
                    questions, inference_kwargs=category_data["inference_kwargs"], debug=debug_args[dataset_name]
                )

                answers_to_dump["data"] = answers_per_rank

                utils.jdump(
                    answers_to_dump,
                    os.path.join(
                        args.inference_save_path,
                        model_name,
                        f"{dataset_name}_{category}_inference_results_rank{rank}.json",
                    ),
                )

        logger.info(f"Rank {rank} peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB")

        del model_
        torch.cuda.empty_cache()

    dist.barrier()
    if rank == 0:
        model_names = [model_parameter["name"] for model_parameter in model_parameters]
        dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
        rm_and_merge(world_size, args.inference_save_path, model_names, dataset_names)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="ColossalEval inference process.")
    parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
    parser.add_argument("--load_dataset", default=False, action="store_true")
    parser.add_argument("--inference_save_path", type=str, default=None, help="path to save inference results")
    args = parser.parse_args()

    main(args)
@@ -0,0 +1,4 @@
torchrun --nproc_per_node=1 inference.py \
    --config "path to config file" \
    --load_dataset \
    --inference_save_path "path to save inference results"