[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786)

* Add ColossalEval

* Delete evaluate in Chat

---------

Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
This commit is contained in:
Yuanchen
2023-09-24 23:14:11 +08:00
committed by GitHub
parent 74aa7d964a
commit ce777853ae
60 changed files with 5314 additions and 2497 deletions

View File

@@ -0,0 +1,122 @@
import json
import os
import re
from copy import deepcopy
from typing import Dict, List
from colossalai.logging import DistributedLogger
from .base import BaseDataset
# Subjects handled as multi-choice by GaoKaoBenchDataset.load: their answers are
# joined into a single string, and no "all_classes" list is extracted for them.
multi_choice_datasets = [
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "English Fill in Blanks",
    "English Reading Comp",
    "Geography MCQs",
    "Physics MCQs",
    "English Cloze Test",
]
# Subjects whose inference "language" argument is set to "Chinese" by the loader.
chinese_qa_datasets = [
    "Biology MCQs",
    "Chemistry MCQs",
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "Geography MCQs",
    "History MCQs",
    "Math I MCQs",
    "Math II MCQs",
    "Physics MCQs",
    "Political Science MCQs",
]
# Subjects whose inference "language" argument is set to "English" by the loader.
english_qa_datasets = [
    "English MCQs",
    "English Fill in Blanks",
    "English Reading Comp",
    "English Cloze Test",
]
# Baseline inference arguments. GaoKaoBenchDataset.load deep-copies this dict for
# every subject and then overrides "all_classes" and "language" where applicable.
# NOTE(review): the remaining keys are consumed outside this file — confirm their
# exact meaning against the inference code before changing the defaults.
default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": None,  # replaced by a list of option letters for some MCQ subjects
    "language": "Chinese",  # switched to "English" for the English subjects
    "pretrain": False,
    "max_new_tokens": 32,
}
def get_all_classes(instruction: str):
    """
    Collect the answer-option letters that appear in a question.

    Every ASCII uppercase letter found in ``instruction`` is gathered, and the
    longest run "A", "B", "C", ... that is fully present is returned. The run
    stops at the first missing letter, so letters after a gap are ignored.

    Args:
        instruction: Question text to scan.

    Returns:
        The option letters as single-character strings in alphabetical order;
        an empty list when "A" does not occur in ``instruction``.
    """
    option_pattern = r"([A-Z]\. |[A-Z]|[A-Z]\.)"
    # Only the leading letter of each match matters, whatever punctuation follows it.
    found_letters = {match[0] for match in re.findall(option_pattern, instruction)}

    classes = []
    for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        if letter not in found_letters:
            # First gap in the alphabet ends the run of valid options.
            break
        classes.append(letter)
    return classes
class GaoKaoBenchDataset(BaseDataset):
    """
    Dataset class for GAOKAO-Bench dataset.
    Data source: https://github.com/OpenLMLab/GAOKAO-Bench/tree/main/data
    This dataset class will convert the original dataset into the inference dataset.

    A few typos in the original dataset had to be corrected manually; some of the following have been fixed.
    Issue link: https://github.com/OpenLMLab/GAOKAO-Bench/issues/20
    1. Option C missing in index 111 in 2010-2022_Chemistry_MCQs.json
    2. Option B missing "." after it in index 16 in 2012-2022_English_Cloze_Test.json
    3. Option G missing "." after it in index 23 in 2012-2022_English_Cloze_Test.json
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        """
        Load every GAOKAO-Bench subject file under ``path/data`` and convert it to inference samples.

        Args:
            path: Root directory of the dataset; it must contain ``data/<category>/``
                for each of the three question categories iterated below.
            logger: Not used by this loader.
            few_shot: Not used by this loader.

        Returns:
            A nested dict of the form
            ``{"test": {subject: {"data": [sample, ...], "inference_kwargs": {...}}}}``.
            NOTE(review): the ``List[Dict]`` annotation does not match this shape; it is
            left unchanged, presumably to mirror the base-class signature — confirm.
        """
        dataset = {"test": {}}
        for category in ["Fill-in-the-blank_Questions", "Multiple-choice_Questions", "Open-ended_Questions"]:
            category_dir = os.path.join(path, "data", category)
            # os.listdir returns every directory entry (e.g. a stray ".DS_Store"),
            # so keep only the JSON subject files; sorting keeps the order deterministic.
            file_names = sorted(name for name in os.listdir(category_dir) if name.lower().endswith(".json"))

            for file_name in file_names:
                # Strip the 10-character prefix (e.g. "2010-2022_") and the ".json" suffix,
                # then turn the underscores into spaces to get the subject name.
                subject = " ".join(file_name[10:-5].split("_"))
                dataset["test"][subject] = {"data": []}

                # Only the parsing needs the file handle; release it before processing.
                with open(os.path.join(category_dir, file_name), encoding="utf-8") as f:
                    data = json.load(f)
                examples = data["example"]

                # It's been tested that each data sample in one subcategory have same inference arguments.
                inference_kwargs = deepcopy(default_inference_kwargs)
                if category == "Multiple-choice_Questions" and subject not in multi_choice_datasets and examples:
                    # The first question is taken as representative of the subject's option letters.
                    inference_kwargs["all_classes"] = get_all_classes(examples[0]["question"])
                if subject in english_qa_datasets:
                    inference_kwargs["language"] = "English"
                if subject in chinese_qa_datasets:
                    inference_kwargs["language"] = "Chinese"

                dataset["test"][subject]["inference_kwargs"] = inference_kwargs

                for sample in examples:
                    answer = sample["answer"]

                    # Convert multi-choice answers to a single string.
                    # We will convert it back when evaluating.
                    # We do this because if target is a list, it should be only used for multiple target answers.
                    if subject in multi_choice_datasets:
                        answer = "".join(answer)

                    # A one-element list carries a single target, so unwrap it.
                    if isinstance(answer, list) and len(answer) == 1:
                        answer = answer[0]

                    data_sample = {
                        "dataset": "gaokaobench",
                        "split": "test",
                        # category[:-10] drops the trailing "_Questions".
                        "category": f"{category[:-10]}-{subject}",
                        "instruction": sample["question"].strip() + "\n答案:",
                        "input": "",
                        "output": "",
                        "target": answer,
                    }

                    dataset["test"][subject]["data"].append(data_sample)

        return dataset