Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-17 07:00:37 +00:00
[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786)
* Add ColossalEval
* Delete evaluate in Chat

---------

Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
applications/ColossalEval/colossal_eval/dataset/__init__.py (new file)
@@ -0,0 +1,19 @@
from .agieval import AGIEvalDataset
from .base import BaseDataset
from .ceval import CEvalDataset
from .cmmlu import CMMLUDataset
from .colossalai import ColossalDataset
from .gaokaobench import GaoKaoBenchDataset
from .longbench import LongBenchDataset
from .mmlu import MMLUDataset

__all__ = [
    "AGIEvalDataset",
    "BaseDataset",
    "CEvalDataset",
    "CMMLUDataset",
    "GaoKaoBenchDataset",
    "LongBenchDataset",
    "MMLUDataset",
    "ColossalDataset",
]
applications/ColossalEval/colossal_eval/dataset/agieval.py (new file)
@@ -0,0 +1,247 @@
# Adapted from https://github.com/ruixiangcui/AGIEval/blob/main/src/dataset_loader.py.

import ast
import glob
import os
from copy import deepcopy
from typing import Dict, List

import pandas as pd
from colossal_eval.utils import get_json_list

from colossalai.logging import DistributedLogger

from .base import BaseDataset

# define the datasets
english_qa_datasets = [
    "lsat-ar",
    "lsat-lr",
    "lsat-rc",
    "logiqa-en",
    "sat-math",
    "sat-en",
    "aqua-rat",
    "sat-en-without-passage",
    "gaokao-english",
]
chinese_qa_datasets = [
    "logiqa-zh",
    "jec-qa-kd",
    "jec-qa-ca",
    "gaokao-chinese",
    "gaokao-geography",
    "gaokao-history",
    "gaokao-biology",
    "gaokao-chemistry",
    "gaokao-physics",
    "gaokao-mathqa",
]
english_cloze_datasets = ["math"]
chinese_cloze_datasets = ["gaokao-mathcloze"]

multi_choice_datasets = ["jec-qa-kd", "jec-qa-ca", "gaokao-physics", "gaokao-mathqa"]
math_output_datasets = {"gaokao-mathcloze", "math"}

default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": None,
    "language": "Chinese",
    "pretrain": False,
    "max_new_tokens": 32,
}


def get_prompt(line: Dict, dataset_name: str, logger: DistributedLogger) -> Dict:
    """Modified from https://github.com/microsoft/AGIEval/blob/main/src/dataset_loader.py#L190"""
    try:
        all_classes = None
        passage = line["passage"] if line["passage"] is not None else ""

        if dataset_name in english_qa_datasets:
            option_string = "ABCDEFG"
            count = len(line["options"])

            input = (
                "Question: "
                + line["question"]
                + " "
                + "Choose from the following options: "
                + " ".join(line["options"])
                + "\n"
                + "Answer: "
            )

            all_classes = list(option_string[0:count])

        elif dataset_name in chinese_qa_datasets:
            option_string = "ABCDEFG"
            count = len(line["options"])

            input = "问题:" + line["question"] + " " + "从以下选项中选择:" + " ".join(line["options"]) + "\n" + "答案:"

            all_classes = list(option_string[0:count])

        elif dataset_name in english_cloze_datasets:
            input = "Question: " + line["question"] + "\n" + "Answer: "

        elif dataset_name in chinese_cloze_datasets:
            input = "问题:" + line["question"] + "\n" + "答案:"

        return {
            "instruction": input if not passage else passage + "\n\n" + input,
            "target": line["label"] if line["label"] else line["answer"],
        }, all_classes

    except NameError:
        logger.info("Dataset not defined.")


# process few-shot raw_prompts
def combine_prompt(prompt_path, dataset_name, load_explanation=True, chat_mode=False):
    skip_passage = False
    if dataset_name == "sat-en-without-passage":
        skip_passage = True
        dataset_name = "sat-en"
    demonstrations = []
    # read the prompts by context and explanation
    context_row = [0, 1, 3, 5, 7, 9]
    explanation_row = [0, 2, 4, 6, 8, 10]
    raw_prompts_context = pd.read_csv(
        prompt_path, header=0, skiprows=lambda x: x not in context_row, keep_default_na=False
    )
    raw_prompts_explanation = pd.read_csv(
        prompt_path, header=0, skiprows=lambda x: x not in explanation_row, keep_default_na=False
    ).replace(r"\n\n", "\n", regex=True)
    contexts = []
    for line in list(raw_prompts_context[dataset_name]):
        if line:
            contexts.append(ast.literal_eval(line))
    explanations = [exp for exp in raw_prompts_explanation[dataset_name] if exp]

    for idx, (con, exp) in enumerate(zip(contexts, explanations)):
        passage = con["passage"] if con["passage"] is not None and not skip_passage else ""
        question = con["question"]
        options = con["options"] if con["options"] is not None else ""
        label = con["label"] if con["label"] is not None else ""
        answer = con["answer"] if "answer" in con and con["answer"] is not None else ""

        if dataset_name in english_qa_datasets:
            question_input = (
                "Question: "
                + passage
                + " "
                + question
                + "\n"
                + "Choose from the following options: "
                + " ".join(options)
                + "\n"
                + "Answer: {}".format(label)
            )
        elif dataset_name in chinese_qa_datasets:
            question_input = (
                "问题:" + passage + " " + question + "\n" + "从以下选项中选择:" + " ".join(options) + "\n" + "答案:{}".format(label)
            )
        elif dataset_name in english_cloze_datasets:
            question_input = "Question: " + question + "\n" + "Answer: {}".format(answer)
        elif dataset_name in chinese_cloze_datasets:
            question_input = "问题:" + question + "\n" + "答案:{}".format(answer)
        else:
            raise ValueError(f"During loading few-shot examples, found unknown dataset: {dataset_name}")

        if chat_mode:
            demonstrations.append((question_input,))
        else:
            demonstrations.append(question_input + "\n")

    return demonstrations


class AGIEvalDataset(BaseDataset):
    """
    Dataset wrapper for the AGIEval dataset.
    Data source: https://github.com/microsoft/AGIEval
    This dataset class will convert the original dataset into the inference dataset.

    A few dirty data samples in the original dataset needed to be corrected manually:
    Issue link: https://github.com/microsoft/AGIEval/issues/16
    1. Invalid options in line 190 in gaokao-chemistry.jsonl.
    2. Option D (They may increase in value as those same resources become rare on Earth.) missing in line 17 in sat-en-without-passage.jsonl.
    3. Option D (They may increase in value as those same resources become rare on Earth.) missing in line 17 in sat-en.jsonl.
    4. Option D (No, because the data do not indicate whether the honeybees had been infected with mites.) missing in line 57 in sat-en-without-passage.jsonl.
    5. Option D (No, because the data do not indicate whether the honeybees had been infected with mites.) missing in line 57 in sat-en.jsonl.
    6. Option D (Published theories of scientists who developed earlier models of the Venus flytrap) missing in line 98 in sat-en-without-passage.jsonl.
    7. Option D (Published theories of scientists who developed earlier models of the Venus flytrap) missing in line 98 in sat-en.jsonl.
    8. Label is empty in line 212 in jec-qa-kd.jsonl. Content is also dirty.
    9. gaokao-mathqa.jsonl is actually also a multi-choice dataset; see lines 149, 286 and 287.
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        dataset = {"test": {}}

        files = glob.glob(os.path.join(path, "*.jsonl"))
        files.sort()

        if few_shot:
            prompt_path = os.path.join(path, "few_shot_prompts.csv")

        for file in files:
            dataset_name = os.path.basename(file)[0 : -len(".jsonl")]

            few_shot_data = []
            if few_shot:
                # process the demo once if it is few-shot-CoT
                few_shot_data = combine_prompt(prompt_path, dataset_name, load_explanation=False, chat_mode=False)

            dataset["test"][dataset_name] = {"data": []}

            # glob already returns the full path
            file_dir = file

            loaded_jsonl = get_json_list(file_dir)

            # It's been tested that all data samples in one subcategory share the same inference arguments.
            _, all_classes = get_prompt(loaded_jsonl[0], dataset_name, logger)
            inference_kwargs = deepcopy(default_inference_kwargs)
            if all_classes is not None and dataset_name not in multi_choice_datasets:
                inference_kwargs["all_classes"] = all_classes

            if dataset_name in english_qa_datasets:
                inference_kwargs["language"] = "English"
            if dataset_name in chinese_qa_datasets:
                inference_kwargs["language"] = "Chinese"
            inference_kwargs["few_shot_data"] = few_shot_data

            dataset["test"][dataset_name]["inference_kwargs"] = inference_kwargs

            for line in loaded_jsonl:
                info, all_classes = get_prompt(line, dataset_name, logger)

                # Convert multi-choice answers to a single string.
                # We will convert it back when evaluating.
                # We do this because a list target should only be used for multiple target answers.
                if dataset_name in multi_choice_datasets:
                    if isinstance(info["target"], str) and len(info["target"]) > 1:
                        # "gaokao-mathqa" actually contains multi-choice questions.
                        # This if clause handles that case specifically.
                        info["target"] = "".join(info["target"].split())
                    else:
                        info["target"] = "".join(info["target"])

                if isinstance(info["target"], list) and len(info["target"]) == 1:
                    info["target"] = info["target"][0]

                data_sample = {
                    "dataset": "agieval",
                    "split": "test",
                    "category": dataset_name,
                    "instruction": info["instruction"],
                    "input": "",
                    "output": "",
                    "target": info["target"],
                }

                dataset["test"][dataset_name]["data"].append(data_sample)

        return dataset
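For illustration, here is what `get_prompt` produces for an English QA record. The sample below is hypothetical and only mirrors the AGIEval jsonl schema (passage/question/options/label/answer):

# A made-up record in the AGIEval jsonl schema.
sample = {
    "passage": None,
    "question": "Which option is correct?",
    "options": ["(A) first", "(B) second", "(C) third", "(D) fourth"],
    "label": "B",
    "answer": None,
}

info, all_classes = get_prompt(sample, "lsat-ar", logger=None)
# info["instruction"]: "Question: Which option is correct? Choose from the following
#                       options: (A) first (B) second (C) third (D) fourth\nAnswer: "
# info["target"]: "B"
# all_classes: ["A", "B", "C", "D"]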
applications/ColossalEval/colossal_eval/dataset/base.py (new file)
@@ -0,0 +1,24 @@
from abc import abstractstaticmethod

from colossal_eval.utils import jdump


class BaseDataset:
    """
    Base class for dataset wrappers.

    Args:
        path: The path to the original dataset.
        logger: Logger for the dataset.
        few_shot: Whether to load few-shot examples.
    """

    def __init__(self, path, logger, few_shot):
        self.dataset = self.load(path, logger, few_shot)

    def save(self, save_path):
        """Save the converted dataset."""
        jdump(self.dataset, save_path)

    @abstractstaticmethod
    def load(path, logger, few_shot):
        """Load the original dataset and convert it into the inference dataset."""
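As a minimal sketch of extending this base class (the `MyDataset` name and its trivial payload are illustrative, not part of this commit), a subclass only has to implement `load` and inherits `save`:

from typing import Dict


class MyDataset(BaseDataset):
    """Illustrative subclass; load() returns the converted inference dataset."""

    @staticmethod
    def load(path: str, logger, few_shot: bool) -> Dict:
        # A real implementation would read the files under `path` and build
        # {"split": {category: {"data": [...], "inference_kwargs": {...}}}}.
        return {"test": {"demo": {"data": [], "inference_kwargs": {}}}}


ds = MyDataset("path/to/data", logger=None, few_shot=False)
ds.save("inference/my_dataset.json")  # jdump writes the dict as JSON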
applications/ColossalEval/colossal_eval/dataset/ceval.py (new file)
@@ -0,0 +1,132 @@
import copy
import csv
import os
from typing import Dict, List

from colossalai.logging import DistributedLogger

from .base import BaseDataset

ceval_subject_mapping = {
    "computer_network": ["Computer Network", "计算机网络", "STEM"],
    "operating_system": ["Operating System", "操作系统", "STEM"],
    "computer_architecture": ["Computer Architecture", "计算机组成", "STEM"],
    "college_programming": ["College Programming", "大学编程", "STEM"],
    "college_physics": ["College Physics", "大学物理", "STEM"],
    "college_chemistry": ["College Chemistry", "大学化学", "STEM"],
    "advanced_mathematics": ["Advanced Mathematics", "高等数学", "STEM"],
    "probability_and_statistics": ["Probability and Statistics", "概率统计", "STEM"],
    "discrete_mathematics": ["Discrete Mathematics", "离散数学", "STEM"],
    "electrical_engineer": ["Electrical Engineer", "注册电气工程师", "STEM"],
    "metrology_engineer": ["Metrology Engineer", "注册计量师", "STEM"],
    "high_school_mathematics": ["High School Mathematics", "高中数学", "STEM"],
    "high_school_physics": ["High School Physics", "高中物理", "STEM"],
    "high_school_chemistry": ["High School Chemistry", "高中化学", "STEM"],
    "high_school_biology": ["High School Biology", "高中生物", "STEM"],
    "middle_school_mathematics": ["Middle School Mathematics", "初中数学", "STEM"],
    "middle_school_biology": ["Middle School Biology", "初中生物", "STEM"],
    "middle_school_physics": ["Middle School Physics", "初中物理", "STEM"],
    "middle_school_chemistry": ["Middle School Chemistry", "初中化学", "STEM"],
    "veterinary_medicine": ["Veterinary Medicine", "兽医学", "STEM"],
    "college_economics": ["College Economics", "大学经济学", "Social Science"],
    "business_administration": ["Business Administration", "工商管理", "Social Science"],
    "marxism": ["Marxism", "马克思主义基本原理", "Social Science"],
    "mao_zedong_thought": ["Mao Zedong Thought", "毛泽东思想和中国特色社会主义理论体系概论", "Social Science"],
    "education_science": ["Education Science", "教育学", "Social Science"],
    "teacher_qualification": ["Teacher Qualification", "教师资格", "Social Science"],
    "high_school_politics": ["High School Politics", "高中政治", "Social Science"],
    "high_school_geography": ["High School Geography", "高中地理", "Social Science"],
    "middle_school_politics": ["Middle School Politics", "初中政治", "Social Science"],
    "middle_school_geography": ["Middle School Geography", "初中地理", "Social Science"],
    "modern_chinese_history": ["Modern Chinese History", "近代史纲要", "Humanities"],
    "ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "思想道德修养与法律基础", "Humanities"],
    "logic": ["Logic", "逻辑学", "Humanities"],
    "law": ["Law", "法学", "Humanities"],
    "chinese_language_and_literature": ["Chinese Language and Literature", "中国语言文学", "Humanities"],
    "art_studies": ["Art Studies", "艺术学", "Humanities"],
    "professional_tour_guide": ["Professional Tour Guide", "导游资格", "Humanities"],
    "legal_professional": ["Legal Professional", "法律职业资格", "Humanities"],
    "high_school_chinese": ["High School Chinese", "高中语文", "Humanities"],
    "high_school_history": ["High School History", "高中历史", "Humanities"],
    "middle_school_history": ["Middle School History", "初中历史", "Humanities"],
    "civil_servant": ["Civil Servant", "公务员", "Other"],
    "sports_science": ["Sports Science", "体育学", "Other"],
    "plant_protection": ["Plant Protection", "植物保护", "Other"],
    "basic_medicine": ["Basic Medicine", "基础医学", "Other"],
    "clinical_medicine": ["Clinical Medicine", "临床医学", "Other"],
    "urban_and_rural_planner": ["Urban and Rural Planner", "注册城乡规划师", "Other"],
    "accountant": ["Accountant", "注册会计师", "Other"],
    "fire_engineer": ["Fire Engineer", "注册消防工程师", "Other"],
    "environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "环境影响评价工程师", "Other"],
    "tax_accountant": ["Tax Accountant", "税务师", "Other"],
    "physician": ["Physician", "医师资格", "Other"],
}

default_inference_kwargs = {
    "calculate_loss": False,
    "all_classes": ["A", "B", "C", "D"],
    "language": "Chinese",
    "pretrain": False,
    "max_new_tokens": 32,
}


def get_few_shot_data(data: List[Dict]):
    few_shot_data = []
    for i in data:
        few_shot_data.append(i["input"] + i["target"])
    return few_shot_data


class CEvalDataset(BaseDataset):
    """
    Dataset class for the CEval dataset.
    Data source: https://huggingface.co/datasets/ceval/ceval-exam
    This dataset class will convert the original dataset into the inference dataset.
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        dataset = {"dev": {}, "test": {}}
        for split in ["dev", "test"]:
            files = os.listdir(os.path.join(path, split))
            files.sort()

            for file in files:
                subject = file[0 : -len(f"_{split}.csv")]
                subject = ceval_subject_mapping[subject][1]

                file_dir = os.path.join(path, split, file)

                dataset[split][subject] = {"data": []}

                # It's been tested that all data samples in one subcategory share the same inference arguments.
                dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)

                if split == "test" and few_shot:
                    dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
                        dataset["dev"][subject]["data"]
                    )

                with open(file_dir, encoding="utf-8") as f:
                    reader = csv.reader(f)
                    _ = next(reader)  # skip the header row
                    for row in reader:
                        # The dev split has answer and explanation columns, so len(row) is 8,
                        # while the test split doesn't, so len(row) is 6.
                        assert len(row) >= 6
                        choices = f"A. {row[2]}\nB. {row[3]}\nC. {row[4]}\nD. {row[5]}"
                        data_sample = {
                            "dataset": "ceval",
                            "split": split,
                            "category": subject,
                            "instruction": f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。",
                            "input": f"题目:{row[1]}\n{choices}\n答案:",
                            "output": "",
                            "target": row[6] if split == "dev" else "",
                            "id": int(row[0]),
                        }

                        dataset[split][subject]["data"].append(data_sample)

        return dataset
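To make the few-shot mechanics concrete: `get_few_shot_data` concatenates each dev sample's `input` with its `target`, and the resulting strings can be prepended to a test question by the inference code. A self-contained sketch with made-up data (the final joining step is an assumption; the actual prompt assembly lives in the inference pipeline, not in this file):

dev_data = [
    {"input": "题目:1+1等于几?\nA. 1\nB. 2\nC. 3\nD. 4\n答案:", "target": "B"},
    {"input": "题目:2+2等于几?\nA. 3\nB. 4\nC. 5\nD. 6\n答案:", "target": "B"},
]

few_shot = get_few_shot_data(dev_data)
# few_shot[0] == "题目:1+1等于几?\nA. 1\nB. 2\nC. 3\nD. 4\n答案:B"

test_input = "题目:3+3等于几?\nA. 5\nB. 6\nC. 7\nD. 8\n答案:"
prompt = "\n\n".join(few_shot + [test_input])  # one plausible assembly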
applications/ColossalEval/colossal_eval/dataset/cmmlu.py (new file)
@@ -0,0 +1,144 @@
import copy
import csv
import os
from typing import Dict, List

from colossalai.logging import DistributedLogger

from .base import BaseDataset

cmmlu_subject_mapping = {
    "agronomy": "农学",
    "anatomy": "解剖学",
    "ancient_chinese": "古汉语",
    "arts": "艺术学",
    "astronomy": "天文学",
    "business_ethics": "商业伦理",
    "chinese_civil_service_exam": "中国公务员考试",
    "chinese_driving_rule": "中国驾驶规则",
    "chinese_food_culture": "中国饮食文化",
    "chinese_foreign_policy": "中国外交政策",
    "chinese_history": "中国历史",
    "chinese_literature": "中国文学",
    "chinese_teacher_qualification": "中国教师资格",
    "clinical_knowledge": "临床知识",
    "college_actuarial_science": "大学精算学",
    "college_education": "大学教育学",
    "college_engineering_hydrology": "大学工程水文学",
    "college_law": "大学法律",
    "college_mathematics": "大学数学",
    "college_medical_statistics": "大学医学统计",
    "college_medicine": "大学医学",
    "computer_science": "计算机科学",
    "computer_security": "计算机安全",
    "conceptual_physics": "概念物理学",
    "construction_project_management": "建设工程管理",
    "economics": "经济学",
    "education": "教育学",
    "electrical_engineering": "电气工程",
    "elementary_chinese": "小学语文",
    "elementary_commonsense": "小学常识",
    "elementary_information_and_technology": "小学信息技术",
    "elementary_mathematics": "初等数学",
    "ethnology": "民族学",
    "food_science": "食品科学",
    "genetics": "遗传学",
    "global_facts": "全球事实",
    "high_school_biology": "高中生物",
    "high_school_chemistry": "高中化学",
    "high_school_geography": "高中地理",
    "high_school_mathematics": "高中数学",
    "high_school_physics": "高中物理学",
    "high_school_politics": "高中政治",
    "human_sexuality": "人类性行为",
    "international_law": "国际法学",
    "journalism": "新闻学",
    "jurisprudence": "法理学",
    "legal_and_moral_basis": "法律与道德基础",
    "logical": "逻辑学",
    "machine_learning": "机器学习",
    "management": "管理学",
    "marketing": "市场营销",
    "marxist_theory": "马克思主义理论",
    "modern_chinese": "现代汉语",
    "nutrition": "营养学",
    "philosophy": "哲学",
    "professional_accounting": "专业会计",
    "professional_law": "专业法学",
    "professional_medicine": "专业医学",
    "professional_psychology": "专业心理学",
    "public_relations": "公共关系",
    "security_study": "安全研究",
    "sociology": "社会学",
    "sports_science": "体育学",
    "traditional_chinese_medicine": "中医中药",
    "virology": "病毒学",
    "world_history": "世界历史",
    "world_religions": "世界宗教",
}

default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": ["A", "B", "C", "D"],
    "language": "Chinese",
    "pretrain": False,
    "max_new_tokens": 32,
}


def get_few_shot_data(data: List[Dict]):
    few_shot_data = []
    for i in data:
        few_shot_data.append(i["input"] + i["target"])
    return few_shot_data


class CMMLUDataset(BaseDataset):
    """
    Dataset class for the CMMLU dataset.
    Data source: https://github.com/haonan-li/CMMLU/tree/master/data
    This dataset class will convert the original dataset into the inference dataset.
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        dataset = {"dev": {}, "test": {}}
        for split in ["dev", "test"]:
            files = os.listdir(os.path.join(path, split))
            files.sort()

            for file in files:
                subject = file[0 : -len(".csv")]
                subject = cmmlu_subject_mapping[subject]

                file_dir = os.path.join(path, split, file)

                dataset[split][subject] = {"data": []}

                # It's been tested that all data samples in one subcategory share the same inference arguments.
                dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)

                if split == "test" and few_shot:
                    dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
                        dataset["dev"][subject]["data"]
                    )

                with open(file_dir, encoding="utf-8") as f:
                    reader = csv.reader(f)
                    _ = next(reader)  # skip the header row
                    for row in reader:
                        assert len(row) == 7
                        choices = f"A. {row[2]}\nB. {row[3]}\nC. {row[4]}\nD. {row[5]}"
                        data_sample = {
                            "dataset": "cmmlu",
                            "split": split,
                            "category": subject,
                            "instruction": f"以下是关于{subject}的单项选择题,请直接给出正确答案的选项。",
                            "input": f"题目:{row[1]}\n{choices}\n答案:",
                            "output": "",
                            "target": row[6],
                        }

                        dataset[split][subject]["data"].append(data_sample)

        return dataset
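CMMLU enables `calculate_loss` with fixed classes A-D. Under a loss-based scheme, the evaluator scores each candidate answer and picks the class whose continuation has the lowest language-model loss; a minimal sketch, assuming a `loss_fn(prompt, candidate)` scorer that is not part of this commit:

def pick_answer_by_loss(prompt, all_classes, loss_fn):
    # loss_fn is a hypothetical callable returning the LM loss of `candidate`
    # continued from `prompt`; lower means the model prefers that answer.
    losses = {c: loss_fn(prompt, c) for c in all_classes}
    return min(losses, key=losses.get)

# e.g. pick_answer_by_loss("题目:...\n答案:", ["A", "B", "C", "D"], loss_fn)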
applications/ColossalEval/colossal_eval/dataset/colossalai.py (new file)
@@ -0,0 +1,70 @@
from collections import defaultdict
from copy import deepcopy
from typing import Dict, List

from colossal_eval.utils import jload

from colossalai.logging import DistributedLogger

from .base import BaseDataset

default_inference_kwargs = {
    "calculate_loss": False,
    "all_classes": None,
    "language": "Chinese",
    "pretrain": False,
    "max_new_tokens": 256,
}

# You can add your own subcategories here and specify whether each one is a
# single-choice question or has target answers and therefore needs loss calculation.
single_choice_question = set()
calculate_loss = set()


def get_data_per_category(data):
    data_per_category = defaultdict(list)
    for item in data:
        category = item["category"]
        data_per_category[category].append(item)

    return data_per_category


class ColossalDataset(BaseDataset):
    """
    Dataset class for the Colossal dataset.
    This dataset class will convert the original dataset into the inference dataset.
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        dataset = {"test": {}}
        data = jload(path)
        data_per_category = get_data_per_category(data)
        categories = list(data_per_category.keys())

        for category in categories:
            dataset["test"][category] = {"data": []}
            category_data = data_per_category[category]

            dataset["test"][category]["inference_kwargs"] = deepcopy(default_inference_kwargs)

            if category in calculate_loss:
                dataset["test"][category]["inference_kwargs"]["calculate_loss"] = True
            if category in single_choice_question:
                dataset["test"][category]["inference_kwargs"]["all_classes"] = ["A", "B", "C", "D"]

            for item in category_data:
                data_sample = {
                    "dataset": "colossal",
                    "split": "test",
                    "category": category,
                    "instruction": item["instruction"],
                    "input": item["input"],
                    "output": "",
                    "target": item["target"],
                    "id": item["id"],
                }
                dataset["test"][category]["data"].append(data_sample)

        return dataset
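`jload` reads the file in one shot and `get_data_per_category` groups records by their `category` field, so the expected input for `ColossalDataset` is a single JSON list of records carrying the fields consumed above. A hypothetical two-record input, written as the equivalent Python literal (category names and content are made up):

data = [
    {
        "id": 0,
        "category": "brainstorming",
        "instruction": "列举三种保持健康的方法。",
        "input": "",
        "target": "",
    },
    {
        "id": 1,
        "category": "classification",
        "instruction": "判断下面句子的情感倾向。",
        "input": "今天的天气真好。",
        "target": "积极",
    },
]

data_per_category = get_data_per_category(data)
# {"brainstorming": [<record 0>], "classification": [<record 1>]}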
applications/ColossalEval/colossal_eval/dataset/gaokaobench.py (new file)
@@ -0,0 +1,122 @@
import json
import os
import re
from copy import deepcopy
from typing import Dict, List

from colossalai.logging import DistributedLogger

from .base import BaseDataset

multi_choice_datasets = [
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "English Fill in Blanks",
    "English Reading Comp",
    "Geography MCQs",
    "Physics MCQs",
    "English Cloze Test",
]

chinese_qa_datasets = [
    "Biology MCQs",
    "Chemistry MCQs",
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "Geography MCQs",
    "History MCQs",
    "Math I MCQs",
    "Math II MCQs",
    "Physics MCQs",
    "Political Science MCQs",
]
english_qa_datasets = ["English MCQs", "English Fill in Blanks", "English Reading Comp", "English Cloze Test"]

default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": None,
    "language": "Chinese",
    "pretrain": False,
    "max_new_tokens": 32,
}


def get_all_classes(instruction: str):
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    pattern = r"([A-Z]\. |[A-Z].|[A-Z]\.)"
    options = sorted(list(set(re.findall(pattern, instruction))))
    options = sorted(list(set([string[0] for string in options])))

    # Truncate at the first gap in the A, B, C, ... sequence so that stray
    # capital letters later in the text don't inflate the class list.
    for i in range(len(options)):
        if options[i] != letters[i]:
            return options[0:i]
    return options


class GaoKaoBenchDataset(BaseDataset):
    """
    Dataset class for the GAOKAO-Bench dataset.
    Data source: https://github.com/OpenLMLab/GAOKAO-Bench/tree/main/data
    This dataset class will convert the original dataset into the inference dataset.

    A few typos in the original dataset needed to be corrected manually; some of the following have already been fixed.
    Issue link: https://github.com/OpenLMLab/GAOKAO-Bench/issues/20
    1. Option C missing in index 111 in 2010-2022_Chemistry_MCQs.json
    2. Option B missing "." after it in index 16 in 2012-2022_English_Cloze_Test.json
    3. Option G missing "." after it in index 23 in 2012-2022_English_Cloze_Test.json
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        dataset = {"test": {}}
        for category in ["Fill-in-the-blank_Questions", "Multiple-choice_Questions", "Open-ended_Questions"]:
            files = os.listdir(os.path.join(path, "data", category))
            files.sort()

            for file in files:
                # Strip the year-range prefix (e.g. "2010-2022_") and the ".json" suffix.
                subject = file[10:-5].split("_")
                subject = " ".join(subject)
                dataset["test"][subject] = {"data": []}

                file_dir = os.path.join(path, "data", category, file)

                with open(file_dir, encoding="utf-8") as f:
                    data = json.load(f)

                # It's been tested that all data samples in one subcategory share the same inference arguments.
                inference_kwargs = deepcopy(default_inference_kwargs)
                if category == "Multiple-choice_Questions" and subject not in multi_choice_datasets:
                    all_classes = get_all_classes(data["example"][0]["question"])
                    inference_kwargs["all_classes"] = all_classes
                if subject in english_qa_datasets:
                    inference_kwargs["language"] = "English"
                if subject in chinese_qa_datasets:
                    inference_kwargs["language"] = "Chinese"

                dataset["test"][subject]["inference_kwargs"] = inference_kwargs

                for sample in data["example"]:
                    # Convert multi-choice answers to a single string.
                    # We will convert it back when evaluating.
                    # We do this because a list target should only be used for multiple target answers.
                    if subject in multi_choice_datasets:
                        sample["answer"] = "".join(sample["answer"])

                    if isinstance(sample["answer"], list) and len(sample["answer"]) == 1:
                        sample["answer"] = sample["answer"][0]

                    data_sample = {
                        "dataset": "gaokaobench",
                        "split": "test",
                        # category[:-10] strips the "_Questions" suffix.
                        "category": f"{category[:-10]}-{subject}",
                        "instruction": sample["question"].strip() + "\n答案:",
                        "input": "",
                        "output": "",
                        "target": sample["answer"],
                    }

                    dataset["test"][subject]["data"].append(data_sample)

        return dataset
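`get_all_classes` pulls option letters out of the question text with a regex, deduplicates them, and truncates at the first gap in the A, B, C, ... sequence. Two illustrative calls (the strings are made up):

get_all_classes("下列说法正确的是\nA. 甲\nB. 乙\nC. 丙\nD. 丁")
# -> ["A", "B", "C", "D"]

get_all_classes("A. 甲 B. 乙 X. 干扰项")
# -> ["A", "B"]  ("X" breaks the sequence where "C" was expected, so it is dropped)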
applications/ColossalEval/colossal_eval/dataset/longbench.py (new file)
@@ -0,0 +1,120 @@
import os
from copy import deepcopy
from typing import Dict, List

from colossal_eval.utils import get_json_list

from colossalai.logging import DistributedLogger

from .base import BaseDataset

dataset2prompt = {
    "narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
    "qasper": 'You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
    "multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
    "multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:",
    "hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
    "2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
    "musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
    "dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:",
    "gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:",
    "qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:",
    "multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:",
    "vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:",
    "trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}",
    "triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}",
    "samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}",
    "lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}",
    "passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ",
    "passage_retrieval_en": 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
    "passage_retrieval_zh": '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
    "lcc": "Please complete the code given below. \n{context}Next line of code:\n",
    "repobench-p": "Please complete the code given below. \n{context}{input}Next line of code:\n",
}

dataset2maxlen = {
    "narrativeqa": 128,
    "qasper": 128,
    "multifieldqa_en": 64,
    "multifieldqa_zh": 64,
    "hotpotqa": 32,
    "2wikimqa": 32,
    "musique": 32,
    "dureader": 128,
    "gov_report": 512,
    "qmsum": 512,
    "multi_news": 512,
    "vcsum": 512,
    "trec": 64,
    "triviaqa": 32,
    "samsum": 128,
    "lsht": 64,
    "passage_count": 32,
    "passage_retrieval_en": 32,
    "passage_retrieval_zh": 32,
    "lcc": 64,
    "repobench-p": 64,
}

default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": None,
    "language": "Chinese",
    "pretrain": False,
    "max_new_tokens": 32,
}


class LongBenchDataset(BaseDataset):
    """
    Dataset class for the LongBench dataset.
    Data source: https://huggingface.co/datasets/THUDM/LongBench
    This dataset class will convert the original dataset into the inference dataset.

    Issue link: https://github.com/THUDM/LongBench/issues/15 (fixed)
    There are duplicate target answers in `nq.jsonl`, but this doesn't affect evaluation results.
    It also doesn't affect the perplexity calculation (the program only needs to select the minimum loss).
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        # few_shot is accepted for interface consistency with BaseDataset; it is unused here.
        dataset = {"test": {}}

        files = os.listdir(path)
        files.sort()

        for file in files:
            category = file[0:-6]  # strip the ".jsonl" suffix

            if category.endswith("_e"):
                continue

            dataset["test"][category] = {"data": []}

            file_dir = os.path.join(path, file)

            loaded_jsonl = get_json_list(file_dir)

            # It's been tested that all data samples in one subcategory share the same inference arguments.
            inference_kwargs = deepcopy(default_inference_kwargs)
            if loaded_jsonl[0]["all_classes"] is not None:
                inference_kwargs["all_classes"] = loaded_jsonl[0]["all_classes"]
            inference_kwargs["max_new_tokens"] = dataset2maxlen[category]
            dataset["test"][category]["inference_kwargs"] = inference_kwargs

            for sample in loaded_jsonl:
                prompt = dataset2prompt[category].format(**sample)

                data_sample = {
                    "dataset": "longbench",
                    "split": "test",
                    "category": category,
                    "instruction": prompt,
                    "input": "",
                    "output": "",
                    "target": sample["answers"],
                }

                dataset["test"][category]["data"].append(data_sample)

        return dataset
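Each LongBench template is an ordinary Python format string keyed by the jsonl fields, so `dataset2prompt[category].format(**sample)` fills `{context}` and `{input}` straight from the record while extra keys such as `answers` are ignored. A made-up record for illustration:

sample = {
    "context": "The quick brown fox jumps over the lazy dog.",
    "input": "What does the fox jump over?",
    "answers": ["the lazy dog"],
    "all_classes": None,
}

prompt = dataset2prompt["hotpotqa"].format(**sample)
# The hotpotqa template repeats its instruction before and after the passages,
# then ends with "Question: What does the fox jump over?\nAnswer:".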
applications/ColossalEval/colossal_eval/dataset/mmlu.py (new file)
@@ -0,0 +1,73 @@
import copy
import csv
import os
from typing import Dict, List

from colossalai.logging import DistributedLogger

from .base import BaseDataset

default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": ["A", "B", "C", "D"],
    "language": "English",
    "pretrain": False,
    "max_new_tokens": 32,
}


def get_few_shot_data(data: List[Dict]):
    few_shot_data = []
    for i in data:
        few_shot_data.append(i["input"] + i["target"])
    return few_shot_data


class MMLUDataset(BaseDataset):
    """
    Dataset class for the MMLU dataset.
    Data source: https://github.com/hendrycks/test
    This dataset class will convert the original dataset into the inference dataset.
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        dataset = {"dev": {}, "test": {}}
        for split in ["dev", "test"]:
            files = os.listdir(os.path.join(path, split))
            files.sort()

            for file in files:
                subject = file[0 : -len(f"_{split}.csv")].split("_")
                subject = " ".join([word.title() if word != "us" else "US" for word in subject])

                file_dir = os.path.join(path, split, file)

                dataset[split][subject] = {"data": [], "inference_kwargs": {}}

                # It's been tested that all data samples in one subcategory share the same inference arguments.
                dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)

                if split == "test" and few_shot:
                    dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
                        dataset["dev"][subject]["data"]
                    )

                with open(file_dir, encoding="utf-8") as f:
                    reader = csv.reader(f)
                    for row in reader:
                        assert len(row) == 6
                        choices = f"A. {row[1]}\nB. {row[2]}\nC. {row[3]}\nD. {row[4]}"
                        data_sample = {
                            "dataset": "mmlu",
                            "split": split,
                            "category": subject,
                            "instruction": f"The following is a single-choice question on {subject}. Answer the question by replying A, B, C or D.",
                            "input": f"Question: {row[0]}\n{choices}\nAnswer: ",
                            "output": "",
                            "target": row[5],
                        }

                        dataset[split][subject]["data"].append(data_sample)

        return dataset
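Putting the pieces together, a plausible end-to-end use of these wrappers; the directory layout and output path are assumptions, only the class interface comes from this commit:

from colossal_eval.dataset import MMLUDataset

from colossalai.logging import get_dist_logger

logger = get_dist_logger()

# Assumes the original MMLU CSVs live under data/mmlu/dev and data/mmlu/test.
mmlu = MMLUDataset("data/mmlu", logger, few_shot=True)

# Persist the converted inference dataset as JSON for the inference stage.
mmlu.save("inference_data/mmlu.json")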