[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786)
* Add ColossalEval
* Delete evaluate in Chat

---------

Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
applications/ColossalEval/colossal_eval/dataset/mmlu.py (new file, 73 lines)
@@ -0,0 +1,73 @@
import copy
import csv
import os
from typing import Dict, List

from colossalai.logging import DistributedLogger

from .base import BaseDataset

default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": ["A", "B", "C", "D"],
    "language": "English",
    "pretrain": False,
    "max_new_tokens": 32,
}


def get_few_shot_data(data: List[Dict]):
    few_shot_data = []
    for i in data:
        few_shot_data.append(i["input"] + i["target"])
    return few_shot_data


class MMLUDataset(BaseDataset):
    """
    Dataset class for the MMLU dataset.
    Data source: https://github.com/hendrycks/test
    This dataset class converts the original dataset into the inference dataset.
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> Dict:
        dataset = {"dev": {}, "test": {}}
        for split in ["dev", "test"]:
            files = os.listdir(os.path.join(path, split))
            files.sort()

            for file in files:
                subject = file[0 : -len(f"_{split}.csv")].split("_")
                subject = " ".join([word.title() if word != "us" else "US" for word in subject])

                file_dir = os.path.join(path, split, file)

                dataset[split][subject] = {"data": [], "inference_kwargs": {}}

                # It's been tested that each data sample in one subcategory has the same inference arguments.
                dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)

                if split == "test" and few_shot:
                    dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
                        dataset["dev"][subject]["data"]
                    )

                with open(file_dir, encoding="utf-8") as f:
                    reader = csv.reader(f)
                    for row in reader:
                        assert len(row) == 6
                        choices = f"A. {row[1]}\nB. {row[2]}\nC. {row[3]}\nD. {row[4]}"
                        data_sample = {
                            "dataset": "mmlu",
                            "split": split,
                            "category": subject,
                            "instruction": f"The following is a single-choice question on {subject}. Answer the question by replying A, B, C or D.",
                            "input": f"Question: {row[0]}\n{choices}\nAnswer: ",
                            "output": "",
                            "target": row[5],
                        }

                        dataset[split][subject]["data"].append(data_sample)

        return dataset
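For orientation, here is a minimal sketch of how this loader might be driven. It is not part of the commit: the data path is hypothetical, and it assumes the MMLU CSVs sit under ./data/mmlu in dev/ and test/ subfolders and that the ColossalEval package is importable.

from colossalai.logging import get_dist_logger

from colossal_eval.dataset.mmlu import MMLUDataset

# Hypothetical data layout: ./data/mmlu/dev/*.csv and ./data/mmlu/test/*.csv
logger = get_dist_logger()
dataset = MMLUDataset.load("./data/mmlu", logger, few_shot=True)

# load() returns a nested dict: dataset[split][subject] ->
# {"data": [...], "inference_kwargs": {...}}
for subject, entry in dataset["test"].items():
    print(subject, len(entry["data"]))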
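The few-shot mechanism is plain prompt concatenation: because "dev" is processed before "test" in the split loop, each dev sample's input and gold target can be joined and attached to the matching test subject's inference_kwargs. A toy illustration using get_few_shot_data from the file above, with a made-up sample:

dev_samples = [
    {
        "input": "Question: What is 2 + 2?\nA. 3\nB. 4\nC. 5\nD. 6\nAnswer: ",
        "target": "B",
    }
]

# Each dev prompt is concatenated with its answer letter.
few_shot_data = get_few_shot_data(dev_samples)
print(few_shot_data[0])
# Question: What is 2 + 2?
# A. 3
# B. 4
# C. 5
# D. 6
# Answer: B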
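Each CSV row must carry exactly six fields (question, four choices, answer letter), which is what the assert len(row) == 6 guards. For a hypothetical row in test/abstract_algebra_test.csv (the question text here is invented for illustration), the parser would yield:

row = ["Find the characteristic of the ring Z_6.", "1", "3", "6", "12", "C"]

# Resulting sample; the subject "Abstract Algebra" is recovered from the file name.
data_sample = {
    "dataset": "mmlu",
    "split": "test",
    "category": "Abstract Algebra",
    "instruction": "The following is a single-choice question on Abstract Algebra. Answer the question by replying A, B, C or D.",
    "input": "Question: Find the characteristic of the ring Z_6.\nA. 1\nB. 3\nC. 6\nD. 12\nAnswer: ",
    "output": "",
    "target": "C",
}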