[evaluation] improvement on evaluation (#3862)

* fix a bug when the config file contains a category but the answer file doesn't contain that category

* fix Chinese prompt file

* support gpt-3.5-turbo and gpt-4 evaluation

* polish and update README

* resolve PR comments

---------

Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
Authored by Yuanchen on 2023-05-30 11:48:41 +08:00, committed by GitHub
parent b0474878bf
commit 2506e275b8
7 changed files with 335 additions and 142 deletions


@@ -39,7 +39,8 @@ def main(args):
             "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
     # initialize evaluator
-    evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
+    evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
+                          config["language"])
     if len(args.model_name_list) == 2:
         answers1 = jload(args.answer_file_list[0])
         answers2 = jload(args.answer_file_list[1])
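
For context, a minimal sketch of how the updated Evaluator constructor might absorb the two new arguments; the parameter names gpt_model and language are assumptions inferred from the call site above, not the actual implementation in the repository:

# Sketch only: the real Evaluator class lives elsewhere in the evaluation package.
class Evaluator:

    def __init__(self, metrics_per_category, battle_prompt, gpt_evaluation_prompt, gpt_model, language):
        self.metrics_per_category = metrics_per_category
        self.battle_prompt = battle_prompt
        self.gpt_evaluation_prompt = gpt_evaluation_prompt
        self.gpt_model = gpt_model    # assumed: e.g. "gpt-3.5-turbo" or "gpt-4"
        self.language = language      # assumed: taken from config["language"]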
@@ -87,6 +88,10 @@ if __name__ == '__main__':
                         default=[],
                         required=True,
                         help='the names of at most 2 models')
+    parser.add_argument('--gpt_model',
+                        default="gpt-3.5-turbo",
+                        choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
+                        help='which GPT model to use for evaluation')
     parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
     parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
     args = parser.parse_args()
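
As a quick sanity check of the new flag, the snippet below rebuilds just the --gpt_model argument exactly as defined in the diff above and parses it in isolation; the surrounding script and its other required arguments are omitted here.

import argparse

# Rebuild only the new --gpt_model argument from the diff above.
parser = argparse.ArgumentParser()
parser.add_argument('--gpt_model',
                    default="gpt-3.5-turbo",
                    choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
                    help='which GPT model to use for evaluation')

print(parser.parse_args(['--gpt_model', 'gpt-4']).gpt_model)    # gpt-4
print(parser.parse_args([]).gpt_model)                          # gpt-3.5-turbo (the default)
# An unsupported value such as --gpt_model gpt-2 exits with an argparse "invalid choice" error.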