[evaluation] improvement on evaluation (#3862)

* fix a bug when the config file contains a category but the answer file doesn't contain that category

* fix Chinese prompt file

* support gpt-3.5-turbo and gpt-4 evaluation

* polish and update README

* resolve PR comments

---------

Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
Authored by Yuanchen on 2023-05-30 11:48:41 +08:00, committed by GitHub
parent b0474878bf
commit 2506e275b8
7 changed files with 335 additions and 142 deletions


@@ -39,7 +39,8 @@ def main(args):
             "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
     # initialize evaluator
-    evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
+    evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
+                          config["language"])
     if len(args.model_name_list) == 2:
         answers1 = jload(args.answer_file_list[0])
         answers2 = jload(args.answer_file_list[1])
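
For context, a minimal sketch of how the updated Evaluator constructor might absorb the two new arguments; the parameter names gpt_model and language are assumptions inferred from the call site above, not the actual implementation in the repository:

# Sketch only: the real Evaluator class lives elsewhere in the evaluation package.
class Evaluator:

    def __init__(self, metrics_per_category, battle_prompt, gpt_evaluation_prompt, gpt_model, language):
        self.metrics_per_category = metrics_per_category
        self.battle_prompt = battle_prompt
        self.gpt_evaluation_prompt = gpt_evaluation_prompt
        self.gpt_model = gpt_model    # assumed: e.g. "gpt-3.5-turbo" or "gpt-4"
        self.language = language      # assumed: taken from config["language"]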
@@ -87,6 +88,10 @@ if __name__ == '__main__':
                         default=[],
                         required=True,
                         help='the names of at most 2 models')
+    parser.add_argument('--gpt_model',
+                        default="gpt-3.5-turbo",
+                        choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
+                        help='which GPT model to use for evaluation')
     parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
     parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
     args = parser.parse_args()
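
As a quick sanity check of the new flag, the snippet below rebuilds just the --gpt_model argument exactly as defined in the diff above and parses it in isolation; the surrounding script and its other required arguments are omitted here.

import argparse

# Rebuild only the new --gpt_model argument from the diff above.
parser = argparse.ArgumentParser()
parser.add_argument('--gpt_model',
                    default="gpt-3.5-turbo",
                    choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
                    help='which GPT model to use for evaluation')

print(parser.parse_args(['--gpt_model', 'gpt-4']).gpt_model)    # gpt-4
print(parser.parse_args([]).gpt_model)                          # gpt-3.5-turbo (the default)
# An unsupported value such as --gpt_model gpt-2 exits with an argparse "invalid choice" error.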