Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-04 02:26:51 +00:00

[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786)

* Add ColossalEval
* Delete evaluate in Chat

Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>

applications/ColossalEval/colossal_eval/evaluate/gpt_evaluate.py (new file, 852 lines)
@@ -0,0 +1,852 @@
import concurrent.futures
import os
import re
import time
from copy import deepcopy
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns
import tqdm
from colossal_eval.utils import jdump, jload

ref_step_template = {
    "en": "Now please compare the answer with the {adjective} answer and determine whether the answer is able to achieve the same level of {metric}.\n\n",
    "cn": "请比较答案与上面的{adjective}答案,确定答案是否可以达到与该{adjective}答案同样水平的{metric}。\n\n",
}

ref_answer_template_general = {
    "en": "\nAn example answer with good quality is as follows:\n\n{answer}\n\n",
    "cn": "\n一个优质的示例答案如下:\n\n{answer}\n\n",
}

ref_answer_template_correctness = {
    "en": "\nA correct answer is as follows:\n\n{answer}\n\n",
    "cn": "\n标准答案如下:\n\n{answer}\n\n",
}


def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
    """
    Get battle evaluation from GPT-4.

    Args:
        sys_prompt: prompt for the system.
        user_prompt: prompt for the user.
        id: id of the answers for comparison.
        max_tokens: the maximum number of tokens to generate in the chat completion.

    Returns:
        An evaluation of one comparison.
    """

    MAX_API_RETRY = 3
    for _ in range(MAX_API_RETRY):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": sys_prompt},
                    {
                        "role": "user",
                        "content": user_prompt,
                    },
                ],
                temperature=0.2,
                max_tokens=max_tokens,
            )
            evaluation = response["choices"][0]["message"]["content"]
            return {"evaluation": evaluation, "id": id}
        except Exception as e:
            print(e)
            time.sleep(1)
    print(f"Evaluation {id} failed after {MAX_API_RETRY} retries.")
    return {"evaluation": "", "id": id}


def parse_battle_score(evaluation: str) -> List[float]:
    """
    Parse evaluation from GPT-4 and get the scores of model 1 and 2.

    Args:
        evaluation: evaluation from GPT-4.

    Returns:
        A score pair of two different model answers.
    """

    try:
        pattern = re.compile("([0-9]|10) out of 10")
        sp = re.findall(pattern, evaluation)
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]

        pattern = re.compile("a score of ([0-9]|10)")
        sp = re.findall(pattern, evaluation)
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]

        pattern = re.compile("([0-9]|10)/10")
        sp = re.findall(pattern, evaluation)
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]

        score_pair = evaluation.split("\n")[0]
        score_pair = score_pair.replace(",", " ")
        sp = score_pair.split(" ")
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            raise Exception(f"Invalid score pair. Got {evaluation}.")
    except Exception:
        return [-1, -1]
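

# Editor's note: the sketch below is illustrative and not part of the original commit.
# It shows how parse_battle_score falls through its regex patterns on hypothetical
# GPT-4 verdicts and how an unparseable verdict maps to the sentinel pair [-1, -1].
def _demo_parse_battle_score() -> None:
    # Two "x out of 10" matches: both scores are returned in order.
    assert parse_battle_score("Assistant 1 gets 7 out of 10 and Assistant 2 gets 9 out of 10.") == [7.0, 9.0]
    # No recognizable score pattern: the sentinel pair is returned.
    assert parse_battle_score("No numeric verdict was given.") == [-1, -1]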


def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]) -> List[Dict]:
    """
    Use GPT-4 to compare answers of two different models.

    Args:
        answer1: answers of model 1.
        answer2: answers of model 2.
        prompt_dict: prompt for battle.

    Returns:
        Evaluations of all comparison pairs.
    """

    assert len(answer1) == len(answer2)

    total_len = len(answer1)
    question_idx_list = list(range(total_len))

    print(f"Total number of answers: {len(answer1)}.")

    evaluations = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for i in question_idx_list:
            assert answer1[i]["id"] == answer2[i]["id"]
            answer_id = answer1[i]["id"]

            ques = (
                answer1[i]["instruction"]
                if answer1[i]["input"] == ""
                else answer1[i]["instruction"] + " " + answer1[i]["input"]
            )
            ans1 = answer1[i]["output"]
            ans2 = answer2[i]["output"]

            sys_prompt = prompt_dict["system_prompt"]
            prompt_template = prompt_dict["prompt_template"]
            prompt = prompt_template.format(
                question=ques,
                answer_1=ans1,
                answer_2=ans2,
                prompt=prompt_dict["prompt"],
            )

            future = executor.submit(get_battle_result, sys_prompt, prompt, answer_id, 2048)
            futures.append(future)

        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            evaluations.append(future.result())

    evaluations.sort(key=lambda x: x["id"])

    return evaluations


def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_path: str) -> None:
    """
    Save evaluation results (model 1 vs model 2) from GPT-4.

    Args:
        evaluations: evaluation results from GPT-4.
        name1: name of model 1.
        name2: name of model 2.
        save_path: path to save battle results.
    """

    evaluation_file = deepcopy(evaluations)

    ans1_score = 0
    ans2_score = 0
    better_count = 0
    worse_count = 0
    tie_count = 0
    invalid_count = 0

    better_file = []
    worse_file = []
    tie_file = []
    invalid_file = []

    for idx, evaluation in enumerate(evaluations):
        scores = parse_battle_score(evaluation["evaluation"])
        evaluation_file[idx]["score"] = scores

        if scores[0] == -1 and scores[1] == -1:
            invalid_count += 1
            invalid_file.append(evaluation_file[idx])
            print(f'Invalid score pair: {evaluation_file[idx]["id"]}.')
        else:
            if scores[0] > scores[1]:
                worse_count += 1
                worse_file.append(evaluation_file[idx])
            elif scores[0] < scores[1]:
                better_count += 1
                better_file.append(evaluation_file[idx])
            else:
                tie_count += 1
                tie_file.append(evaluation_file[idx])
            ans1_score += scores[0]
            ans2_score += scores[1]

    prefix = f"{name1}_vs_{name2}"

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    jdump(better_file, os.path.join(save_path, prefix, f"{name2}_better.json"))
    jdump(worse_file, os.path.join(save_path, prefix, f"{name2}_worse.json"))
    jdump(tie_file, os.path.join(save_path, prefix, f"{prefix}_tie.json"))
    jdump(invalid_file, os.path.join(save_path, prefix, f"{prefix}_invalid.json"))
    jdump(evaluation_file, os.path.join(save_path, prefix, f"{prefix}_evaluations.json"))

    if os.path.exists(os.path.join(save_path, "battle_results.json")):
        results = jload(os.path.join(save_path, "battle_results.json"))
    else:
        results = {}

    results[prefix] = {
        "model": [name1, name2],
        "better": better_count,
        "worse": worse_count,
        "tie": tie_count,
        "win_rate": better_count / (len(evaluations) - invalid_count),
        "score": [
            ans1_score / (len(evaluations) - invalid_count),
            ans2_score / (len(evaluations) - invalid_count),
        ],
    }
    jdump(results, os.path.join(save_path, "battle_results.json"))

    print(f"Total {invalid_count} invalid score pair(s).")
    print(f"Model {name2} has {better_count} better answer(s).")
    print(f"Model {name2} has {worse_count} worse answer(s).")
    print(f"{tie_count} answer(s) play(s) to a tie.")
    print(f"Win rate of model {name2}: {better_count/(len(evaluations)-invalid_count):.2f}")
    print(f"Model {name1} average score: {ans1_score/(len(evaluations)-invalid_count):.2f}")
    print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")


def reference_template(metric: str, language: str, reference: Dict[str, Any]) -> str:
    """
    Get prompt template for GPT evaluation with reference.

    Different languages have different prompt templates.

    Args:
        metric: metric used in GPT evaluation with reference.
        language: language for the template.
        reference: the instruction that contains the target answer.

    Returns:
        Prompt template for GPT evaluation with reference.
    """

    step_to_add = ref_step_template[language]

    for_the_given_answer = (
        "{metric} (1-5) (directly give the score for the given answer):"
        if language == "en"
        else "{metric} (1-5) (直接对给定答案打分)"
    )

    # "adjective" describes the word "answer" in the prompt.
    adjective = "example" if language == "en" else "示例"
    answer_to_add = ref_answer_template_general[language]

    # Only for correctness do we provide a correct answer, so the adjective for "answer" is "correct"
    # and the prompt reads "a correct answer".
    # In all other cases, the prompt defaults to "an example answer with good quality".
    if metric.lower() == "correctness":
        adjective = "correct" if language == "en" else "标准"
        answer_to_add = ref_answer_template_correctness[language]

    answer_to_add = answer_to_add.format(answer=reference["target"] if reference["target"] else reference["output"])
    step_to_add = step_to_add.format(metric=metric.lower(), adjective=adjective) + for_the_given_answer.format(
        metric=metric
    )

    return answer_to_add + step_to_add
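

# Editor's note: the sketch below is illustrative and not part of the original commit.
# It composes the second-turn reference prompt for a hypothetical English "correctness"
# evaluation, showing that the correct answer is inserted before the extra comparison step.
def _demo_reference_template() -> None:
    reference = {"target": "Paris is the capital of France.", "output": ""}
    second_turn_prompt = reference_template("correctness", "en", reference)
    assert "A correct answer is as follows:" in second_turn_prompt
    assert "correctness (1-5)" in second_turn_prompt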


def fill_in_message(role: str, content: str) -> Dict[str, str]:
    """
    Generate one formatted message to send through chat completion.

    Args:
        role: the role of the author of this message.
        content: the contents of the message.

    Returns:
        One message to send through chat completion.
    """

    return {"role": role, "content": content}


def multiturn_chat_completion(user_messages: List[str], model: str, max_tokens: int = 1, turns=2) -> Dict[str, Any]:
    """
    Do multi-turn chat completion.

    When turns == 1, it is a one-turn conversation for normal GPT evaluation.
    When turns == 2, it is a two-turn conversation used for GPT evaluation with reference answers.

    Args:
        user_messages: messages the user wants to send.
        model: the model used to evaluate answers.
        max_tokens: the maximum number of tokens to generate in the chat completion.
        turns: the number of turns for the conversation.

    Returns:
        The last turn's response.
    """

    if len(user_messages) != turns:
        raise Exception("The length of user messages should be equal to the turn number!")

    assistant_responses = []

    for i in range(turns):
        messages_to_send = []

        for j in range(i):
            messages_to_send.append(fill_in_message("user", user_messages[j]))
            messages_to_send.append(
                fill_in_message("assistant", assistant_responses[j]["choices"][0]["message"]["content"])
            )

        # Length of user messages == length of assistant messages + 1,
        # because we always expect the API to respond to the latest user message.
        messages_to_send.append(fill_in_message("user", user_messages[i]))

        response = openai.ChatCompletion.create(
            model=model,
            messages=messages_to_send,
            temperature=0,
            max_tokens=max_tokens,
        )

        # Avoid exceeding rate limits.
        # You can comment out this line if your requests don't contain many tokens.
        time.sleep(1)

        assistant_responses.append(response)

    return assistant_responses[-1]
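

# Editor's note: the sketch below is illustrative and not part of the original commit.
# It assembles (without calling the API) the message list that multiturn_chat_completion
# sends on its second turn, assuming a hypothetical first-turn assistant reply of "4".
def _demo_two_turn_message_layout() -> None:
    first_turn_prompt = "Rate the answer from 1 to 5."  # hypothetical prompt
    first_turn_reply = "4"  # hypothetical assistant reply from turn one
    second_turn_prompt = "Now compare the answer with the correct answer."  # hypothetical prompt
    second_turn_messages = [
        fill_in_message("user", first_turn_prompt),
        fill_in_message("assistant", first_turn_reply),
        fill_in_message("user", second_turn_prompt),
    ]
    assert [m["role"] for m in second_turn_messages] == ["user", "assistant", "user"]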


def get_gpt_evaluation_without_logprobs(
    prompt: Dict[str, Any],
    inst: Dict[str, Any],
    metrics: List[str],
    language: str,
    reference: Dict[str, Any] = None,
    model: str = "gpt-3.5-turbo",
    max_tokens: int = 2048,
) -> Dict[str, Any]:
    """
    Use chat models (gpt-3.5-turbo or gpt-4) to evaluate one model answer.

    Temperature is set to 0 to make the model more deterministic.

    Args:
        prompt: a dictionary including prompt template, CoT and metrics.
        inst: the instruction to be evaluated.
        metrics: the metrics for evaluation.
        language: language used to adjust the CoT (adds one more step comparing the given answer with the reference) if reference is not None.
        reference: the reference answer.
        model: the model used to evaluate answers.
        max_tokens: the maximum number of tokens to generate in the chat completion.

    Returns:
        An evaluation of one answer.
    """

    MAX_API_RETRY = 3

    question = inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"]
    answer = inst["output"]
    inst["evaluation"] = {}

    for metric in metrics:
        if prompt["metrics"].get(metric, None) is None:
            raise Exception(
                f"Unsupported metric {metric} for category {inst['category']}! You should add this metric to the prompt file!"
            )
        for i in range(MAX_API_RETRY):
            try:
                prompt_reference = "" if reference is None else reference_template(metric, language, reference)

                prompt_1st_round = prompt["prompt"].format(
                    question=question,
                    answer=answer,
                    metric=prompt["metrics"][metric],
                    steps=prompt["CoT"][metric],
                )

                if prompt_reference and (reference["target"] or reference["output"]):
                    # Do a two-round conversation.
                    response = multiturn_chat_completion(
                        [prompt_1st_round, prompt_reference], model, max_tokens=max_tokens, turns=2
                    )
                else:
                    response = multiturn_chat_completion([prompt_1st_round], model, max_tokens=max_tokens, turns=1)

                inst["evaluation"][metric] = {
                    "response": response["choices"][0]["message"]["content"],
                    "logprobs": None,
                }

                # Prevent exceeding rate limits because we have multiple workers,
                # although this slows down the evaluation process.
                # You can comment out this line if your requests don't contain many tokens.
                time.sleep(len(metrics) * 0.5)

                break
            except Exception as e:
                print(e)
                time.sleep(1)
        if metric not in inst["evaluation"]:
            print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
            inst["evaluation"][metric] = {}
    return inst


def get_gpt_evaluation_with_logprobs(
    prompt: Dict[str, Any], inst: Dict[str, Any], metrics: List[str], max_tokens: int = 2048
) -> Dict[str, Any]:
    """
    Use a completion model (text-davinci-003) to evaluate one model answer.
    Only completion models can return log probabilities.

    Temperature is set to 0 to make the model more deterministic.

    Args:
        prompt: a dictionary including prompt template, CoT and metrics.
        inst: the instruction to be evaluated.
        metrics: the metrics for evaluation.
        max_tokens: the maximum number of tokens to generate in the completion.

    Returns:
        An evaluation of one answer.
    """

    MAX_API_RETRY = 3

    question = inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"]
    answer = inst["output"]
    inst["evaluation"] = {}

    for metric in metrics:
        if prompt["metrics"].get(metric, None) is None:
            raise Exception(
                f"Unsupported metric {metric} for category {inst['category']}! You should add this metric to the prompt file!"
            )
        for i in range(MAX_API_RETRY):
            try:
                response = openai.Completion.create(
                    model="text-davinci-003",
                    prompt=prompt["prompt"].format(
                        question=question,
                        answer=answer,
                        metric=prompt["metrics"][metric],
                        steps=prompt["CoT"][metric],
                    ),
                    logprobs=5,
                    temperature=0,
                    max_tokens=max_tokens,
                )
                inst["evaluation"][metric] = {
                    "response": response["choices"][0]["text"],
                    "logprobs": response["choices"][0]["logprobs"]["top_logprobs"],
                }

                # Prevent exceeding rate limits because we have multiple workers,
                # although this slows down the evaluation process.
                # You can comment out this line if your requests don't contain many tokens.
                time.sleep(len(metrics) * 0.5)

                break
            except Exception as e:
                print(e)
                time.sleep(1)
        if metric not in inst["evaluation"]:
            print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
            inst["evaluation"][metric] = {}
    return inst


def evaluate(
    answers: List[Dict],
    prompt: Dict[str, Any],
    metrics: List[str],
    category: str,
    save_path: str,
    model_name: str,
    model: str,
    language: str,
    references: List[Dict] = None,
) -> List[Dict]:
    """
    Use GPT models to evaluate model answers and save evaluation results.

    Args:
        answers: model answers.
        prompt: prompt for GPT evaluation.
        metrics: metrics for GPT evaluation.
        category: the category of the model answers for evaluation.
        save_path: path to save evaluation results.
        model_name: name of the model whose answers are being evaluated.
        model: the specific GPT model used to evaluate answers.
        language: language used in GPT evaluation.
        references: references for GPT evaluation.

    Returns:
        Evaluations of the given answers.
    """

    print(f"The number of instances in category {category} is {len(answers)}.")

    evaluations = []

    metrics_str = ", ".join(metrics)
    print(f"Category {category}'s metrics are {metrics_str}.")

    gpt_base_save_path = os.path.join(save_path, "gpt_evaluate", "gpt_evaluate_results")
    gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
    category_file = os.path.join(gpt_evaluation_results_save_path, model_name, f"{category}_evaluation_results.json")

    if os.path.exists(category_file):
        print(f"Evaluation results for category {category}, model {model_name} already exist.")
        print("Skip evaluating.")

        evaluations = jload(category_file)

        retry = []
        evaluations_copy = deepcopy(evaluations)

        success = []
        for idx, e in enumerate(evaluations_copy):
            keys = list(e["evaluation"].keys())
            for key in keys:
                if e["evaluation"][key] == {}:
                    retry.append(e["id"])
                    print(f"Re-evaluate id {e['id']} now.")
                    break
            if e["id"] not in retry:
                success.append(e)

        if len(retry) == 0:
            evaluations.sort(key=lambda x: x["id"])
            print(f"{category} done.")
            return evaluations

        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            futures = []
            for idx, inst in enumerate(answers):
                if inst["id"] not in retry:
                    continue
                # Completion models can return log probabilities.
                if model == "text-davinci-003":
                    future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
                else:
                    future = executor.submit(
                        get_gpt_evaluation_without_logprobs,
                        prompt,
                        inst,
                        metrics,
                        language,
                        reference=None if references is None else references[idx],
                        model=model,
                        max_tokens=1,
                    )

                futures.append(future)

            for future in tqdm.tqdm(
                concurrent.futures.as_completed(futures),
                desc=f"{category}: ",
                total=len(futures),
            ):
                success.append(future.result())

        success.sort(key=lambda x: x["id"])

        print(f"Saving evaluation results for category {category}, model {model_name}.")

        jdump(success, category_file)

        return success

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for idx, inst in enumerate(answers):
            # Completion models can return log probabilities.
            if model == "text-davinci-003":
                future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
            else:
                future = executor.submit(
                    get_gpt_evaluation_without_logprobs,
                    prompt,
                    inst,
                    metrics,
                    language,
                    reference=None if references is None else references[idx],
                    model=model,
                    max_tokens=1,
                )

            futures.append(future)

        for future in tqdm.tqdm(
            concurrent.futures.as_completed(futures),
            desc=f"{category}: ",
            total=len(futures),
        ):
            evaluations.append(future.result())

    evaluations.sort(key=lambda x: x["id"])

    print(f"{category} done.")

    print(f"Saving evaluation results for category {category}, model {model_name}.")

    jdump(evaluations, category_file)

    return evaluations


def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
    """
    Calculate the score according to the log probabilities returned by text-davinci-003.

    Calculation formula:
        score = sum(score_i * exp(logprob_i)), where score_i is the score corresponding to the key (predicted token) and logprob_i is its log probability.

    Ref: https://arxiv.org/abs/2303.16634
    This paper proposes NLG evaluation methods using text-davinci-003 (log probabilities returned by completion models) and GPT-4 (probabilities obtained by sampling).

    Args:
        logprobs: logprobs returned by openai.Completion.

    Returns:
        The score of one answer.
    """

    # GPT-3.5 only returns scores from 1 to 5.
    prob = np.zeros(5)

    for key, value in logprobs.items():
        # Sometimes the key is a single byte of a unicode character, in the form "bytes:\\xe7".
        # Such keys are meaningless, so we skip them when accumulating probability.
        if "bytes" in key:
            continue
        # results[0] is the score that corresponds to the key (predicted token).
        # For example, key "5" corresponds to score 5.
        results = re.findall(r"\d", key)
        if len(results) == 1:
            prob[int(results[0]) - 1] = prob[int(results[0]) - 1] + np.exp(value)

    score = np.dot(np.arange(1, 6), prob)

    return score
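

# Editor's note: the sketch below is illustrative and not part of the original commit.
# It works through the expectation formula above with hypothetical top_logprobs for a
# single scored token: score = 3 * 0.1 + 4 * 0.7 + 5 * 0.2 = 4.1.
def _demo_calculate_scores_form_logprobs() -> None:
    top_logprobs = {"4": np.log(0.7), "5": np.log(0.2), "3": np.log(0.1)}
    score = calculate_scores_form_logprobs(top_logprobs)
    assert abs(score - 4.1) < 1e-6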


def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
    """
    Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
    Unlike text-davinci-003, this function calculates the score directly from the plain response returned by gpt-3.5-turbo or gpt-4.
    Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.

    Args:
        response: plain response returned by gpt-3.5-turbo or gpt-4.
        evaluation: the evaluation corresponding to the question (only used in the error message).

    Returns:
        The score of one answer.
    """

    try:
        results = re.findall(r"\d", response)
        if len(results) == 1:
            return int(results[0])
        else:
            raise Exception(f"Invalid score. Got {evaluation}.")
    except Exception:
        return 0
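

# Editor's note: the sketch below is illustrative and not part of the original commit.
# It shows that the plain-text fallback extracts exactly one digit and scores anything
# ambiguous as 0. The second argument only appears in the error message, so an empty
# dict is enough here.
def _demo_calculate_scores_form_response() -> None:
    assert calculate_scores_form_response("4", {}) == 4
    # Two digits are ambiguous, so the function falls back to 0.
    assert calculate_scores_form_response("4 or 5", {}) == 0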


def save_gpt_evaluation_results(
    model_name: str, gpt_evaluation_results: Dict[str, Any], save_path: str
) -> Dict[str, Any]:
    """
    Save evaluation results for different categories for one model.

    Args:
        model_name: name of the model for saving evaluation results.
        gpt_evaluation_results: evaluation results for all of the model answers.
        save_path: path to save GPT evaluation statistics.

    Returns:
        All evaluations concatenated across categories.
    """

    all_evaluations = []
    for category, evaluations in gpt_evaluation_results.items():
        jdump(evaluations, os.path.join(save_path, model_name, f"{category}_evaluation_results.json"))
        all_evaluations.extend(evaluations)

    jdump(all_evaluations, os.path.join(save_path, f"{model_name}_evaluation_results.json"))

    return all_evaluations


def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
    """
    Generate statistics for one model.

    Args:
        model_name: name of the model for saving statistics.
        evaluations: evaluations for all of the model answers.
        save_path: path to save GPT evaluation statistics.
    """

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    data_per_category = {}
    for evaluation in evaluations:
        category = evaluation["category"]
        if category in data_per_category:
            data_per_category[category].append(evaluation)
        else:
            data_per_category[category] = [evaluation]

    all_statistics = {}
    for category, data in data_per_category.items():
        metrics = data[0]["evaluation"].keys()
        scores = {metric: [] for metric in metrics}
        for evaluation in data:
            for metric in metrics:
                if evaluation["evaluation"][metric] == {}:
                    # The server still returned an error after 3 retries, so we set the score to 0.
                    scores[metric].append(0)
                elif evaluation["evaluation"][metric]["logprobs"] is not None:
                    scores[metric].append(
                        calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0])
                    )
                else:
                    scores[metric].append(
                        calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation)
                    )

        statistics = {}
        for metric in metrics:
            arg_sort = np.argsort(scores[metric])
            statistics[metric] = {}
            statistics[metric]["avg_score"] = sum(scores[metric]) / len(data)
            statistics[metric]["best_3"] = {data[i]["id"]: scores[metric][i] for i in arg_sort[-3:][::-1]}
            statistics[metric]["worst_3"] = {data[i]["id"]: scores[metric][i] for i in arg_sort[:3]}

        all_statistics[category] = statistics

    jdump(
        all_statistics,
        os.path.join(save_path, f"{model_name}_evaluation_statistics.json"),
    )
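

# Editor's note: the sketch below is illustrative and not part of the original commit.
# It reproduces, on hypothetical ids and scores, how the argsort above selects the
# three best and three worst answers for a metric.
def _demo_best_worst_selection() -> None:
    ids = [10, 11, 12, 13, 14]
    scores = [2, 5, 3, 1, 4]
    arg_sort = np.argsort(scores)
    best_3 = {ids[i]: scores[i] for i in arg_sort[-3:][::-1]}
    worst_3 = {ids[i]: scores[i] for i in arg_sort[:3]}
    assert best_3 == {11: 5, 14: 4, 12: 3}
    assert worst_3 == {13: 1, 10: 2, 12: 3}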


def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
    """
    Analyze and visualize all GPT evaluation statistics in the given directory.

    Args:
        statistics_path: path to all the models' statistics.
        save_path: path to save table and visualization results.
    """

    if not os.path.exists(statistics_path):
        raise Exception(f'The given directory "{statistics_path}" doesn\'t exist! No statistics found!')

    all_statistics = {}

    for file_name in os.listdir(statistics_path):
        if file_name.endswith("_evaluation_statistics.json"):
            model_name = file_name.split("_evaluation_statistics.json")[0]
            all_statistics[model_name] = jload(os.path.join(statistics_path, file_name))

    if len(all_statistics) == 0:
        raise Exception(f'There are no statistics in the given directory "{statistics_path}"!')

    frame_all = {
        "model": [],
        "category": [],
        "metric": [],
        "avg_score": [],
        "best_3": [],
        "worst_3": [],
    }
    frame_per_category = {}
    for model_name, model_statistics in all_statistics.items():
        for category, category_statistics in model_statistics.items():
            if frame_per_category.get(category) is None:
                frame_per_category[category] = {
                    "model": [],
                    "metric": [],
                    "avg_score": [],
                    "best_3": [],
                    "worst_3": [],
                }

            for metric, metric_statistics in category_statistics.items():
                frame_all["model"].append(model_name)
                frame_all["category"].append(category)
                frame_all["metric"].append(metric)
                frame_all["avg_score"].append(metric_statistics["avg_score"])
                frame_all["best_3"].append(metric_statistics["best_3"])
                frame_all["worst_3"].append(metric_statistics["worst_3"])

                frame_per_category[category]["model"].append(model_name)
                frame_per_category[category]["metric"].append(metric)
                frame_per_category[category]["avg_score"].append(metric_statistics["avg_score"])
                frame_per_category[category]["best_3"].append(metric_statistics["best_3"])
                frame_per_category[category]["worst_3"].append(metric_statistics["worst_3"])

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    frame_all = pd.DataFrame(frame_all)
    frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))

    for category in tqdm.tqdm(
        frame_per_category.keys(),
        desc="GPT evaluation: ",
        total=len(frame_per_category.keys()),
    ):
        data = pd.DataFrame(frame_per_category[category])

        sns.set()
        fig = plt.figure(figsize=(16, 10))
        plt.ylim((0, 5))

        fig = sns.barplot(x="metric", y="avg_score", hue="model", data=data, dodge=True)
        fig.set_title(f"Comparison between Different Models for Category {category.title()}")
        plt.xlabel("Evaluation Metric")
        plt.ylabel("Average Score")

        figure = fig.get_figure()
        figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)

        plt.close()