[evaluation] improvement on evaluation (#3862)

* fix a bug where the config file contains a category that the answer file does not contain

* fix Chinese prompt file

* support gpt-3.5-turbo and gpt-4 evaluation

* polish and update README

* resolve pr comments

---------

Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
Authored by Yuanchen on 2023-05-30 11:48:41 +08:00, committed by GitHub
parent b0474878bf
commit 2506e275b8
7 changed files with 335 additions and 142 deletions


@@ -16,7 +16,7 @@ from utils import jdump, jload
def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
"""
Get evaluation from GPT-4.
Get battle evaluation from GPT-4.
Args:
sys_prompt: prompt for the system.
@@ -51,7 +51,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in
except Exception as e:
print(e)
time.sleep(1)
print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
print(f"Evaluation {id} failed after {MAX_API_RETRY} retries.")
return {"evaluation": "", "id": id}
@@ -233,12 +233,77 @@ def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_pa
print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
def get_gpt35_evaluation(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
model: str = "gpt-3.5-turbo",
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use GPT-3.5 to evaluate one model answer.
Use chat models (gpt-3.5-turbo or gpt-4) to evaluate one model answer.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
inst: the instruction to be evaluated.
metrics: the metrics for evaluation.
model: the model used to evaluate answers.
max_tokens: the maximum number of tokens to generate in the chat completion.
Returns:
An evaluation of one answer.
"""
MAX_API_RETRY = 3
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
answer = inst["output"]
inst["evaluation"] = {}
for metric in metrics:
if prompt["metrics"].get(metric, None) is None:
raise Exception(
f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!"
)
for i in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model=model,
messages=[
{
"role":
"user",
"content":
prompt["prompt"].format(
question=question,
answer=answer,
metric=prompt["metrics"][metric],
steps=prompt["CoT"][metric],
),
},
],
temperature=0,
max_tokens=max_tokens,
)
inst["evaluation"][metric] = {
"response": response["choices"][0]["message"]["content"],
"logprobs": None,
}
break
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
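A usage sketch for the new function, assuming it is imported from this module and `openai.api_key` is set. The prompt wording and the instruction record below are invented, but their fields match what the code above reads (the prompt's `prompt`, `metrics`, and `CoT` entries, and the instruction's `instruction`/`input`/`output`/`category`/`id`):

```python
# Invented example inputs; only the field names are taken from the code above.
prompt = {
    "prompt": "Question: {question}\nAnswer: {answer}\nMetric: {metric}\nSteps: {steps}\nGive a score from 1 to 5.",
    "metrics": {"relevance": "How relevant is the answer to the question?"},
    "CoT": {"relevance": "Read the question, read the answer, then rate relevance from 1 to 5."},
}
inst = {
    "id": 1,
    "category": "brainstorming",
    "instruction": "List three uses for a paper clip.",
    "input": "",
    "output": "Holding papers together, resetting electronics, marking a page.",
}

result = get_gpt_evaluation_without_logprobs(prompt, inst, metrics=["relevance"], model="gpt-3.5-turbo")
print(result["evaluation"]["relevance"]["response"])  # plain-text judgement; "logprobs" is always None for chat models
```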
def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use a completion model (text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
@@ -283,23 +348,22 @@ def get_gpt35_evaluation(prompt: Dict[str, Any],
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
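The diff collapses the request itself inside `get_gpt_evaluation_with_logprobs`. Below is a sketch of what that call presumably looks like, given that later code reads `evaluation["evaluation"][metric]["logprobs"][0]` as a token-to-log-probability mapping; it reuses the function's local variables (`prompt`, `question`, `answer`, `metric`, `inst`, `max_tokens`), and the `logprobs=5` value and response fields are assumptions based on the legacy `openai.Completion` API:

```python
# Assumed shape of the elided request (legacy openai<1.0 Completion API):
# ask for top log probabilities and keep the per-token {token: logprob} dicts.
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt["prompt"].format(
        question=question,
        answer=answer,
        metric=prompt["metrics"][metric],
        steps=prompt["CoT"][metric],
    ),
    logprobs=5,  # number of top alternatives per generated token (assumed value)
    temperature=0,
    max_tokens=max_tokens,
)
inst["evaluation"][metric] = {
    "response": response["choices"][0]["text"],
    "logprobs": response["choices"][0]["logprobs"]["top_logprobs"],
}
```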
def gpt35_evaluate(
answers: List[Dict],
prompt: Dict[str, Any],
metrics: List[str],
category: str,
) -> List[Dict]:
def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], category: str, model: str) -> List[Dict]:
"""
Use GPT-3.5 to evaluate model answers and save evaluation results.
Use GPT models to evaluate model answers and save evaluation results.
Args:
answers: model answers.
prompt: prompt for GPT-3.5 evaluation.
metrics: metrics for GPT-3.5 evaluation.
prompt: prompt for GPT evaluation.
metrics: metrics for GPT evaluation.
category: the category of the model answers for evaluation.
model: the specific GPT model used to evaluate answers.
Returns:
Evaluations of the given answers.
@@ -315,7 +379,12 @@ def gpt35_evaluate(
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for inst in answers:
future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1)
# Completion models can return log probabilities.
if model == "text-davinci-003":
future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
else:
future = executor.submit(get_gpt_evaluation_without_logprobs, prompt, inst, metrics, model, 1)
futures.append(future)
for future in tqdm.tqdm(
@@ -334,20 +403,19 @@ def gpt35_evaluate(
def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
"""
Calculate score from log probabilities returned by text-davinci-003.
Only openai.Completion can return logprobs.
Calculate the score according to log probabilities returned by text-davinci-003.
Calculation formula:
score = sum(score_i * exp(value)), where score_i is the score corresponding to the key (predicted token) and value is its log probability.
Ref: https://arxiv.org/abs/2303.16634
This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).
This paper proposes NLG evaluation methods using text-davinci-003 (log probabilities returned by completion models) and GPT-4 (probabilities obtained by sampling).
Args:
logprobs: logprobs returned by openai.Completion.
Returns:
Score of one answer.
The score of one answer.
"""
# GPT-3.5 only returns score of 1 to 5.
@@ -369,7 +437,31 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
return score
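Most of this function's body is collapsed by the diff. Here is a self-contained sketch of the docstring's formula, weighting each candidate score token by exp(logprob); whether the original normalizes by the total probability mass is an assumption, and the example dict is invented:

```python
import math
from typing import Dict


def weighted_score_from_logprobs(top_logprobs: Dict[str, float]) -> float:
    """Sketch of score = sum(score_i * exp(logprob_i)) over candidate score tokens 1-5."""
    score, total_prob = 0.0, 0.0
    for token, logprob in top_logprobs.items():
        token = token.strip()
        if token.isdigit() and 1 <= int(token) <= 5:
            prob = math.exp(logprob)
            score += int(token) * prob
            total_prob += prob
    # Normalizing keeps the result in [1, 5] even if some probability mass
    # falls on non-score tokens (normalization is an assumption here).
    return score / total_prob if total_prob > 0 else 0.0


# Invented example: the model strongly favors a score of 4.
print(weighted_score_from_logprobs({" 4": -0.2, " 5": -2.0, " 3": -3.5}))  # ~4.1
```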
def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
"""
Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
Unlike text-davinci-003, this function calculates the score directly from the plain response returned by gpt-3.5-turbo or gpt-4.
Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.
Args:
response: the plain response returned by gpt-3.5-turbo or gpt-4.
evaluation: the evaluation corresponding to the question.
Returns:
The score of one answer.
"""
try:
results = re.findall(r"\d", response)
if len(results) == 1:
return int(results[0])
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
except Exception as e:
return 0
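For reference, the expected behaviour of this helper on a couple of invented replies, assuming it is imported from this module:

```python
calculate_scores_form_response("Score: 4", {"id": 1})    # -> 4 (exactly one digit found)
calculate_scores_form_response("4 out of 5", {"id": 1})  # -> 0 (two digits, treated as invalid)
```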
def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
"""
Generate statistics for one model.
@@ -396,7 +488,15 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
scores = {metric: [] for metric in metrics}
for evaluation in data:
for metric in metrics:
scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
if evaluation["evaluation"][metric] == {}:
# An empty evaluation means the API still returned an error after all retries, so the score is set to 0.
scores[metric].append(0)
elif evaluation["evaluation"][metric]["logprobs"] is not None:
scores[metric].append(
calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
else:
scores[metric].append(
calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation))
statistics = {}
for metric in metrics:
@@ -414,7 +514,7 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
)
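The aggregation that turns the per-metric score lists above into the saved statistics is collapsed by the diff. A rough, self-contained sketch of that step, under the assumption that it records an average per metric (the exact fields the original writes are not visible here):

```python
from typing import Dict, List


def summarize_scores(scores: Dict[str, List[float]]) -> Dict[str, Dict[str, float]]:
    """Collapse per-metric score lists into simple summary statistics."""
    summary = {}
    for metric, values in scores.items():
        summary[metric] = {
            "avg_score": sum(values) / len(values) if values else 0.0,
            "num_samples": len(values),
        }
    return summary


# Invented example scores for two metrics.
print(summarize_scores({"relevance": [4.1, 3.0, 5.0], "fluency": [4.0, 4.0]}))
```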
def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None:
def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
"""
Analyze and visualize all GPT evaluation statistics in the given directory.
@@ -474,7 +574,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) ->
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv"))
frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))
for category in tqdm.tqdm(
frame_per_category.keys(),