[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit

* [misc] run pre-commit

* [misc] remove useless configuration files

* [misc] ignore cuda for clang-format
Author: Hongxin Liu (committed by GitHub)
Date: 2023-09-19 14:20:26 +08:00
Commit: 079bf3cb26 (parent 3c6b831c26)
1268 changed files with 50037 additions and 38444 deletions

@@ -28,29 +28,29 @@ from .utils import add_question
class SumEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
""" Set up evaluator for text summarization """
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for text summarization"""
self.scorer = UniEvaluator(
model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir)
self.task = 'summarization'
self.dimensions = ['coherence', 'consistency', 'fluency', 'relevance']
cache_dir=cache_dir,
)
self.task = "summarization"
self.dimensions = ["coherence", "consistency", "fluency", "relevance"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
Get the scores of all the given dimensions
category: The category to be evaluated.
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
four dimensions: coherence, consistency, fluency, relevance.
dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
four dimensions: coherence, consistency, fluency, relevance.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
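
As a reference for the evaluate() signature documented in the docstring above, here is a minimal usage sketch. It assumes the classes defined in this file are in scope; the record values and the category label are made up, while the field names (source, system_output, reference) come from the code further down in this diff.

# Hypothetical summarization records; field names follow this file, values are made up.
data = [
    {
        "source": "The fox jumped over the dog. Then it ran into the woods.",
        "system_output": "A fox jumped over a dog and ran away.",
        "reference": "A fox jumps over a dog and runs off.",
    }
]

sum_eval = SumEvaluator(model_name_or_path="")  # empty string falls back to MingZhong/unieval-sum
scores = sum_eval.evaluate(data, category="general", dims=["coherence", "relevance"], overall=True)
# scores -> [{"coherence": ..., "relevance": ..., "overall": ...}]
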
@@ -63,12 +63,12 @@ class SumEvaluator:
for dim in eval_dims:
# Calculate average sentence-level scores for 'consistency' and 'fluency'
if dim == 'consistency' or dim == 'fluency':
if dim == "consistency" or dim == "fluency":
src_list, output_list = [], []
n_sents = [] # the number of sentences in each generated summary
n_sents = [] # the number of sentences in each generated summary
for i in range(n_data):
source = data[i]['source']
system_outputs = sent_tokenize(data[i]['system_output'])
source = data[i]["source"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
@@ -81,24 +81,26 @@ class SumEvaluator:
score = []
for cur_n_sent in n_sents:
# prevent denominator from being 0
score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
start_idx += cur_n_sent
# Calculate summary-level score for 'coherence' and 'relevance'
elif dim == 'coherence' or dim == 'relevance':
elif dim == "coherence" or dim == "relevance":
src_list, output_list, ref_list = [], [], []
for i in range(n_data):
src_list.append(data[i]['source'])
output_list.append(data[i]['system_output'])
if dim == 'relevance':
ref_list.append(data[i]['reference'])
src_list.append(data[i]["source"])
output_list.append(data[i]["system_output"])
if dim == "relevance":
ref_list.append(data[i]["reference"])
input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
score = self.scorer.score(input_list, self.task, category, dim)
# Please customize other dimensions here for summarization
else:
raise NotImplementedError('The input format for this dimension is still undefined. \
Please customize it first.')
raise NotImplementedError(
"The input format for this dimension is still undefined. \
Please customize it first."
)
for i in range(n_data):
eval_scores[i][dim] = score[i]
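
The slice-and-average step in the hunk above regroups a flat list of sentence-level scores back into per-sample scores; the 1e-6 term only matters when a sample produced zero sentences. A standalone sketch of that arithmetic with made-up numbers:

sent_score = [0.8, 0.6, 0.9]  # flat per-sentence scores (hypothetical values)
n_sents = [2, 0, 1]           # sentences per sample; the middle sample produced none

start_idx, score = 0, []
for cur_n_sent in n_sents:
    score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
    start_idx += cur_n_sent
# score is approximately [0.7, 0.0, 0.9]; without the 1e-6 term the empty sample would raise ZeroDivisionError
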
@@ -106,35 +108,35 @@ class SumEvaluator:
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class DialogEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
""" Set up evaluator for dialogues """
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for dialogues"""
self.scorer = UniEvaluator(
model_name_or_path='MingZhong/unieval-dialog' if model_name_or_path == "" else model_name_or_path,
model_name_or_path="MingZhong/unieval-dialog" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir)
self.task = 'dialogue'
self.dimensions = ['naturalness', 'coherence', 'engagingness', 'groundedness', 'understandability']
cache_dir=cache_dir,
)
self.task = "dialogue"
self.dimensions = ["naturalness", "coherence", "engagingness", "groundedness", "understandability"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
Get the scores of all the given dimensions
category: The category to be evaluated.
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
five dimensions: naturalness, coherence, engagingness, groundedness and understandability.
dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
five dimensions: naturalness, coherence, engagingness, groundedness and understandability.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
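
For the dialogue task, each record also carries a context field, read in the next hunk alongside source and system_output. A hypothetical record, with made-up values and category label:

# Hypothetical dialogue record; field names follow the diff.
dialog_data = [
    {
        "source": "Knowledge snippet the response should be grounded in.",
        "context": "User: How do I reset my password?",
        "system_output": "You can reset it from the account settings page.",
    }
]
# dialog_eval = DialogEvaluator(model_name_or_path="")   # falls back to MingZhong/unieval-dialog
# dialog_eval.evaluate(dialog_data, category="support")  # category label is an assumption
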
@@ -147,50 +149,48 @@ class DialogEvaluator:
for dim in eval_dims:
# Calculate summation score for 'engagingness'
if dim == 'engagingness':
if dim == "engagingness":
src_list, output_list, context_list = [], [], []
n_sents = [] # the number of sentences in each generated response
n_sents = [] # the number of sentences in each generated response
for i in range(n_data):
source = data[i]['source']
context = data[i]['context']
system_outputs = sent_tokenize(data[i]['system_output'])
source = data[i]["source"]
context = data[i]["context"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
context_list.append(context)
output_list.append(system_outputs[j])
input_list = add_question(dimension=dim,
output=output_list,
src=src_list,
context=context_list,
task=self.task)
input_list = add_question(
dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
)
sent_score = self.scorer.score(input_list, self.task, category, dim)
# Get the summation score for each sample
start_idx = 0
score = []
for cur_n_sent in n_sents:
score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]))
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]))
start_idx += cur_n_sent
# Calculate turn-level score for other dimensions
elif dim in ['naturalness', 'coherence', 'groundedness', 'understandability']:
elif dim in ["naturalness", "coherence", "groundedness", "understandability"]:
src_list, output_list, context_list = [], [], []
for i in range(n_data):
src_list.append(data[i]['source'])
output_list.append(data[i]['system_output'])
context_list.append(data[i]['context'])
input_list = add_question(dimension=dim,
output=output_list,
src=src_list,
context=context_list,
task=self.task)
src_list.append(data[i]["source"])
output_list.append(data[i]["system_output"])
context_list.append(data[i]["context"])
input_list = add_question(
dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
)
score = self.scorer.score(input_list, self.task, category, dim)
# Please customize other dimensions here for summarization
else:
raise NotImplementedError('The input format for this dimension is still undefined. \
Please customize it first.')
raise NotImplementedError(
"The input format for this dimension is still undefined. \
Please customize it first."
)
for i in range(n_data):
eval_scores[i][dim] = score[i]
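
One behavioural note on the hunk above: "engagingness" sums the per-sentence scores instead of averaging them, so longer responses can accumulate a higher score. A standalone sketch of that summation with made-up numbers:

sent_score = [0.5, 0.4, 0.7]  # hypothetical per-sentence engagingness scores
n_sents = [2, 1]              # two responses: 2 sentences and 1 sentence

start_idx, score = 0, []
for cur_n_sent in n_sents:
    score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]))  # sum, not average
    start_idx += cur_n_sent
# score == [0.9, 0.7]
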
@@ -198,35 +198,35 @@ class DialogEvaluator:
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class D2tEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
""" Set up evaluator for data-to-text """
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for data-to-text"""
self.scorer = UniEvaluator(
model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir)
self.task = 'data2text'
self.dimensions = ['naturalness', 'informativeness']
cache_dir=cache_dir,
)
self.task = "data2text"
self.dimensions = ["naturalness", "informativeness"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
Get the scores of all the given dimensions
category: The category to be evaluated.
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
two dimensions: naturalness and informativeness.
dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
two dimensions: naturalness and informativeness.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
@@ -240,8 +240,8 @@ class D2tEvaluator:
for dim in eval_dims:
output_list, ref_list = [], []
for i in range(n_data):
output_list.append(data[i]['system_output'])
ref_list.append(data[i]['reference'])
output_list.append(data[i]["system_output"])
ref_list.append(data[i]["reference"])
input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
score = self.scorer.score(input_list, self.task, category, dim)
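
As the hunk above shows, the data-to-text evaluator reads only system_output and reference from each record (no source). A hypothetical record and call:

# Hypothetical data-to-text record; field names follow the diff, values are made up.
d2t_data = [
    {
        "system_output": "The Eagle is a coffee shop near Burger King.",
        "reference": "The Eagle coffee shop is located near Burger King.",
    }
]
# D2tEvaluator(model_name_or_path="").evaluate(d2t_data, category="restaurant")  # category is an assumption
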
@@ -252,38 +252,38 @@ class D2tEvaluator:
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class FactEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
""" Set up evaluator for factual consistency detection """
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for factual consistency detection"""
self.scorer = UniEvaluator(
model_name_or_path='MingZhong/unieval-fact' if model_name_or_path == "" else model_name_or_path,
model_name_or_path="MingZhong/unieval-fact" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir)
self.task = 'fact'
self.dim = 'consistency'
cache_dir=cache_dir,
)
self.task = "fact"
self.dim = "consistency"
def evaluate(self, data, category):
"""
Get the factual consistency score (only 1 dimension for this task)
Get the factual consistency score (only 1 dimension for this task)
category: The category to be evaluated.
category: The category to be evaluated.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
# Calculate average sentence-level scores for factual consistency
src_list, output_list = [], []
n_sents = [] # the number of sentences in the claim
n_sents = [] # the number of sentences in the claim
for i in range(n_data):
source = data[i]['source']
system_outputs = sent_tokenize(data[i]['system_output'])
source = data[i]["source"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
@@ -295,7 +295,7 @@ class FactEvaluator:
start_idx = 0
score = []
for cur_n_sent in n_sents:
score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / cur_n_sent)
start_idx += cur_n_sent
for i in range(n_data):
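
FactEvaluator reuses the sentence-splitting pattern but averages without the 1e-6 guard, so a record whose system_output tokenizes to zero sentences would presumably raise ZeroDivisionError here. Its records need only source and system_output; a hypothetical example:

# Hypothetical factual-consistency record; field names follow the diff, values are made up.
fact_data = [
    {
        "source": "The meeting was moved to Friday at 10 am.",
        "system_output": "The meeting now takes place on Friday. It starts at 10 am.",
    }
]
# FactEvaluator(model_name_or_path="").evaluate(fact_data, category="schedule")  # category is an assumption
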
@@ -304,28 +304,26 @@ class FactEvaluator:
return eval_scores
def get_evaluator(task, model_name_or_path="", max_length=1024, device='cuda:0', cache_dir=None):
assert task in ['summarization', 'dialogue', 'data2text', 'fact']
if task == 'summarization':
return SumEvaluator(model_name_or_path=model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir)
elif task == 'dialogue':
return DialogEvaluator(model_name_or_path=model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir)
elif task == 'data2text':
return D2tEvaluator(model_name_or_path=model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir)
elif task == 'fact':
return FactEvaluator(model_name_or_path=model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir)
def get_evaluator(task, model_name_or_path="", max_length=1024, device="cuda:0", cache_dir=None):
assert task in ["summarization", "dialogue", "data2text", "fact"]
if task == "summarization":
return SumEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
elif task == "dialogue":
return DialogEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
elif task == "data2text":
return D2tEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
elif task == "fact":
return FactEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
else:
raise NotImplementedError('Other tasks are not implemented, \
please customize specific tasks here.')
raise NotImplementedError(
"Other tasks are not implemented, \
please customize specific tasks here."
)
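
Finally, an end-to-end sketch of the get_evaluator factory. The import path is an assumption (the file name is not shown in this view), and the category label is made up:

from evaluator import get_evaluator  # module name is an assumption, not confirmed by this diff

records = [{"source": "...", "system_output": "...", "reference": "..."}]  # see the schema sketches above
uni_eval = get_evaluator("summarization")                # dispatches to SumEvaluator
scores = uni_eval.evaluate(records, category="general")  # "general" is a made-up category label
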