diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py
index a0f92d8c4..1260645c9 100644
--- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py
+++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py
@@ -3,14 +3,11 @@ import torch
 from .reward_utils import extract_solution, validate_response_structure
 
 
-def math_reward_fn(step, input_ids, gt_answer, response_idx, **kwargs):
+def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
     tokenizer = kwargs["tokenizer"]
     soft_over_length_punishment = kwargs["soft_over_length_punishment"]
-    format_score = 1.0
-    acc_score = 9.0
-    if step > 30:
-        format_score = 0.0
-        acc_score = 10.0
+    format_score = 0.0
+    acc_score = 10.0
     reward = torch.tensor(0.0)
     format_reward = torch.tensor(0.0)
     acc_reward = torch.tensor(0.0)
@@ -21,10 +18,8 @@ def math_reward_fn(step, input_ids, gt_answer, response_idx, **kwargs):
         max_length = kwargs.get("max_length", 1024 * 4)
         cache_length = kwargs.get("cache_length", 512)
         res_length = e.item() - s.item() + 1
-        if res_length >= max_length:
-            length_reward = -1.0 * 2
-        elif res_length > max_length - cache_length:
-            length_reward = ((max_length - cache_length) - res_length) / cache_length * 2
+        if max_length - cache_length < res_length < max_length:
+            length_reward = ((max_length - cache_length) - res_length) / cache_length * acc_score
 
     if gt_answer is None:
         return reward