From d03cdea949425f02b0f147e8b604f6d770a5cfeb Mon Sep 17 00:00:00 2001
From: Tong Li <tong.li35271158@gmail.com>
Date: Thu, 6 Mar 2025 10:53:48 +0800
Subject: [PATCH] reward_fn: skip special tokens in gt_answer, cut match bonus to 2.0

---
 .../ColossalChat/coati/distributed/reward/reward_fn.py        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py
index c92f822f7..9e6d1066e 100644
--- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py
+++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py
@@ -11,7 +11,7 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
         return reward
 
     decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True)
-    gt_answer = tokenizer.decode(gt_answer.squeeze(0))
+    gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True)
     final_answer, processed_str = extract_solution(decoded_final_answer)
 
     format_valid = validate_response_structure(processed_str, kwargs["tags"])
@@ -20,7 +20,7 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
     else:
         reward += 1.0
         if gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower():
-            reward = reward + 9.0
+            reward = reward + 2.0
         return reward