small fix

2025-08-15 22:53:12 +00:00 · 2025-04-16 15:59:25 +08:00 · 2025-04-16 15:59:25 +08:00 · 9474316132
commit 9474316132
parent cc4faa7300
2 changed files with 14 additions and 17 deletions
--- a/applications/ColossalChat/coati/distributed/grpo_consumer.py
+++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py
@ -206,12 +206,6 @@ class GRPOConsumer(BaseConsumer):
        total_samples = all_reduce_sum(torch.sum(torch.ones_like(loss_mask, device=loss_mask.device)), self.plugin)
        self.effective_sample_count += effective_samples.item()
        self.total_sample_count += total_samples.item()
        print(
            loss_mask,
            self.effective_sample_count,
            self.total_sample_count,
            self.batch_size * self.dp_size * self.num_generations * 0.75,
        )
        mean_kl, mean_loss = [], []
@ -426,17 +420,19 @@ class GRPOConsumer(BaseConsumer):
                    self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0
                ):
                    to_log_msg = (
-                        f"Loss: {self.accum_loss.item() / self.accum_count:.4f} \
+                        [
-                        Reward: {self.accum_reward.item() / self.accum_count:.4f} \
+                            f"Loss: {self.accum_loss.item() / self.accum_count:.4f}",
-                        Format Reward: {self.accum_format_acc.item() / self.accum_count:.4f} \
+                            f"Reward: {self.accum_reward.item() / self.accum_count:.4f}",
-                        Acc Reward: {self.accum_ans_acc.item() / self.accum_count:.4f} \
+                            f"ormat Reward: {self.accum_format_acc.item() / self.accum_count:.4f}",
-                        Advantages: {self.accum_advantages.item() / self.accum_count:.4f} \
+                            f"Acc Reward: {self.accum_ans_acc.item() / self.accum_count:.4f}",
-                        Response Length: {self.accum_response_length.item() / self.accum_count:.4f}"
+                            f"Advantages: {self.accum_advantages.item() / self.accum_count:.4f}",
-                        + f" KL: {self.accum_kl.item() / self.accum_count:.4f}"
+                            f"Response Length: {self.accum_response_length.item() / self.accum_count:.4f}",
                        ]
                        + [f"KL: {self.accum_kl.item() / self.accum_count:.4f}"]
                        if self.policy_loss_fn.beta > 0
-                        else ""
+                        else []
                    )
-                    print(to_log_msg)
+                    print("\n".join(to_log_msg))
                    metrics = {
                        "metrics/reward": self.accum_reward.item() / self.accum_count,
                        "metrics/format_acc": self.accum_format_acc.item() / self.accum_count,
--- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py
+++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py
@ -35,9 +35,10 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
        format_acc += 1
        reward += format_score
-    # Check answer accuracy
+    # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid
    if (
-        final_answer is not None
+        format_valid
        and final_answer is not None
        and gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower()
    ):
        ans_acc += 1