support boxed reward

2026-07-15 15:29:48 +00:00 · 2025-04-28 17:53:20 +08:00
parent 263a9cbe7a
commit 14c01aec00
4 changed files with 106 additions and 3 deletions
--- a/applications/ColossalChat/coati/distributed/grpo_consumer.py
+++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py
@@ -7,7 +7,7 @@ import torch
 import wandb
 from coati.distributed.consumer import BaseConsumer
 from coati.distributed.loss import PolicyLoss
-from coati.distributed.reward.reward_fn import math_reward_fn
+from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn
 from coati.distributed.reward.verifiable_reward import VerifiableReward
 from coati.distributed.utils import calc_action_log_probs
 from coati.trainer.utils import all_reduce_mean, all_reduce_sum
@@ -133,7 +133,12 @@ class GRPOConsumer(BaseConsumer):
            k: v for k, v in grpo_config.items() if k in ["soft_over_length_punishment", "max_length", "cache_length"]
        }
        self.reward_model = VerifiableReward(
-            reward_fns=[math_reward_fn], tokenizer=self.tokenizer, tags=response_format_tags, **reward_model_kwargs
+            reward_fns=[
+                math_reward_fn if grpo_config.get("reward_fn_type") == "think_answer_tags" else boxed_math_reward_fn
+            ],
+            tokenizer=self.tokenizer,
+            tags=response_format_tags,
+            **reward_model_kwargs,
        )
        self.global_step = 0
        self.use_wandb = use_wandb
--- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py
+++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py
@@ -1,6 +1,6 @@
 import torch

-from .reward_utils import extract_solution, validate_response_structure
+from .reward_utils import extract_boxed_solution, extract_solution, validate_response_structure


 def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
@@ -70,3 +70,43 @@ def gsm8k_reward_fn(input_ids, **kwargs):
        if gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower():
            reward = reward + 9.0
        return reward
+
+
+def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
+    """
+    Score a response by comparing its last `\\boxed{...}` answer with the ground truth.
+
+    Args:
+        input_ids: token id tensor; the inclusive slice given by `response_idx` is decoded.
+        gt_answer: token ids of the reference answer, or None when there is no reference.
+        response_idx: pair of tensors (start, end), inclusive bounds of the response.
+        **kwargs: `tokenizer` (required); optional `soft_over_length_punishment`,
+            `max_length` and `cache_length` controlling the over-length penalty.
+
+    Returns:
+        Tensor `[reward, format_acc, ans_acc]` on `input_ids.device`.
+    """
+    tokenizer = kwargs["tokenizer"]
+    format_score, acc_score = 0.0, 10.0
+    if gt_answer is None:
+        # Same [reward, format_acc, ans_acc] layout as the scored path, not a bare scalar.
+        return torch.zeros(3).to(input_ids.device)
+
+    s, e = response_idx[0], response_idx[1]
+    length_reward = 0.0
+    if kwargs.get("soft_over_length_punishment", False):
+        max_length = kwargs.get("max_length", 1024 * 4)
+        cache_length = kwargs.get("cache_length", 512)
+        res_length = e.item() - s.item() + 1
+        # Penalty grows linearly toward -acc_score as the response nears max_length.
+        if max_length - cache_length < res_length < max_length:
+            length_reward = ((max_length - cache_length) - res_length) / cache_length * acc_score
+
+    response = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True)
+    gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True)
+    final_answer = extract_boxed_solution(response)
+    # A closed \boxed{...} is the only format requirement; correctness also needs a match.
+    format_valid = final_answer is not None
+    ans_valid = format_valid and gt_answer.strip().lower() == final_answer.strip().lower()
+    reward = format_valid * format_score + ans_valid * acc_score + length_reward
+    return torch.tensor([reward, float(format_valid), float(ans_valid)]).to(input_ids.device)
--- a/applications/ColossalChat/coati/distributed/reward/reward_utils.py
+++ b/applications/ColossalChat/coati/distributed/reward/reward_utils.py
@@ -74,3 +74,51 @@ def extract_solution(solution_str: str) -> Tuple[Optional[str], str]:

    final_answer = matches[-1].group(1).strip()
    return final_answer, solution_str
+
+
+def extract_boxed_solution(text: str) -> Optional[str]:
+    r"""
+    Modified from: https://gist.github.com/lewtun/9c2ce1937b741404090a3dc4c7c022b3
+    Retrieves the content from the last occurrence of `\boxed{}` in a LaTeX-like string.
+
+    Args:
+        text (str): A string potentially containing LaTeX-style boxed expressions.
+
+    Returns:
+        Optional[str]: The stripped text inside the final `\boxed{}`, nested braces included;
+                       `None` if `text` is not a string or no properly closed box is found.
+
+    Examples:
+        >>> extract_boxed_solution(r"The answer is \boxed{42}.")
+        '42'
+        >>> extract_boxed_solution(r"Nested: \boxed{\frac{1}{2}}")
+        '\\frac{1}{2}'
+        >>> extract_boxed_solution(r"Here is an unmatched \boxed{42") is None
+        True
+    """
+    marker = "\\boxed{"
+    # Non-string input yields None explicitly rather than through a catch-all except.
+    if not isinstance(text, str):
+        return None
+
+    # Find the last occurrence of "\boxed{"; rfind returns -1 instead of raising.
+    start_idx = text.rfind(marker)
+    if start_idx == -1:
+        return None
+
+    # Move past "\boxed{" to find the start of the content
+    content_start = start_idx + len(marker)
+    open_braces = 1
+
+    # Traverse the string to find the brace that closes the box
+    for pos in range(content_start, len(text)):
+        char = text[pos]
+        if char == "{":
+            open_braces += 1
+        elif char == "}":
+            open_braces -= 1
+            if open_braces == 0:
+                return text[content_start:pos].strip()
+
+    # Ran out of text before the box was closed
+    return None
--- a/applications/ColossalChat/rl_example.py
+++ b/applications/ColossalChat/rl_example.py
@@ -86,6 +86,14 @@ if __name__ == "__main__":
    parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["DAPO", "GRPO"])
    parser.add_argument("-lr", "--learning-rate", type=float, default=1e-6, help="Learning rate for GRPO.")
    parser.add_argument("-kl", "--kl-coeff", type=float, default=0.01, help="KL penalty coefficient for GRPO.")
+    parser.add_argument(
+        "-rt",
+        "--reward-type",
+        type=str,
+        default="think_answer_tags",
+        choices=["think_answer_tags", "boxed"],
+        help="Reward type for GRPO.",
+    )

    # Logging/Checkpointing parameters
    parser.add_argument("-si", "--save-interval", type=int, default=100, help="Interval for saving checkpoints.")
@@ -168,6 +176,7 @@ if __name__ == "__main__":
            "train_microbatch_size": args.train_microbatch_size,
            "beta": args.kl_coeff,  # KL penalty coefficient
            "loss_variation": "sample_level",
+            "reward_fn_type": args.reward_type,
        }
    elif args.algo == "DAPO":
        # DAPO variant settings
@@ -185,6 +194,7 @@ if __name__ == "__main__":
            "max_length": args.max_new_tokens + args.max_prompt_tokens,
            "cache_length": min(1024, int(args.max_new_tokens / 4)),
            "filter_truncated_response": True,
+            "reward_fn_type": args.reward_type,
        }
    else:
        raise ValueError(f"Unsupported algorithm: {args.algo}")