From 55eee129d29185878bdb5a41b1b1b6fda1483af0 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 15 May 2025 18:16:50 +0800 Subject: [PATCH 01/34] move prompt-level-filtering to buffer side --- .../coati/distributed/consumer.py | 22 +++-- .../coati/distributed/grpo_consumer.py | 83 +++++++++++++++---- 2 files changed, 81 insertions(+), 24 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index ecfd65458..6385df2cf 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -113,18 +113,24 @@ class BaseConsumer: ) as pbar: for step in pbar: i = 0 - allow_sync_model = False + allow_sync_model = True for _ in range(self.num_recv_per_update): # receive data from producers for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") - self.buffer.extend( - unbind_batch( - ray_broadcast_tensor_dict( - None, src=0, device=self.device, group_name=f"sync_data_{r}" - ) - ) + raw_batch = unbind_batch( + ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") ) + filtered_batch = [ + t + for t in [ + self.prompt_level_filtering(self.calculate_group_reward(group)) + for group in raw_batch + ] + if t is not None + ] + + self.buffer.extend(filtered_batch) while len(self.buffer) >= self.dp_size * self.minibatch_size: batches = self.buffer[ self.dp_rank * self.minibatch_size : (self.dp_rank + 1) * self.minibatch_size @@ -177,7 +183,7 @@ class BaseConsumer: ) del state_dict torch.cuda.empty_cache() - allow_sync_model = False + allow_sync_model = True @ray.remote diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 7ea2dbf18..fcf7b0740 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -1,5 +1,5 @@ from contextlib import nullcontext -from typing import Any, Optional +from typing import Any, Dict, Optional import ray import torch @@ -179,7 +179,6 @@ class GRPOConsumer(BaseConsumer): Format: [minibatch_size, num_of_generation, prompt_length + response_length] --- ............. 
""" - # Reshape to [minibatch_size x num_of_generation, prompt_length + response_length] data = {k: v.view(-1, v.size(-1)) for k, v in kwargs.items()} action_mask = data["action_mask"] @@ -188,15 +187,9 @@ class GRPOConsumer(BaseConsumer): response_length = torch.sum(action_mask, dim=1).to(torch.float32) train_microbatch_size = self.grpo_config.get("train_microbatch_size", data["input_ids"].size(0)) - reward_group = self.reward_model( - data["input_ids"], - gt_answer=data["gt_answer"], - response_idx=data["response_idx"], - ) - - reward = torch.tensor([value[0] for value in reward_group]).to(data["input_ids"].device) - format_acc = torch.tensor([value[1] for value in reward_group]).to(data["input_ids"].device) - ans_acc = torch.tensor([value[2] for value in reward_group]).to(data["input_ids"].device) + reward = data["reward"].view((-1)) + format_acc = data["format_acc"].view((-1)) + ans_acc = data["ans_acc"].view((-1)) # [minibatch_size, num_generations] @@ -213,11 +206,7 @@ class GRPOConsumer(BaseConsumer): ans_acc.view(-1, self.num_generations).mean(dim=1).repeat_interleave(self.num_generations, dim=0) ) # [minibatch_size x num_of_generation] - loss_mask = ( - torch.ones(action_mask.size(0), device=action_mask.device).bool() - if self.filter_range is None - else torch.logical_and(group_ans_acc > self.filter_range[0], group_ans_acc < self.filter_range[1]) - ) + loss_mask = torch.ones(action_mask.size(0), device=action_mask.device).bool() # filter out overlength samples if self.filter_truncated_response and action_mask.size(1) == self.max_length: @@ -525,6 +514,68 @@ class GRPOConsumer(BaseConsumer): else: return None, excessive_prompts_idx + def calculate_group_reward(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: + """ + Calculate the group reward for the given rollout group. + + Args: + rollout_group (Dict[str, Any]): + a group of samples generated by the model from the same prompt + contain the following keys: + "input_ids": torch.Tensor, [num_of_generation, prompt_length + response_length] + "attention_mask": torch.Tensor, [num_of_generation, prompt_length + response_length] + "action_mask": torch.Tensor, [num_of_generation, response_length] + "action_log_probs": torch.Tensor, [num_of_generation, response_length] + "response_idx": int, torch.Tensor, [num_of_generation, 2] + "gt_answer": torch.Tensor, [num_of_generation, 128] + "temperature": torch.Tensor, [] (scalar) + + Returns: + Dict[str, Any]: The new group data with calculated reward. 
+ """ + reward_group = self.reward_model( + rollout_group["input_ids"], + gt_answer=rollout_group["gt_answer"], + response_idx=rollout_group["response_idx"], + ) + # [num_of_generation] + reward = torch.tensor([value[0] for value in reward_group]).to(rollout_group["input_ids"].device) + format_acc = torch.tensor([value[1] for value in reward_group]).to(rollout_group["input_ids"].device) + ans_acc = torch.tensor([value[2] for value in reward_group]).to(rollout_group["input_ids"].device) + + rollout_group["reward"] = reward.view((-1, 1)) + rollout_group["format_acc"] = format_acc.view((-1, 1)) + rollout_group["ans_acc"] = ans_acc.view((-1, 1)) + return rollout_group + + def prompt_level_filtering(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: + """ + rollout_group: Dict[str, Any] + a group of samples generated by the model from the same prompt + contain the following keys: + "input_ids": torch.Tensor, [num_of_generation, prompt_length + response_length] + "attention_mask": torch.Tensor, [num_of_generation, prompt_length + response_length] + "action_mask": torch.Tensor, [num_of_generation, response_length] + "action_log_probs": torch.Tensor, [num_of_generation, response_length] + "response_idx": int, torch.Tensor, [num_of_generation, 2] + "gt_answer": torch.Tensor, [num_of_generation, 128] + "temperature": torch.Tensor, [] (scalar) + "reward": torch.Tensor, [num_of_generation] + "format_acc": torch.Tensor, [num_of_generation] + "ans_acc": torch.Tensor, [num_of_generation] + """ + if self.filter_range is not None: + # filter prompt whoes accuracy is too high or too low (out of range) + group_ans_acc = torch.mean(rollout_group["ans_acc"]) + if group_ans_acc < self.filter_range[0] or group_ans_acc > self.filter_range[1]: + # filter out the prompt + return None + else: + return rollout_group + else: + # no filter + return rollout_group + def state_dict(self): self.policy_model._force_wait_all_gather() model = self.policy_model.unwrap() From a528921944df44046a02366c45f93aec1ed254fa Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 15 May 2025 18:30:32 +0800 Subject: [PATCH 02/34] move prompt-level-filtering to buffer side --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index fcf7b0740..e709c8aed 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -254,7 +254,6 @@ class GRPOConsumer(BaseConsumer): total_effective_tokens_count = all_reduce_sum(torch.sum(effective_tokens_count), self.plugin) total_samples = all_reduce_sum(torch.sum(torch.ones_like(loss_mask, device=loss_mask.device)), self.plugin) self.effective_sample_count += effective_samples.item() - self.total_sample_count += total_samples.item() pbar.set_postfix( { "Global Step": self.global_step, @@ -461,6 +460,9 @@ class GRPOConsumer(BaseConsumer): self.optimizer.step() self.optimizer.zero_grad() self.global_step += 1 + self.total_sample_count = all_reduce_sum( + torch.tensor(self.total_sample_count).to(self.accum_loss.device), self.plugin + ).item() sample_utilization = self.effective_sample_count / self.total_sample_count self.effective_prompt_count = 0 self.effective_sample_count = 0 @@ -564,6 +566,7 @@ class GRPOConsumer(BaseConsumer): "format_acc": torch.Tensor, [num_of_generation] "ans_acc": torch.Tensor, [num_of_generation] """ 
+ self.total_sample_count += rollout_group["input_ids"].size(0) if self.filter_range is not None: # filter prompt whoes accuracy is too high or too low (out of range) group_ans_acc = torch.mean(rollout_group["ans_acc"]) From 11a5854b50c67afebfbd6250db36f6e0940e471d Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 16 May 2025 14:08:23 +0800 Subject: [PATCH 03/34] remove redundant code and fix bugs --- .../coati/distributed/consumer.py | 23 ++++----- .../coati/distributed/grpo_consumer.py | 47 +++---------------- .../ColossalChat/coati/distributed/launch.py | 2 +- .../coati/distributed/producer.py | 2 +- applications/ColossalChat/rl_example.py | 12 +++-- 5 files changed, 27 insertions(+), 59 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 6385df2cf..531cf363c 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -121,14 +121,14 @@ class BaseConsumer: raw_batch = unbind_batch( ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") ) - filtered_batch = [ - t - for t in [ - self.prompt_level_filtering(self.calculate_group_reward(group)) - for group in raw_batch - ] - if t is not None + processed_batch = [ + self.prompt_level_filtering(self.calculate_group_reward(group)) for group in raw_batch ] + filtered_batch = [t for t in processed_batch if t is not None] + if self.filter_range is not None: + print( + f"[T{dist.get_rank()}] Filter recv data: {len(processed_batch)} -> {len(filtered_batch)}" + ) self.buffer.extend(filtered_batch) while len(self.buffer) >= self.dp_size * self.minibatch_size: @@ -137,13 +137,8 @@ class BaseConsumer: ] batch = bind_batch(batches) batch = post_recv(batch) - loss, excessive_prompts_idx = self.step(i, pbar, **batch) - - if excessive_prompts_idx is not None: - excessive_prompts = [self.buffer[idx] for idx in excessive_prompts_idx] - self.buffer = excessive_prompts + self.buffer[self.dp_size * self.minibatch_size :] - else: - self.buffer = self.buffer[self.dp_size * self.minibatch_size :] + loss = self.step(i, pbar, **batch) + self.buffer = self.buffer[self.dp_size * self.minibatch_size :] if loss is not None: allow_sync_model = True pbar.set_postfix({"loss": loss}) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index e709c8aed..f73f2d96f 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -9,7 +9,7 @@ from coati.distributed.loss import PolicyLoss from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn from coati.distributed.reward.verifiable_reward import VerifiableReward from coati.distributed.utils import calc_action_log_probs -from coati.trainer.utils import all_gather_tensors, all_reduce_mean, all_reduce_sum +from coati.trainer.utils import all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR @@ -201,10 +201,7 @@ class GRPOConsumer(BaseConsumer): reward_std = group_reward.std(dim=1).repeat_interleave(self.num_generations, dim=0) # [minibatch_size x num_generations] advantages = ((reward - reward_mean) / (reward_std + 1e-4)).unsqueeze(dim=-1) - # filter out the reward that is too high (all sample gets full score) or too low (all sample gets 0 
score), - group_ans_acc = ( - ans_acc.view(-1, self.num_generations).mean(dim=1).repeat_interleave(self.num_generations, dim=0) - ) + # [minibatch_size x num_of_generation] loss_mask = torch.ones(action_mask.size(0), device=action_mask.device).bool() @@ -214,37 +211,14 @@ class GRPOConsumer(BaseConsumer): loss_mask, action_mask[:, -1] == False, ) - prompt_level_mask = loss_mask.view(self.minibatch_size, self.num_generations) - - # [minibatch_size] -> calculate the number of effective prompts - effective_prompts_mask = prompt_level_mask.any(dim=1) - effective_prompts = all_reduce_sum(torch.sum(effective_prompts_mask), self.plugin) - self.effective_prompt_count += effective_prompts.item() - excessive_prompts_idx = None + self.effective_prompt_count += group_reward.size(0) * self.dp_size mean_kl, mean_loss = [], [] if self.grpo_config.get("dynamic_batching", True): need_update = self.effective_prompt_count >= self.batch_size * self.dp_size excessive_prompts = self.effective_prompt_count - self.batch_size * self.dp_size - - if excessive_prompts > 0: - excessive_prompts_per_rank = excessive_prompts // self.dp_size - # Only count excessive prompts if they are greater than 1 per rank. - # TODO: customize excessive prompts calculation. - if excessive_prompts_per_rank != 0: - # Mask excessive prompts to False - true_indices = torch.nonzero(effective_prompts_mask).squeeze() - if excessive_prompts_per_rank <= len(true_indices): - excessive_prompts_idx = true_indices[-excessive_prompts_per_rank:] - else: - excessive_prompts_idx = true_indices - effective_prompts_mask[excessive_prompts_idx] = False - - for mask_idx in range(len(effective_prompts_mask)): - if effective_prompts_mask[mask_idx] == False: - # Update loss mask. - loss_mask[mask_idx] = False + assert excessive_prompts <= 0, "Debug: Excessive prompts should always be less than 0. Bug!!!!" else: # If dynamic batching is disabled, we need to use all samples for training. need_update = (step_idx + 1) % self.num_microbatches == 0 @@ -460,9 +434,7 @@ class GRPOConsumer(BaseConsumer): self.optimizer.step() self.optimizer.zero_grad() self.global_step += 1 - self.total_sample_count = all_reduce_sum( - torch.tensor(self.total_sample_count).to(self.accum_loss.device), self.plugin - ).item() + # no need to run all_reduce_sum on total_sample_count, because all dp ranks recieves a complete inference batch from all producers. sample_utilization = self.effective_sample_count / self.total_sample_count self.effective_prompt_count = 0 self.effective_sample_count = 0 @@ -507,14 +479,9 @@ class GRPOConsumer(BaseConsumer): self.accum_advantages.zero_() self.accum_response_length.zero_() self.accum_count = 0 - - if excessive_prompts_idx is not None: - # All gather excessive prompts index across DP ranks. 
- excessive_prompts_idx = [idx + self.dp_rank * self.minibatch_size for idx in excessive_prompts_idx] - excessive_prompts_idx = all_gather_tensors(excessive_prompts_idx, self.plugin) - return loss_scalar, excessive_prompts_idx + return loss_scalar else: - return None, excessive_prompts_idx + return None def calculate_group_reward(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: """ diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 327db1b55..e6367d2d1 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -66,7 +66,7 @@ def launch_distributed( dataset_path = train_dataset_config["path"] num_samples = get_jsonl_size_fast(dataset_path) - global_inference_batch_size = inference_batch_size * num_producers # TODO: this doesn't support TP on producer + global_inference_batch_size = inference_batch_size * num_producers num_update_per_episode = num_samples // global_inference_batch_size num_recv_per_update = inference_batch_size // inference_microbatch_size diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 8b9452160..d796bff48 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -187,7 +187,7 @@ class BaseProducer: for eval_task_name in self.eval_dataloaders: if self.producer_idx == 0: print( - f"[P{self.producer_idx}] Evaluate episode {episode} step {self.consumer_global_step} on task {eval_task_name}" + f"[P{self.producer_idx}] Evaluate consumer step {self.consumer_global_step} on task {eval_task_name}" ) eval_results = [] eval_statistics_tensor = torch.zeros((2,), dtype=torch.float32).to(self.device) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 2ed9ef62c..9f89eb546 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -104,7 +104,13 @@ if __name__ == "__main__": choices=["think_answer_tags", "boxed"], help="Reward type for GRPO.", ) - parser.add_argument("-ei", "--eval-interval", type=int, default=100, help="Interval for evaluation.") + parser.add_argument( + "-ei", + "--eval-interval", + type=int, + default=100, + help="Interval for evaluation. 
Evaluate every ei consumer steps.", + ) # Logging/Checkpointing parameters parser.add_argument("-si", "--save-interval", type=int, default=100, help="Interval for saving checkpoints.") @@ -125,8 +131,8 @@ if __name__ == "__main__": and args.train_microbatch_size > 0 ), "Train micro batch size must be greater than 0 less than train mini batch size * num generations" assert ( - args.train_minibatch_size <= args.train_batch_size - ), "Train mini batch size must be less than or equals to train batch size" + args.train_minibatch_size <= args.train_batch_size and args.train_batch_size % args.train_minibatch_size == 0 + ), "Train mini batch size must be less than or equals to train batch size and train batch size must be divisible by train mini batch size" if args.master_address is None: # Default settings: Using single machine From 37663386bc5aec28e0aedee27ec65be0c9affff0 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 20 May 2025 18:14:05 +0800 Subject: [PATCH 04/34] fix metric calculation --- .../coati/distributed/consumer.py | 85 ++++++++++++++++--- .../coati/distributed/grpo_consumer.py | 50 ++++++----- .../ColossalChat/coati/distributed/launch.py | 7 +- applications/ColossalChat/rl_example.py | 53 +++++++++--- 4 files changed, 147 insertions(+), 48 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 8a2221b92..026056783 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -120,24 +120,85 @@ class BaseConsumer: raw_batch = unbind_batch( ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") ) - processed_batch = [ - self.prompt_level_filtering(self.calculate_group_reward(group)) for group in raw_batch - ] - filtered_batch = [t for t in processed_batch if t is not None] + recv_effective_count = 0 + # calculate group reward et al. filtering. 
As only the filtered group will be used for training (which is incomplete), + # we need to calculate the metrics before filtering here for logging + for group in raw_batch: + group_with_reward = self.calculate_group_reward(group) + group_reward_mean = group_with_reward["reward"].mean().cpu().item() + group_format_acc_mean = group_with_reward["format_acc"].mean().cpu().item() + group_ans_acc_mean = group_with_reward["ans_acc"].mean().cpu().item() + group_response_len = ( + ( + group_with_reward["response_idx"][:, 1] + - group_with_reward["response_idx"][:, 0] + + 1 + ) + .type(torch.float32) + .mean() + .cpu() + .item() + ) + filtered_group = self.prompt_level_filtering(group_with_reward) + recv_effective_count += 1 if filtered_group is not None else 0 + self.buffer.append( + [ + filtered_group, + group_reward_mean, + group_format_acc_mean, + group_ans_acc_mean, + group_response_len, + ] + ) if self.filter_range is not None: print( - f"[T{dist.get_rank()}] Filter recv data: {len(processed_batch)} -> {len(filtered_batch)}" + f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch)} -> {recv_effective_count}" ) + # mapping the effective group to the raw group for indexing + effective_group_to_raw_group_mapping = {} + for buffer_idx in range(len(self.buffer)): + if self.buffer[buffer_idx][0] is not None: + effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( + buffer_idx + ) - self.buffer.extend(filtered_batch) - while len(self.buffer) >= self.dp_size * self.minibatch_size: - batches = self.buffer[ - self.dp_rank * self.minibatch_size : (self.dp_rank + 1) * self.minibatch_size + while len(effective_group_to_raw_group_mapping) >= self.dp_size * self.minibatch_size: + # on each dp_rank, we use minibatch_size effective samples to form a batch + batches = [ + self.buffer[effective_group_to_raw_group_mapping[i]] + for i in range( + self.dp_rank * self.minibatch_size, (self.dp_rank + 1) * self.minibatch_size + ) ] - batch = bind_batch(batches) + # every dp_rank will receive a complete mini-batch, no need to sync within step() later + # each mini-batch use the first self.dp_size * minibatch_size effective samples + raw_mini_batches = self.buffer[ + : effective_group_to_raw_group_mapping[self.dp_size * self.minibatch_size - 1] + 1 + ] # include the last effective sample + raw_mini_batches_metric_dict = { + "raw_train_mini_batch_reward": [t[1] for t in raw_mini_batches], + "raw_train_mini_batch_format_acc": [t[2] for t in raw_mini_batches], + "raw_train_mini_batch_ans_acc": [t[3] for t in raw_mini_batches], + "raw_train_mini_batch_response_len": [t[4] for t in raw_mini_batches], + } + batch = bind_batch([t[0] for t in batches]) batch = post_recv(batch) - loss = self.step(i, pbar, **batch) - self.buffer = self.buffer[self.dp_size * self.minibatch_size :] + loss = self.step(i, pbar, **batch, **raw_mini_batches_metric_dict) + self.buffer = self.buffer[ + effective_group_to_raw_group_mapping[self.dp_size * self.minibatch_size - 1] + 1 : + ] + # recalculate the effective group to raw group mapping + effective_group_to_raw_group_mapping_size_before = len(effective_group_to_raw_group_mapping) + effective_group_to_raw_group_mapping = {} + for buffer_idx in range(len(self.buffer)): + if self.buffer[buffer_idx][0] is not None: + effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( + buffer_idx + ) + assert ( + len(effective_group_to_raw_group_mapping) + == effective_group_to_raw_group_mapping_size_before - self.dp_size * self.minibatch_size + ) if 
loss is not None: pbar.set_postfix({"loss": loss}) i += 1 diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index bc2bd45d1..369023977 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -72,12 +72,12 @@ class GRPOConsumer(BaseConsumer): self.policy_model.gradient_checkpointing_enable() self.optimizer = HybridAdam(self.policy_model.parameters(), lr=grpo_config.get("lr", 1e-6)) self.accum_loss = torch.zeros(1, device=self.device) - self.accum_reward = torch.zeros(1, device=self.device) self.accum_kl = torch.zeros(1, device=self.device) - self.accum_format_acc = torch.zeros(1, device=self.device) - self.accum_ans_acc = torch.zeros(1, device=self.device) self.accum_advantages = torch.zeros(1, device=self.device) - self.accum_response_length = torch.zeros(1, device=self.device) + self.raw_train_batch_reward = [] + self.raw_train_batch_format_acc = [] + self.raw_train_batch_ans_acc = [] + self.raw_train_batch_response_len = [] self.accum_count = 0 self.generate_config = generate_config self.grpo_config = grpo_config @@ -186,7 +186,11 @@ class GRPOConsumer(BaseConsumer): [minibatch_size, num_of_generation, prompt_length + response_length] --- ............. """ # Reshape to [minibatch_size x num_of_generation, prompt_length + response_length] - data = {k: v.view(-1, v.size(-1)) for k, v in kwargs.items()} + data = {k: v.view(-1, v.size(-1)) for k, v in kwargs.items() if "raw_train_mini_batch_" not in k} + self.raw_train_batch_reward.extend(kwargs["raw_train_mini_batch_reward"]) + self.raw_train_batch_format_acc.extend(kwargs["raw_train_mini_batch_format_acc"]) + self.raw_train_batch_ans_acc.extend(kwargs["raw_train_mini_batch_ans_acc"]) + self.raw_train_batch_response_len.extend(kwargs["raw_train_mini_batch_response_len"]) action_mask = data["action_mask"] num_action = action_mask.shape[1] old_action_log_probs = data["action_log_probs"] @@ -430,11 +434,7 @@ class GRPOConsumer(BaseConsumer): self.accum_loss.add_(sum(mean_loss) / len(mean_loss)) if self.policy_loss_fn.beta > 0: self.accum_kl.add_(sum(mean_kl) / len(mean_kl)) - self.accum_reward.add_(reward.data) - self.accum_format_acc.add_(format_acc.data) - self.accum_ans_acc.add_(ans_acc.data) self.accum_advantages.add_(advantages.data) - self.accum_response_length.add_(response_length.data) self.accum_count += 1 if need_update: self.optimizer.step() @@ -452,21 +452,33 @@ class GRPOConsumer(BaseConsumer): if (not self.plugin.pp_size > 1 and self.rank == 0) or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): + raw_batch_reward_mean = sum(self.raw_train_batch_reward) / len(self.raw_train_batch_reward) + raw_batch_format_acc_mean = sum(self.raw_train_batch_format_acc) / len( + self.raw_train_batch_format_acc + ) + raw_batch_ans_acc_mean = sum(self.raw_train_batch_ans_acc) / len(self.raw_train_batch_ans_acc) + raw_batch_response_len_mean = sum(self.raw_train_batch_response_len) / len( + self.raw_train_batch_response_len + ) + self.raw_train_batch_reward = [] + self.raw_train_batch_format_acc = [] + self.raw_train_batch_ans_acc = [] + self.raw_train_batch_response_len = [] to_log_msg = [ f"Loss: {self.accum_loss.item() / self.accum_count:.4f}", - f"Reward: {self.accum_reward.item() / self.accum_count:.4f}", - f"format Reward: {self.accum_format_acc.item() / self.accum_count:.4f}", - f"Acc Reward: 
{self.accum_ans_acc.item() / self.accum_count:.4f}", + f"Reward: {raw_batch_reward_mean:.4f}", + f"format Reward: {raw_batch_format_acc_mean:.4f}", + f"Acc Reward: {raw_batch_ans_acc_mean:.4f}", f"Advantages: {self.accum_advantages.item() / self.accum_count:.4f}", - f"Response Length: {self.accum_response_length.item() / self.accum_count:.4f}", + f"Response Length: {raw_batch_response_len_mean:.4f}", f"Sample_utilization: {sample_utilization:.4f}", ] + ([f"KL: {self.accum_kl.item() / self.accum_count:.4f}"] if self.policy_loss_fn.beta > 0 else []) print("\n".join(to_log_msg)) metrics = { - "metrics/reward": self.accum_reward.item() / self.accum_count, - "metrics/format_acc": self.accum_format_acc.item() / self.accum_count, - "metrics/ans_acc": self.accum_ans_acc.item() / self.accum_count, - "metrics/response_length": self.accum_response_length.item() / self.accum_count, + "metrics/reward": raw_batch_reward_mean, + "metrics/format_acc": raw_batch_format_acc_mean, + "metrics/ans_acc": raw_batch_ans_acc_mean, + "metrics/response_length": raw_batch_response_len_mean, "train/loss": self.accum_loss.item() / self.accum_count, "train/advantages": self.accum_advantages.item() / self.accum_count, "train/learning_rate": self.lr_scheduler.get_last_lr()[0], @@ -478,12 +490,8 @@ class GRPOConsumer(BaseConsumer): if self.wandb_run is not None: self.wandb_run.log(metrics) self.accum_loss.zero_() - self.accum_reward.zero_() - self.accum_ans_acc.zero_() - self.accum_format_acc.zero_() self.accum_kl.zero_() self.accum_advantages.zero_() - self.accum_response_length.zero_() self.accum_count = 0 return loss_scalar else: diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 6eeb5d379..220a47eb8 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -1,4 +1,5 @@ import copy +import os import uuid from typing import Any, Dict, Optional @@ -56,7 +57,7 @@ def launch_distributed( eval_save_dir: Optional[str] = None, eval_generation_config: Optional[Dict[str, Any]] = None, log_rollout_interval: int = 20, - rollout_log_file: str = "./rollout_log.jsonl", + rollout_save_dir: str = "./rollout", ): if core_algo not in ALGO_MAP: raise NotImplementedError(f"{core_algo} is not supported yet.") @@ -74,6 +75,10 @@ def launch_distributed( run_name = f"{inference_backend}_bs_{train_batch_size * train_dp_size}_temp_{generate_config['temperature']:.01f}_top_p_{generate_config['top_p']:.02f}" wandb_group_name = str(uuid.uuid4()) + rollout_log_file = os.path.join( + rollout_save_dir, + f"{project_name}_run_{wandb_group_name}.jsonl", + ) procs = [] for i in range(num_producers): diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index a97c2b210..53f166d66 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -121,6 +121,34 @@ if __name__ == "__main__": parser.add_argument( "-rsd", "--rollout-save-dir", type=str, default="./rollouts", help="Directory for saving rollout loggings." ) + parser.add_argument( + "-tp", + "--tensor-parallel-size", + type=int, + default=1, + help="Tensor parallel size for the inference backend. Please check the generation arguments documentation for your backend.", + ) + parser.add_argument( + "-pp", + "--pipeline-parallel-size", + type=int, + default=1, + help="Pipeline parallel size for the inference backend. 
Please check the generation arguments documentation for your backend.", + ) + parser.add_argument( + "-zero", + "--zero-stage", + type=int, + default=0, + help="Zero stage for the inference backend. Please check the generation arguments documentation for your backend.", + ) + parser.add_argument( + "-ptp", + "--produce-tensor-parallel-size", + type=int, + default=1, + help="Tensor parallel size for the producer. Please check the generation arguments documentation for your backend.", + ) args = parser.parse_args() if args.train_minibatch_size is None: @@ -178,7 +206,7 @@ if __name__ == "__main__": enforce_eager=True, enable_chunked_prefill=True, max_model_len=args.max_new_tokens + args.max_prompt_tokens, - tensor_parallel_size=1, + tensor_parallel_size=args.produce_tensor_parallel_size, ) ) generate_config.update( @@ -228,7 +256,7 @@ if __name__ == "__main__": launch_distributed( num_producers=args.num_inferencer, - num_proc_per_producer=inference_model_config.get("tensor_parallel_size", 1), + num_proc_per_producer=inference_model_config.get("tensor_parallel_size", args.produce_tensor_parallel_size), num_consumer_procs=args.num_trainers, num_episodes=args.num_episodes, inference_batch_size=args.inference_batch_size, @@ -247,17 +275,14 @@ if __name__ == "__main__": train_model_config=train_model_config, grpo_config=grpo_config, plugin_config={ - "zero_stage": 2, - }, # for zero - # plugin_config={ - # "tp_size": 2, - # "pp_size": 2, - # "microbatch_size": max( - # 1, args.train_microbatch_size // 2 - # ), # microbatch size should be set to train_microbatch_size // pp_size - # "zero_stage": 0, - # "max_norm": 1.0, - # }, # for pp, tp + "tp_size": args.tensor_parallel_size, + "pp_size": args.pipeline_parallel_size, + "microbatch_size": max( + 1, args.train_microbatch_size // args.pipeline_parallel_size + ), # microbatch size should be set to train_microbatch_size // pp_size + "zero_stage": args.zero_stage, + "max_norm": 1.0, + }, # for pp, tp inference_backend=args.backend, master_addr="localhost", master_port=args.master_port, @@ -273,5 +298,5 @@ if __name__ == "__main__": eval_save_dir=os.path.join(args.eval_save_dir, args.project.replace(" ", "_")), eval_generation_config=eval_generation_config, log_rollout_interval=20, - rollout_log_file=os.path.join(args.rollout_save_dir, args.project.replace(" ", "_") + ".jsonl"), + rollout_save_dir=args.rollout_save_dir, ) From 78a06f5ce37832bcc635ca7aa1c623b199a0b807 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 21 May 2025 10:51:32 +0800 Subject: [PATCH 05/34] fix missing tags parameter --- .../coati/distributed/grpo_consumer.py | 11 +--------- .../ColossalChat/coati/distributed/launch.py | 1 + .../coati/distributed/producer.py | 5 +++++ applications/ColossalChat/rl_example.py | 20 +++++++++++++++++++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 369023977..152689e06 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -120,16 +120,7 @@ class GRPOConsumer(BaseConsumer): "either max_tokens (vllm) or max_new_tokens (transformers) must be set in generate_config." ) # Initialize verifiable reward. 
- response_format_tags = ( - { - "think_start": {"text": "", "num_occur": 1}, - "think_end": {"text": "", "num_occur": 1}, - "answer_start": {"text": "", "num_occur": 1}, - "answer_end": {"text": "", "num_occur": 1}, - } - if grpo_config.get("reward_fn_type") == "think_answer_tags" - else None - ) + response_format_tags = grpo_config.get("response_format_tags", None) reward_model_kwargs = { k: v for k, v in grpo_config.items() diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index c9e8d2ab2..764325cdd 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -100,6 +100,7 @@ def launch_distributed( eval_dataset_config=eval_dataset_config, eval_interval=eval_interval, evaluation_function_type=grpo_config["reward_fn_type"], + response_format_tags=grpo_config["response_format_tags"], eval_save_dir=eval_save_dir, eval_generation_config=eval_generation_config, project_name=project_name, diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 75879278c..916ae3326 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -46,6 +46,7 @@ class BaseProducer: eval_dataset_config=None, eval_interval=-1, # disable evaluation evaluation_function_type="think_answer_tags", + response_format_tags=None, eval_save_dir: str = "./eval", project_name: str = None, run_name: str = None, @@ -148,6 +149,7 @@ class BaseProducer: self.evaluation_function = boxed_math_reward_fn else: raise ValueError(f"Unknown evaluation function type {evaluation_function_type}") + self.response_format_tags = response_format_tags else: raise ValueError("eval_dataset_config is not defined") self.device = get_current_device() @@ -217,6 +219,7 @@ class BaseProducer: eval_outputs["response_idx"][m][n], tokenizer=self.tokenizer, eval_mode=True, + tags=self.response_format_tags, ) for m in range(eval_outputs["input_ids"].size(0)) for n in range(eval_outputs["input_ids"].size(1)) @@ -324,6 +327,7 @@ class SimpleProducer(BaseProducer): eval_dataset_config=None, eval_interval=-1, # disable evaluation evaluation_function_type="think_answer_tags", + response_format_tags=None, eval_save_dir: str = "./eval", eval_generation_config={}, project_name: str = None, @@ -349,6 +353,7 @@ class SimpleProducer(BaseProducer): eval_dataset_config=eval_dataset_config, eval_interval=eval_interval, evaluation_function_type=evaluation_function_type, + response_format_tags=response_format_tags, eval_save_dir=eval_save_dir, project_name=project_name, run_name=run_name, diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 53f166d66..e1025442c 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -231,6 +231,16 @@ if __name__ == "__main__": "reward_fn_type": args.reward_type, "max_length": args.max_new_tokens + args.max_prompt_tokens, "max_new_tokens": args.max_new_tokens, + "response_format_tags": ( + { + "think_start": {"text": "", "num_occur": 1}, + "think_end": {"text": "", "num_occur": 1}, + "answer_start": {"text": "", "num_occur": 1}, + "answer_end": {"text": "", "num_occur": 1}, + } + if args.reward_type == "think_answer_tags" + else None + ), } elif args.algo == "DAPO": # DAPO variant settings @@ -250,6 +260,16 @@ if __name__ == "__main__": "cache_length": min(1024, 
int(args.max_new_tokens / 4)), "filter_truncated_response": True, "reward_fn_type": args.reward_type, + "response_format_tags": ( + { + "think_start": {"text": "", "num_occur": 1}, + "think_end": {"text": "", "num_occur": 1}, + "answer_start": {"text": "", "num_occur": 1}, + "answer_end": {"text": "", "num_occur": 1}, + } + if args.reward_type == "think_answer_tags" + else None + ), } else: raise ValueError(f"Unsupported algorithm: {args.algo}") From 4c3656870ae25c959e310c559958c4a21d3f75cd Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 28 May 2025 17:34:11 +0800 Subject: [PATCH 06/34] address conversation --- .../coati/distributed/consumer.py | 11 ++++---- .../coati/distributed/grpo_consumer.py | 26 +++++++++---------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 026056783..6f8f1b497 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -117,14 +117,12 @@ class BaseConsumer: # receive data from producers for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") - raw_batch = unbind_batch( - ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") - ) + raw_batch = ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") recv_effective_count = 0 # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), # we need to calculate the metrics before filtering here for logging - for group in raw_batch: - group_with_reward = self.calculate_group_reward(group) + raw_batch_with_reward = unbind_batch(self.calculate_reward(raw_batch)) + for group_with_reward in raw_batch_with_reward: group_reward_mean = group_with_reward["reward"].mean().cpu().item() group_format_acc_mean = group_with_reward["format_acc"].mean().cpu().item() group_ans_acc_mean = group_with_reward["ans_acc"].mean().cpu().item() @@ -139,7 +137,8 @@ class BaseConsumer: .cpu() .item() ) - filtered_group = self.prompt_level_filtering(group_with_reward) + if self.grpo_config.get("dynamic_batching", True): + filtered_group = self.prompt_level_filtering(group_with_reward) recv_effective_count += 1 if filtered_group is not None else 0 self.buffer.append( [ diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 152689e06..891a6f842 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -218,8 +218,6 @@ class GRPOConsumer(BaseConsumer): if self.grpo_config.get("dynamic_batching", True): need_update = self.effective_prompt_count >= self.batch_size * self.dp_size - excessive_prompts = self.effective_prompt_count - self.batch_size * self.dp_size - assert excessive_prompts <= 0, "Debug: Excessive prompts should always be less than 0. Bug!!!!" else: # If dynamic batching is disabled, we need to use all samples for training. need_update = (step_idx + 1) % self.num_microbatches == 0 @@ -488,7 +486,7 @@ class GRPOConsumer(BaseConsumer): else: return None - def calculate_group_reward(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: + def calculate_reward(self, rollout: Dict[str, Any]) -> Dict[str, Any]: """ Calculate the group reward for the given rollout group. 
@@ -507,20 +505,20 @@ class GRPOConsumer(BaseConsumer): Returns: Dict[str, Any]: The new group data with calculated reward. """ - reward_group = self.reward_model( - rollout_group["input_ids"], - gt_answer=rollout_group["gt_answer"], - response_idx=rollout_group["response_idx"], + reward_model_output = self.reward_model( + rollout["input_ids"], + gt_answer=rollout["gt_answer"], + response_idx=rollout["response_idx"], ) # [num_of_generation] - reward = torch.tensor([value[0] for value in reward_group]).to(rollout_group["input_ids"].device) - format_acc = torch.tensor([value[1] for value in reward_group]).to(rollout_group["input_ids"].device) - ans_acc = torch.tensor([value[2] for value in reward_group]).to(rollout_group["input_ids"].device) + reward = torch.tensor([value[0] for value in reward_model_output]).to(rollout["input_ids"].device) + format_acc = torch.tensor([value[1] for value in reward_model_output]).to(rollout["input_ids"].device) + ans_acc = torch.tensor([value[2] for value in reward_model_output]).to(rollout["input_ids"].device) - rollout_group["reward"] = reward.view((-1, 1)) - rollout_group["format_acc"] = format_acc.view((-1, 1)) - rollout_group["ans_acc"] = ans_acc.view((-1, 1)) - return rollout_group + rollout["reward"] = reward.view((-1, 1)) + rollout["format_acc"] = format_acc.view((-1, 1)) + rollout["ans_acc"] = ans_acc.view((-1, 1)) + return rollout def prompt_level_filtering(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: """ From c8b368c294a76fd06b45b5f683cde466e03f987c Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 28 May 2025 19:18:09 +0800 Subject: [PATCH 07/34] add overlength sample count (#6332) Co-authored-by: Tong Li --- .../coati/distributed/grpo_consumer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index eaf3521b6..1666ac582 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -85,6 +85,8 @@ class GRPOConsumer(BaseConsumer): self.effective_sample_count = 0 self.effective_prompt_count = 0 self.total_sample_count = 0 + self.overlength_samples = 0 + self.total_overlength_samples = 0 self.project_name = project_name self.run_name = run_name self.wandb_group_name = wandb_group_name @@ -227,10 +229,18 @@ class GRPOConsumer(BaseConsumer): # filter out overlength samples if self.filter_truncated_response and action_mask.size(1) == self.max_length: + old_loss_mask = loss_mask.clone() loss_mask = torch.logical_and( loss_mask, action_mask[:, -1] == False, ) + + self.overlength_samples = (old_loss_mask & ~loss_mask).sum().item() + self.overlength_samples = all_reduce_sum( + torch.tensor(self.overlength_samples, device=loss_mask.device), self.plugin + ) + self.total_overlength_samples += self.overlength_samples.item() + prompt_level_mask = loss_mask.view(self.minibatch_size, self.num_generations) # [minibatch_size] -> calculate the number of effective prompts @@ -484,9 +494,11 @@ class GRPOConsumer(BaseConsumer): self.optimizer.zero_grad() self.global_step += 1 sample_utilization = self.effective_sample_count / self.total_sample_count + overlength_samples_percentage = self.total_overlength_samples / self.total_sample_count self.effective_prompt_count = 0 self.effective_sample_count = 0 self.total_sample_count = 0 + self.total_overlength_samples = 0 loss_scalar = self.accum_loss.item() if not self.plugin.pp_size > 1 or ( 
self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 @@ -502,6 +514,7 @@ class GRPOConsumer(BaseConsumer): f"Advantages: {self.accum_advantages.item() / self.accum_count:.4f}", f"Response Length: {self.accum_response_length.item() / self.accum_count:.4f}", f"Sample_utilization: {sample_utilization:.4f}", + f"Percentage of overlength samples: {overlength_samples_percentage:.4f}", ] + ([f"KL: {self.accum_kl.item() / self.accum_count:.4f}"] if self.policy_loss_fn.beta > 0 else []) print("\n".join(to_log_msg)) metrics = { @@ -513,6 +526,7 @@ class GRPOConsumer(BaseConsumer): "train/advantages": self.accum_advantages.item() / self.accum_count, "train/learning_rate": self.lr_scheduler.get_last_lr()[0], "train/sample_utilization": sample_utilization, + "train/percentage_overlength_samples": overlength_samples_percentage, "rollout/temperature": data["temperature"].cpu().numpy()[0][0], } if self.policy_loss_fn.beta > 0: From ee939d9aa506b4c5266fd2136e23bc6503b1d5b7 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 29 May 2025 10:25:59 +0800 Subject: [PATCH 08/34] address conversation --- .../coati/distributed/consumer.py | 67 ++++++++++--------- .../coati/distributed/grpo_consumer.py | 34 +--------- .../coati/distributed/producer.py | 3 +- 3 files changed, 36 insertions(+), 68 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 6f8f1b497..593e0f4ec 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -118,48 +118,49 @@ class BaseConsumer: for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") raw_batch = ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") - recv_effective_count = 0 # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), # we need to calculate the metrics before filtering here for logging - raw_batch_with_reward = unbind_batch(self.calculate_reward(raw_batch)) - for group_with_reward in raw_batch_with_reward: - group_reward_mean = group_with_reward["reward"].mean().cpu().item() - group_format_acc_mean = group_with_reward["format_acc"].mean().cpu().item() - group_ans_acc_mean = group_with_reward["ans_acc"].mean().cpu().item() - group_response_len = ( - ( - group_with_reward["response_idx"][:, 1] - - group_with_reward["response_idx"][:, 0] - + 1 - ) - .type(torch.float32) - .mean() - .cpu() - .item() + # [batch_size, num_generations, ...] -> [batch_size * num_generations, ...] 
+ raw_batch_with_reward = self.calculate_reward({k:v.view(-1, v.size(-1)) if k!='temperature' else v for k, v in raw_batch.items()}) + raw_batch_with_reward = {k: v.view(-1, self.num_generations, v.size(-1)) if k!='temperature' else v for k, v in raw_batch_with_reward.items()} + # [batch_size, num_generations] -> [batch_size] + group_reward_mean = raw_batch_with_reward["reward"][:,:,0].mean(dim=-1) + group_format_acc_mean = raw_batch_with_reward["format_acc"][:,:,0].mean(dim=-1) + group_ans_acc_mean = raw_batch_with_reward["ans_acc"][:,:,0].mean(dim=-1) + group_response_len = ( + (raw_batch_with_reward["response_idx"][:, :, 1] - raw_batch_with_reward["response_idx"][:, :, 0] + 1) + .type(torch.float32) + .mean(dim=-1) + ) + effective_group_mask = None + if self.filter_range is not None and self.grpo_config.get("dynamic_batching", True): + # filter the group based on the reward and accuracy + effective_group_mask = torch.logical_and( + group_ans_acc_mean > self.filter_range[0], group_ans_acc_mean < self.filter_range[1] ) - if self.grpo_config.get("dynamic_batching", True): - filtered_group = self.prompt_level_filtering(group_with_reward) - recv_effective_count += 1 if filtered_group is not None else 0 + raw_batch_with_reward = unbind_batch(raw_batch_with_reward) # List[Dict[str, torch.Tensor]] + for group_idx, group_with_reward in enumerate(raw_batch_with_reward): self.buffer.append( [ - filtered_group, - group_reward_mean, - group_format_acc_mean, - group_ans_acc_mean, - group_response_len, + group_with_reward if effective_group_mask is None or effective_group_mask[group_idx] else None, + group_reward_mean[group_idx], + group_format_acc_mean[group_idx], + group_ans_acc_mean[group_idx], + group_response_len[group_idx], ] ) - if self.filter_range is not None: + if effective_group_mask is not None: print( - f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch)} -> {recv_effective_count}" + f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch)} -> {torch.sum(effective_group_mask).cpu().item()} effective groups" ) - # mapping the effective group to the raw group for indexing - effective_group_to_raw_group_mapping = {} - for buffer_idx in range(len(self.buffer)): - if self.buffer[buffer_idx][0] is not None: - effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( - buffer_idx - ) + # mapping the effective group to the raw group for indexing + effective_group_to_raw_group_mapping = {} + for buffer_idx in range(len(self.buffer)): + if self.buffer[buffer_idx][0] is not None: + effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( + buffer_idx + ) + pbar.set_postfix({"Collect Effective Prompt": f"{len(effective_group_to_raw_group_mapping)}/{self.dp_size * self.minibatch_size}"}) while len(effective_group_to_raw_group_mapping) >= self.dp_size * self.minibatch_size: # on each dp_rank, we use minibatch_size effective samples to form a batch diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 891a6f842..50702683f 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -84,7 +84,6 @@ class GRPOConsumer(BaseConsumer): self.project_name = project_name self.effective_sample_count = 0 self.effective_prompt_count = 0 - self.total_sample_count = 0 self.project_name = project_name self.run_name = run_name self.wandb_group_name = wandb_group_name @@ -429,11 +428,9 @@ class 
GRPOConsumer(BaseConsumer): self.optimizer.step() self.optimizer.zero_grad() self.global_step += 1 - # no need to run all_reduce_sum on total_sample_count, because all dp ranks recieves a complete inference batch from all producers. - sample_utilization = self.effective_sample_count / self.total_sample_count + sample_utilization = self.effective_sample_count / len(self.raw_train_batch_reward) / self.num_generations self.effective_prompt_count = 0 self.effective_sample_count = 0 - self.total_sample_count = 0 loss_scalar = self.accum_loss.item() if not self.plugin.pp_size > 1 or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 @@ -520,35 +517,6 @@ class GRPOConsumer(BaseConsumer): rollout["ans_acc"] = ans_acc.view((-1, 1)) return rollout - def prompt_level_filtering(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: - """ - rollout_group: Dict[str, Any] - a group of samples generated by the model from the same prompt - contain the following keys: - "input_ids": torch.Tensor, [num_of_generation, prompt_length + response_length] - "attention_mask": torch.Tensor, [num_of_generation, prompt_length + response_length] - "action_mask": torch.Tensor, [num_of_generation, response_length] - "action_log_probs": torch.Tensor, [num_of_generation, response_length] - "response_idx": int, torch.Tensor, [num_of_generation, 2] - "gt_answer": torch.Tensor, [num_of_generation, 128] - "temperature": torch.Tensor, [] (scalar) - "reward": torch.Tensor, [num_of_generation] - "format_acc": torch.Tensor, [num_of_generation] - "ans_acc": torch.Tensor, [num_of_generation] - """ - self.total_sample_count += rollout_group["input_ids"].size(0) - if self.filter_range is not None: - # filter prompt whoes accuracy is too high or too low (out of range) - group_ans_acc = torch.mean(rollout_group["ans_acc"]) - if group_ans_acc < self.filter_range[0] or group_ans_acc > self.filter_range[1]: - # filter out the prompt - return None - else: - return rollout_group - else: - # no filter - return rollout_group - def state_dict(self): self.policy_model._force_wait_all_gather() model = self.policy_model.unwrap() diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index f5bdc6835..623ed7ab9 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -248,11 +248,10 @@ class BaseProducer: self.eval_mode = False self.latest_eval_step = self.consumer_global_step outputs = self.rollout(**batch) - - print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs["temperature"] = torch.tensor( [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) + print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs = pre_send(outputs) ray_broadcast_tensor_dict( outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" From 0d008110e7a4fe2eb5d90cfd8fed6b9e2f294e00 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 May 2025 10:16:55 +0000 Subject: [PATCH 09/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../coati/distributed/consumer.py | 40 +++++++++++++------ .../coati/distributed/grpo_consumer.py | 10 +++-- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git 
a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 690a10608..22de6911e 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -117,20 +117,28 @@ class BaseConsumer: # receive data from producers for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") - raw_batch = ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") + raw_batch = ray_broadcast_tensor_dict( + None, src=0, device=self.device, group_name=f"sync_data_{r}" + ) # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), # we need to calculate the metrics before filtering here for logging # [batch_size, num_generations, ...] -> [batch_size * num_generations, ...] - raw_batch_with_reward = self.calculate_reward({k:v.view(-1, v.size(-1)) if k!='temperature' else v for k, v in raw_batch.items()}) - raw_batch_with_reward = {k: v.view(-1, self.num_generations, v.size(-1)) if k!='temperature' else v for k, v in raw_batch_with_reward.items()} - # [batch_size, num_generations] -> [batch_size] - reward = raw_batch_with_reward["reward"][:,:,0] - format_acc = raw_batch_with_reward["format_acc"][:,:,0] - ans_acc = raw_batch_with_reward["ans_acc"][:,:,0] - response_len = ( - (raw_batch_with_reward["response_idx"][:, :, 1] - raw_batch_with_reward["response_idx"][:, :, 0] + 1) - .type(torch.float32) + raw_batch_with_reward = self.calculate_reward( + {k: v.view(-1, v.size(-1)) if k != "temperature" else v for k, v in raw_batch.items()} ) + raw_batch_with_reward = { + k: v.view(-1, self.num_generations, v.size(-1)) if k != "temperature" else v + for k, v in raw_batch_with_reward.items() + } + # [batch_size, num_generations] -> [batch_size] + reward = raw_batch_with_reward["reward"][:, :, 0] + format_acc = raw_batch_with_reward["format_acc"][:, :, 0] + ans_acc = raw_batch_with_reward["ans_acc"][:, :, 0] + response_len = ( + raw_batch_with_reward["response_idx"][:, :, 1] + - raw_batch_with_reward["response_idx"][:, :, 0] + + 1 + ).type(torch.float32) effective_group_mask = None if self.filter_range is not None and self.grpo_config.get("dynamic_batching", True): # filter the group based on the reward and accuracy @@ -138,11 +146,15 @@ class BaseConsumer: effective_group_mask = torch.logical_and( group_ans_acc_mean > self.filter_range[0], group_ans_acc_mean < self.filter_range[1] ) - raw_batch_with_reward = unbind_batch(raw_batch_with_reward) # List[Dict[str, torch.Tensor]] + raw_batch_with_reward = unbind_batch(raw_batch_with_reward) # List[Dict[str, torch.Tensor]] for group_idx, group_with_reward in enumerate(raw_batch_with_reward): self.buffer.append( [ - group_with_reward if effective_group_mask is None or effective_group_mask[group_idx] else None, + ( + group_with_reward + if effective_group_mask is None or effective_group_mask[group_idx] + else None + ), reward[group_idx], format_acc[group_idx], ans_acc[group_idx], @@ -160,7 +172,9 @@ class BaseConsumer: effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( buffer_idx ) - print(f"[T{dist.get_rank()}] Collect Effective Prompt: {len(effective_group_to_raw_group_mapping)}/{self.dp_size * self.minibatch_size}") + print( + f"[T{dist.get_rank()}] Collect Effective Prompt: {len(effective_group_to_raw_group_mapping)}/{self.dp_size * self.minibatch_size}" + ) while 
len(effective_group_to_raw_group_mapping) >= self.dp_size * self.minibatch_size: # on each dp_rank, we use minibatch_size effective samples to form a batch diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 451947b44..89fae6fc7 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -211,10 +211,12 @@ class GRPOConsumer(BaseConsumer): loss_mask, action_mask[:, -1] == False, ) - if self.filter_range is not None and self.grpo_config.get("dynamic_batching", False)==False: + if self.filter_range is not None and self.grpo_config.get("dynamic_batching", False) == False: # filter out samples with reward outside the range # if dynamic batching is enabled, we filter out out of range groups before training - group_ans_acc_mean = ans_acc.view(-1, self.num_generations).mean(dim=1).repeat_interleave(self.num_generations, dim=-1) + group_ans_acc_mean = ( + ans_acc.view(-1, self.num_generations).mean(dim=1).repeat_interleave(self.num_generations, dim=-1) + ) loss_mask = torch.logical_and( loss_mask, torch.logical_and( @@ -454,7 +456,9 @@ class GRPOConsumer(BaseConsumer): raw_batch_ans_acc_mean = torch.cat(self.raw_train_batch_ans_acc, dim=0).mean().cpu().item() raw_batch_response_len = torch.cat(self.raw_train_batch_response_len, dim=0) raw_batch_response_len_mean = raw_batch_response_len.mean().cpu().item() - overlength_samples_ratio = (raw_batch_response_len >= action_mask.size(-1)).to(float).mean().cpu().item() # not an exact figure, but a close estimate + overlength_samples_ratio = ( + (raw_batch_response_len >= action_mask.size(-1)).to(float).mean().cpu().item() + ) # not an exact figure, but a close estimate self.raw_train_batch_reward = [] self.raw_train_batch_format_acc = [] self.raw_train_batch_ans_acc = [] From 96faf5454245d001d06ef538a4fd033d87ca62de Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 5 Jun 2025 15:41:14 +0800 Subject: [PATCH 10/34] fix typ and parameter description --- applications/ColossalChat/rl_example.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index eed0af362..b4f565617 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -126,25 +126,25 @@ if __name__ == "__main__": "--tensor-parallel-size", type=int, default=1, - help="Tensor parallel size for the inference backend. Please check the generation arguments documentation for your backend.", + help="Tensor parallel size for the trainer (consumer). Please check the generation arguments documentation for your backend.", ) parser.add_argument( "-pp", "--pipeline-parallel-size", type=int, default=1, - help="Pipeline parallel size for the inference backend. Please check the generation arguments documentation for your backend.", + help="Pipeline parallel size for the trainer (consumer). Please check the generation arguments documentation for your backend.", ) parser.add_argument( "-zero", "--zero-stage", type=int, default=0, - help="Zero stage for the inference backend. Please check the generation arguments documentation for your backend.", + help="Zero stage for the trainer (consumer). 
Please check the generation arguments documentation for your backend.", ) parser.add_argument( "-ptp", - "--produce-tensor-parallel-size", + "--producer-tensor-parallel-size", type=int, default=1, help="Tensor parallel size for the producer. Please check the generation arguments documentation for your backend.", @@ -206,7 +206,7 @@ if __name__ == "__main__": enforce_eager=True, enable_chunked_prefill=True, max_model_len=args.max_new_tokens + args.max_prompt_tokens, - tensor_parallel_size=args.produce_tensor_parallel_size, + tensor_parallel_size=args.producer_tensor_parallel_size, ) ) generate_config.update( @@ -276,7 +276,7 @@ if __name__ == "__main__": launch_distributed( num_producers=args.num_inferencer, - num_proc_per_producer=inference_model_config.get("tensor_parallel_size", args.produce_tensor_parallel_size), + num_proc_per_producer=inference_model_config.get("tensor_parallel_size", args.producer_tensor_parallel_size), num_consumer_procs=args.num_trainers, num_episodes=args.num_episodes, inference_batch_size=args.inference_batch_size, From dc3033e68a927dcace02e951deda3b9882c89362 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 5 Jun 2025 17:56:42 +0800 Subject: [PATCH 11/34] support code generation tasks --- .../ColossalChat/coati/dataset/loader.py | 35 +- .../coati/distributed/consumer.py | 21 +- .../coati/distributed/grpo_consumer.py | 51 +- .../coati/distributed/inference_backend.py | 18 +- .../ColossalChat/coati/distributed/launch.py | 5 +- .../coati/distributed/producer.py | 91 ++- .../reward/code_reward/testing_util.py | 669 ++++++++++++++++++ .../distributed/reward/code_reward/utils.py | 61 ++ .../coati/distributed/reward/reward_fn.py | 118 ++- .../distributed/reward/verifiable_reward.py | 54 +- applications/ColossalChat/requirements.txt | 3 + applications/ColossalChat/rl_example.py | 28 +- 12 files changed, 1027 insertions(+), 127 deletions(-) create mode 100644 applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py create mode 100644 applications/ColossalChat/coati/distributed/reward/code_reward/utils.py diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py index 43cf78383..16fd385ba 100755 --- a/applications/ColossalChat/coati/dataset/loader.py +++ b/applications/ColossalChat/coati/dataset/loader.py @@ -367,9 +367,9 @@ def apply_chat_template_and_mask( } # Format for RL. 
- gt_answer = None - if "messages" in chat and "gt_answer" in chat: - gt_answer = chat["gt_answer"] + if "messages" in chat: + gt_answer = chat.get("gt_answer", None) + test_cases = chat.get("test_cases", None) chat = [chat["messages"]] tokens = [] @@ -402,12 +402,14 @@ def apply_chat_template_and_mask( labels[~torch.tensor(assistant_mask, dtype=torch.bool)] = ignore_idx if gt_answer is not None: - gt_answer = tokenizer.encode( - gt_answer, padding="max_length", truncation=True, max_length=128, return_tensors="pt" - ) - gt_answer = gt_answer.squeeze(1) return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "gt_answer": gt_answer} - + elif test_cases is not None: + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + "test_cases": test_cases, + } return { "input_ids": input_ids, "attention_mask": attention_mask, @@ -440,3 +442,20 @@ class RawConversationDataset(Dataset): tokens = apply_chat_template_and_mask(self.tokenizer, message, self.max_length, self.system_prompt) self.tokenized_texts[index] = dict(tokens) return self.tokenized_texts[index] + + +def collate_fn_grpo(batch): + input_ids = [item["input_ids"] for item in batch] + attention_mask = [item["attention_mask"] for item in batch] + labels = [item["labels"] for item in batch] + # Assume input_ids, attention_mask, labels are already of the same length, + # otherwise use pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id) + input_ids = torch.stack(input_ids) + attention_mask = torch.stack(attention_mask) + labels = torch.stack(labels) + ret = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} + if "test_cases" in batch[0]: + ret["test_cases"] = [item["test_cases"] for item in batch] + if "gt_answer" in batch[0]: + ret["gt_answer"] = [item["gt_answer"] for item in batch] + return ret diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 22de6911e..3ae4a1796 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -123,20 +123,17 @@ class BaseConsumer: # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), # we need to calculate the metrics before filtering here for logging # [batch_size, num_generations, ...] -> [batch_size * num_generations, ...] 
- raw_batch_with_reward = self.calculate_reward( - {k: v.view(-1, v.size(-1)) if k != "temperature" else v for k, v in raw_batch.items()} - ) - raw_batch_with_reward = { + raw_batch = { k: v.view(-1, self.num_generations, v.size(-1)) if k != "temperature" else v - for k, v in raw_batch_with_reward.items() + for k, v in raw_batch.items() } # [batch_size, num_generations] -> [batch_size] - reward = raw_batch_with_reward["reward"][:, :, 0] - format_acc = raw_batch_with_reward["format_acc"][:, :, 0] - ans_acc = raw_batch_with_reward["ans_acc"][:, :, 0] + reward = raw_batch["reward"][:, :, 0] + format_acc = raw_batch["format_acc"][:, :, 0] + ans_acc = raw_batch["ans_acc"][:, :, 0] response_len = ( - raw_batch_with_reward["response_idx"][:, :, 1] - - raw_batch_with_reward["response_idx"][:, :, 0] + raw_batch["response_idx"][:, :, 1] + - raw_batch["response_idx"][:, :, 0] + 1 ).type(torch.float32) effective_group_mask = None @@ -146,8 +143,8 @@ class BaseConsumer: effective_group_mask = torch.logical_and( group_ans_acc_mean > self.filter_range[0], group_ans_acc_mean < self.filter_range[1] ) - raw_batch_with_reward = unbind_batch(raw_batch_with_reward) # List[Dict[str, torch.Tensor]] - for group_idx, group_with_reward in enumerate(raw_batch_with_reward): + raw_batch = unbind_batch(raw_batch) # List[Dict[str, torch.Tensor]] + for group_idx, group_with_reward in enumerate(raw_batch): self.buffer.append( [ ( diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 89fae6fc7..2e4d8f08c 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -6,8 +6,6 @@ import torch import wandb from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss -from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn -from coati.distributed.reward.verifiable_reward import VerifiableReward from coati.distributed.utils import calc_action_log_probs from coati.trainer.utils import all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer @@ -119,20 +117,7 @@ class GRPOConsumer(BaseConsumer): "either max_tokens (vllm) or max_new_tokens (transformers) must be set in generate_config." ) # Initialize verifiable reward. - response_format_tags = grpo_config.get("response_format_tags", None) - reward_model_kwargs = { - k: v - for k, v in grpo_config.items() - if k in ["soft_over_length_punishment", "max_new_tokens", "cache_length"] - } - self.reward_model = VerifiableReward( - reward_fns=[ - math_reward_fn if grpo_config.get("reward_fn_type") == "think_answer_tags" else boxed_math_reward_fn - ], - tokenizer=self.tokenizer, - tags=response_format_tags, - **reward_model_kwargs, - ) + grpo_config.get("response_format_tags", None) self.global_step = 0 self.lr_scheduler = CosineAnnealingWarmupLR( @@ -498,40 +483,6 @@ class GRPOConsumer(BaseConsumer): else: return None - def calculate_reward(self, rollout: Dict[str, Any]) -> Dict[str, Any]: - """ - Calculate the group reward for the given rollout group. 
- - Args: - rollout_group (Dict[str, Any]): - a group of samples generated by the model from the same prompt - contain the following keys: - "input_ids": torch.Tensor, [num_of_generation, prompt_length + response_length] - "attention_mask": torch.Tensor, [num_of_generation, prompt_length + response_length] - "action_mask": torch.Tensor, [num_of_generation, response_length] - "action_log_probs": torch.Tensor, [num_of_generation, response_length] - "response_idx": int, torch.Tensor, [num_of_generation, 2] - "gt_answer": torch.Tensor, [num_of_generation, 128] - "temperature": torch.Tensor, [] (scalar) - - Returns: - Dict[str, Any]: The new group data with calculated reward. - """ - reward_model_output = self.reward_model( - rollout["input_ids"], - gt_answer=rollout["gt_answer"], - response_idx=rollout["response_idx"], - ) - # [num_of_generation] - reward = torch.tensor([value[0] for value in reward_model_output]).to(rollout["input_ids"].device) - format_acc = torch.tensor([value[1] for value in reward_model_output]).to(rollout["input_ids"].device) - ans_acc = torch.tensor([value[2] for value in reward_model_output]).to(rollout["input_ids"].device) - - rollout["reward"] = reward.view((-1, 1)) - rollout["format_acc"] = format_acc.view((-1, 1)) - rollout["ans_acc"] = ans_acc.view((-1, 1)) - return rollout - def state_dict(self): self.policy_model._force_wait_all_gather() model = self.policy_model.unwrap() diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index ce2a2f28c..34827e4e2 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -74,9 +74,8 @@ class TransformersInferenceBackend(BaseInferenceBackend): micro_batch_size = input_ids.size(0) input_ids = input_ids.to(get_current_device()) attention_mask = attention_mask.to(get_current_device()) - gt_answer = None - if "gt_answer" in kwargs: - gt_answer = kwargs.pop("gt_answer") + gt_answer = kwargs.pop("gt_answer", None) + test_cases = kwargs.pop("test_cases", None) if self.num_generations > 1: input_ids = input_ids.repeat_interleave(self.num_generations, dim=0) attention_mask = attention_mask.repeat_interleave(self.num_generations, dim=0) @@ -116,8 +115,9 @@ class TransformersInferenceBackend(BaseInferenceBackend): data = {k: v.view(micro_batch_size, self.num_generations, v.size(-1)) for k, v in data.items()} if gt_answer is not None: - # repeat gt_answer for each prompt. - data["gt_answer"] = gt_answer.repeat_interleave(self.num_generations, dim=1) + data["gt_answer"] = gt_answer + if test_cases is not None: + data["test_cases"] = test_cases data = {k: v.to(get_current_device()) for k, v in data.items()} return data @@ -269,11 +269,11 @@ class VLLMInferenceBackend(BaseInferenceBackend): } data = {k: v.view(micro_batch_size, -1, v.size(-1)) for k, v in data.items()} - - if "gt_answer" in kwargs: - # repeat gt_answer for each prompt. 
- data["gt_answer"] = kwargs["gt_answer"].repeat_interleave(data["input_ids"].size(1), dim=1) data = {k: v.to(get_current_device()) for k, v in data.items()} + if "gt_answer" in kwargs: + data["gt_answer"] = kwargs["gt_answer"] + if "test_cases" in kwargs: + data["test_cases"] = kwargs["test_cases"] return data def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 764325cdd..41ba8ea55 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -37,7 +37,6 @@ def launch_distributed( train_batch_size: int, train_minibatch_size: int, train_dataset_config: Dict[str, Any], - dataloaders_config: Dict[str, Any], inference_model_config: Dict[str, Any], generate_config: Dict[str, Any], train_model_config: Dict[str, Any], @@ -89,7 +88,6 @@ def launch_distributed( num_episodes=num_episodes, batch_size=inference_batch_size, train_dataset_config=train_dataset_config, - dataloaders_config=dataloaders_config, model_config=inference_model_config, generate_config=generate_config, tokenizer_config=tokenizer_config, @@ -99,8 +97,7 @@ def launch_distributed( consumer_plugin_config=plugin_config, eval_dataset_config=eval_dataset_config, eval_interval=eval_interval, - evaluation_function_type=grpo_config["reward_fn_type"], - response_format_tags=grpo_config["response_format_tags"], + grpo_config=grpo_config, eval_save_dir=eval_save_dir, eval_generation_config=eval_generation_config, project_name=project_name, diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 623ed7ab9..135c8db0e 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -8,8 +8,9 @@ import ray.util.collective as cc import torch import tqdm import wandb -from coati.dataset.loader import RawConversationDataset -from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn +from coati.dataset.loader import RawConversationDataset, collate_fn_grpo +from coati.distributed.reward.reward_fn import boxed_math_reward_fn, code_reward_fn, math_reward_fn +from coati.distributed.reward.verifiable_reward import VerifiableReward from ray.util.collective import allreduce from ray.util.collective.types import Backend, ReduceOp from torch.utils.data import DataLoader, DistributedSampler @@ -36,7 +37,6 @@ class BaseProducer: num_episodes: int, batch_size: int, train_dataset_config: Dict[str, Any], - dataloaders_config: Dict[str, Any], model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer_config: Optional[Dict[str, Any]] = None, @@ -45,8 +45,7 @@ class BaseProducer: consumer_plugin_config: Dict[str, Any] = None, eval_dataset_config=None, eval_interval=-1, # disable evaluation - evaluation_function_type="think_answer_tags", - response_format_tags=None, + grpo_config: Dict[str, Any] = None, eval_save_dir: str = "./eval", project_name: str = None, run_name: str = None, @@ -75,6 +74,13 @@ class BaseProducer: self.eval_mode = False self.log_rollout_interval = log_rollout_interval self.latest_rollout_log_step = -1 + self.grpo_config = grpo_config + reward_model_kwargs = { + k: v + for k, v in grpo_config.items() + if k in ["soft_over_length_punishment", "max_new_tokens", "cache_length"] + } + self.response_format_tags = grpo_config.get("response_format_tags", 
None) if producer_idx == 0: if os.path.exists(rollout_log_file): raise ValueError( @@ -120,6 +126,7 @@ class BaseProducer: ), num_workers=4, drop_last=True, + collate_fn=collate_fn_grpo, ) self.eval_dataset_config = eval_dataset_config @@ -142,17 +149,25 @@ class BaseProducer: drop_last=False, seed=42, ), + collate_fn=collate_fn_grpo, ) - if evaluation_function_type == "think_answer_tags": + if grpo_config["reward_fn_type"] == "think_answer_tags": self.evaluation_function = math_reward_fn - elif evaluation_function_type == "boxed": + elif grpo_config["reward_fn_type"] == "boxed": self.evaluation_function = boxed_math_reward_fn + elif grpo_config["reward_fn_type"] == "code": + self.evaluation_function = code_reward_fn else: - raise ValueError(f"Unknown evaluation function type {evaluation_function_type}") - self.response_format_tags = response_format_tags + raise ValueError(f"Unknown evaluation function type {grpo_config['reward_fn_type']}") else: print("No eval dataset provided, skip eval") self.device = get_current_device() + self.reward_model = VerifiableReward( + reward_fns=[self.evaluation_function], # multiple reward functions can be added here + tokenizer=self.tokenizer, + tags=self.response_format_tags, + **reward_model_kwargs, + ) # init backend if backend in BACKEND_MAP: @@ -215,7 +230,13 @@ class BaseProducer: eval_results = eval_results + [ self.evaluation_function( eval_outputs["input_ids"][m][n], - eval_outputs["gt_answer"][m][n], + eval_outputs[ + ( + "test_cases" + if self.grpo_config["reward_fn_type"] == "code" + else "gt_answer" + ) + ][m], eval_outputs["response_idx"][m][n], tokenizer=self.tokenizer, eval_mode=True, @@ -251,8 +272,50 @@ class BaseProducer: outputs["temperature"] = torch.tensor( [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) + bs, num_gen = outputs["input_ids"].size(0), outputs["input_ids"].size(1) + # breakpoint() + if self.grpo_config["reward_fn_type"] == "code": + test_cases = [] + for prompt_id in range(bs): + test_cases.extend([outputs["test_cases"][prompt_id]] * num_gen) + # assert all([isinstance(test_cases[i], str) for i in range(len(test_cases))]), f"{[type(test_cases[i]) for i in range(len(test_cases))]}, {test_cases}" + reward_model_output = self.reward_model( + outputs["input_ids"].view((-1, outputs["input_ids"].size(-1))), + test_cases=test_cases, + response_idx=outputs["response_idx"].view((-1, 2)), + ) + else: + gt_answer = [] + for prompt_id in range(bs): + gt_answer.extend([outputs["gt_answer"][prompt_id]] * num_gen) + reward_model_output = self.reward_model( + outputs["input_ids"].view((-1, outputs["input_ids"].size(-1))), + gt_answer=gt_answer, + response_idx=outputs["response_idx"], + ) + outputs["reward"] = ( + torch.tensor([value[0] for value in reward_model_output]) + .to(outputs["input_ids"].device) + .view((bs, num_gen, 1)) + ) + outputs["format_acc"] = ( + torch.tensor([value[1] for value in reward_model_output]) + .to(outputs["input_ids"].device) + .view((bs, num_gen, 1)) + ) + outputs["ans_acc"] = ( + torch.tensor([value[2] for value in reward_model_output]) + .to(outputs["input_ids"].device) + .view((bs, num_gen, 1)) + ) + if "gt_answer" in outputs: + outputs.pop("gt_answer") + if "test_cases" in outputs: + outputs.pop("test_cases") + print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs = pre_send(outputs) + # breakpoint() ray_broadcast_tensor_dict( outputs, src=0, device=self.device, 
group_name=f"sync_data_{self.producer_idx}" ) @@ -315,7 +378,6 @@ class SimpleProducer(BaseProducer): num_episodes, batch_size, train_dataset_config, - dataloaders_config, model_config, generate_config, tokenizer_config=None, @@ -325,8 +387,7 @@ class SimpleProducer(BaseProducer): consumer_plugin_config=None, eval_dataset_config=None, eval_interval=-1, # disable evaluation - evaluation_function_type="think_answer_tags", - response_format_tags=None, + grpo_config: Dict[str, Any] = None, eval_save_dir: str = "./eval", eval_generation_config={}, project_name: str = None, @@ -342,7 +403,6 @@ class SimpleProducer(BaseProducer): num_episodes, batch_size, train_dataset_config, - dataloaders_config, model_config, generate_config, tokenizer_config, @@ -351,8 +411,7 @@ class SimpleProducer(BaseProducer): consumer_plugin_config, eval_dataset_config=eval_dataset_config, eval_interval=eval_interval, - evaluation_function_type=evaluation_function_type, - response_format_tags=response_format_tags, + grpo_config=grpo_config, eval_save_dir=eval_save_dir, project_name=project_name, run_name=run_name, diff --git a/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py b/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py new file mode 100644 index 000000000..c6d6d8fad --- /dev/null +++ b/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py @@ -0,0 +1,669 @@ +# Code from the verl Project (https://github.com/agentica-project/rllm), +# which itself is adapted from Prime (https://github.com/PRIME-RL/PRIME) +# +# Copyright 2024 ByteDance Group + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import ast +import faulthandler +import json +import platform + +# to run the solution files we're using a timing based approach +import signal +import sys +import traceback + +# used for debugging to time steps +from datetime import datetime +from enum import Enum + +# for capturing the stdout +from io import StringIO + +# used for testing the code that reads from input +from unittest.mock import mock_open, patch + +import numpy as np +from pyext import RuntimeModule + + +def truncatefn(s, length=300): + assert isinstance(s, str) + if len(s) <= length: + return s + + return s[: length // 2] + "...(truncated) ..." 
+ s[-length // 2 :] + + +class CODE_TYPE(Enum): + call_based = 0 + standard_input = 1 + + +# used to capture stdout as a list +# from https://stackoverflow.com/a/16571630/6416660 +# alternative use redirect_stdout() from contextlib +class Capturing(list): + def __enter__(self): + self._stdout = sys.stdout + sys.stdout = self._stringio = StringIO() + # Make closing the StringIO a no-op + self._stringio.close = lambda x: 1 + return self + + def __exit__(self, *args): + self.append(self._stringio.getvalue()) + del self._stringio # free up some memory + sys.stdout = self._stdout + + +def only_int_check(val): + return isinstance(val, int) + + +def string_int_check(val): + return isinstance(val, str) and val.isdigit() + + +def combined_int_check(val): + return only_int_check(val) or string_int_check(val) + + +def clean_traceback(error_traceback): + file_start = error_traceback.find('File ""') + # print(file_start) + error_traceback = "Traceback (most recent call last):\n " + error_traceback[file_start:] + return error_traceback + + +def run_test(in_outs, test=None, debug=False, timeout=15): + """ + if test(generated_code) is not None it'll try to run the code. + otherwise it'll just return an input and output pair. + """ + # Disable functionalities that can make destructive changes to the test. + reliability_guard() + + if debug: + print(f"start = {datetime.now().time()}") + + if in_outs: + if in_outs.get("fn_name") is None: + which_type = CODE_TYPE.standard_input # Standard input + method_name = None + else: + which_type = CODE_TYPE.call_based # Call-based + method_name = in_outs["fn_name"] + + if debug: + print(f"loaded input_output = {datetime.now().time()}") + + if test is None: + raise AssertionError("should not happen: test code is none") + elif test is not None: + results = [] + sol = "from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(6*10**5)\n" # noqa: E501 + if debug: + print(f"loading test code = {datetime.now().time()}") + + if which_type == CODE_TYPE.call_based: + sol += test + if debug: + print(f"sol = {sol}") + signal.alarm(timeout) + try: + tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol) + tmp = tmp_sol if "class Solution" not in test else tmp_sol.Solution() + signal.alarm(0) + except Exception as e: + signal.alarm(0) + error_traceback = traceback.format_exc() + if debug: + print(f"type 0 compilation error = {e}") + results.append(-2) + return results, { + "error": repr(e), + # "error_code": -1, + # "error_message": "Compilation Error", + "traceback": clean_traceback(error_traceback), + } + signal.alarm(0) + + elif which_type == CODE_TYPE.standard_input: + # sol + # if code has if __name__ == "__main__": then remove it + try: + astree = ast.parse(test) + last_block = astree.body[-1] + if isinstance(last_block, ast.If): + condition = last_block.test + if ast.unparse(condition).strip() == "__name__ == '__main__'": + test = ast.unparse(astree.body[:-1]) + "\n" + 
ast.unparse(last_block.body) + except Exception: + pass + + tmp_test = test.split("\n") + + new_test = [] + for x in tmp_test: + if (not x.startswith("from ")) and (not x.startswith("import ")): + new_test.append("\t" + x + "\n") + else: + new_test.append(x + "\n") + tmp_test = new_test + + new_test = "" + started = False + for i in tmp_test: + if i.startswith("\t") and not started: + new_test += "stdin = sys.stdin\nstdout = sys.stdout\n" + new_test += "def code():\n" + new_test += i + started = True + elif started and ((i.startswith("from ")) or (i.startswith("import "))): + new_test += "\t" + i + else: + new_test += i + tmp_test = new_test + + sol += tmp_test + if debug: + print(f"sol = {sol}") + method_name = "code" + signal.alarm(timeout) + try: + tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol) + tmp = tmp_sol + signal.alarm(0) + except Exception as e: + signal.alarm(0) + error_traceback = traceback.format_exc() + if debug: + print(f"type 1 compilation error = {e}") + results.append(-2) + return results, { + "error": repr(e), + # "error_code": -1, + # "error_message": "Compilation Error", + "traceback": clean_traceback(error_traceback), + } + signal.alarm(0) + if debug: + print(f"get method = {datetime.now().time()}") + + try: + method = getattr(tmp, method_name) # get_attr second arg must be str + except Exception: + signal.alarm(0) + error_traceback = traceback.format_exc() + error_info = sys.exc_info() + print(f"unable to get function error = {error_info}") + results.append(-2) + return results, { + "error": repr(error_info), + # "error_code": -1, + # "error_message": "Unable to extract code", + "traceback": clean_traceback(error_traceback), + } + + for index, inputs in enumerate(in_outs["inputs"]): + raw_inputs = inputs + raw_outputs = in_outs["outputs"][index] + if which_type == CODE_TYPE.call_based: + inputs = [json.loads(line) for line in inputs.split("\n")] + in_outs["outputs"][index] = json.loads(in_outs["outputs"][index]) + + truncate_line_size = 300 // (raw_inputs.count("\n") + 1) + raw_inputs = "\n".join( + [truncatefn(line, truncate_line_size) for line in raw_inputs.strip().split("\n")] + ) + raw_outputs = truncatefn(raw_outputs, 200) + else: + raw_inputs = truncatefn(raw_inputs) + raw_outputs = truncatefn(raw_outputs, 200) + # JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list) + try: + if isinstance(inputs[0], dict): + inputs = [{int(k): v for k, v in inputs[0].items()}] + except Exception: + pass + try: + if isinstance(in_outs["outputs"][index], dict): + in_outs["outputs"][index] = [{int(k): v for k, v in in_outs["outputs"][index].items()}] + except Exception: + pass + try: + if isinstance(in_outs["outputs"][index][0], dict): + in_outs["outputs"][index] = [{int(k): v for k, v in in_outs["outputs"][index][0].items()}] + except Exception: + pass + + if debug: + print( + f"time: {datetime.now().time()} testing index = {index} inputs = {inputs}, {type(inputs)}. 
type = {which_type}" + ) + if which_type == CODE_TYPE.call_based: # Call-based + signal.alarm(timeout) + faulthandler.enable() + try: + output = method(*inputs) + raw_true_output = output + + raw_true_output_copy = json.dumps(output) + raw_true_output_copy = truncatefn(raw_true_output_copy, 200) + + # ground truth sequences are not tuples + if isinstance(output, tuple): + output = list(output) + + tmp_result = output == in_outs["outputs"][index] + if isinstance(in_outs["outputs"][index], list) and in_outs["outputs"][index]: + tmp_result = tmp_result or (output == in_outs["outputs"][index][0]) + + # ground truth sequences are not tuples + try: + if isinstance(output[0], tuple): + tmp_result = tmp_result or ([list(x) for x in output] == in_outs["outputs"][index][0]) + except Exception: + pass + results.append(tmp_result) + if tmp_result is not True: + return results, { + "output": raw_true_output_copy, + "expected": raw_outputs, + "inputs": raw_inputs, + # "error_code": -2, + "error_message": "Wrong Answer", + } + # reset the alarm + signal.alarm(0) + except Exception as e: + signal.alarm(0) + error_traceback = traceback.format_exc() + faulthandler.disable() + if debug: + print(f"Standard input runtime error or time limit exceeded error = {e}") + results.append(-1) + return results, { + "error": repr(e), + "traceback": clean_traceback(error_traceback), + } + faulthandler.disable() + signal.alarm(0) + if debug: + print( + f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" + ) + elif which_type == CODE_TYPE.standard_input: # Standard input + faulthandler.enable() + passed = False + + if isinstance(inputs, list): + inputs = "\n".join(inputs) + if isinstance(in_outs["outputs"][index], list): + in_outs["outputs"][index] = "\n".join(in_outs["outputs"][index]) + + signal.alarm(timeout) + with Capturing() as output: + try: + call_method(method, inputs) + # reset the alarm + signal.alarm(0) + passed = True + except Exception as e: + # runtime error or took too long + signal.alarm(0) + error_traceback = traceback.format_exc() + print(f"Call-based runtime error or time limit exceeded error = {repr(e)}{e}") + results.append(-1) + return results, { + "error": repr(e), + "traceback": clean_traceback(error_traceback), + } + signal.alarm(0) + raw_true_output = output[0] + raw_true_output_copy = truncatefn(raw_true_output, 200) + output = raw_true_output.splitlines() + if not passed: + if debug: + nl = "\n" + if not isinstance(inputs, list): + print( + f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" + ) + else: + print( + f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" + ) + continue + + if passed and debug: + print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}") + + if custom_compare_(output, in_outs["outputs"][index]): + tmp_result = True + results.append(tmp_result) + continue + + # ground truth sequences are expressed as lists not tuples + if isinstance(output, tuple): + output = list(output) + + tmp_result = False + try: + tmp_result = output == [in_outs["outputs"][index]] + if isinstance(in_outs["outputs"][index], list): + tmp_result = tmp_result or (output == in_outs["outputs"][index]) + if isinstance(output[0], str): + tmp_result = tmp_result or ([e.strip() 
for e in output] == in_outs["outputs"][index]) + except Exception as e: + if debug: + print(f"Failed check1 exception = {e}") + + if tmp_result is True: + results.append(tmp_result) + continue + + # try one more time without \n + if isinstance(in_outs["outputs"][index], list): + for tmp_index, i in enumerate(in_outs["outputs"][index]): + in_outs["outputs"][index][tmp_index] = i.split("\n") + in_outs["outputs"][index][tmp_index] = [ + x.strip() for x in in_outs["outputs"][index][tmp_index] if x + ] + else: + in_outs["outputs"][index] = in_outs["outputs"][index].split("\n") + in_outs["outputs"][index] = list(filter(len, in_outs["outputs"][index])) + in_outs["outputs"][index] = list(map(lambda x: x.strip(), in_outs["outputs"][index])) + + try: + tmp_result = output == [in_outs["outputs"][index]] + if isinstance(in_outs["outputs"][index], list): + tmp_result = tmp_result or (output == in_outs["outputs"][index]) + except Exception as e: + if debug: + print(f"Failed check2 exception = {e}") + + if tmp_result is True: + results.append(tmp_result) + continue + + # try by converting the output into a split up list too + if isinstance(output, list): + output = list(filter(len, output)) + + if debug: + nl = "\n" + if not isinstance(inputs, list): + print( + f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}" + ) + else: + print( + f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}" + ) + + if debug: + print(f"{tmp_result=} @a") + + try: + tmp_result = output == [in_outs["outputs"][index]] + if isinstance(in_outs["outputs"][index], list): + tmp_result = tmp_result or (output == in_outs["outputs"][index]) + except Exception as e: + if debug: + print(f"Failed check3 exception = {e}") + + if debug: + print(f"{tmp_result=} @b") + + try: + all_ints = all( + combined_int_check(e1) and combined_int_check(e2) + for e1, e2 in zip(output, in_outs["outputs"][index]) + ) + if not all_ints: + if debug: + print( + [ + combined_int_check(e1) and combined_int_check(e2) + for e1, e2 in zip(output, in_outs["outputs"][index]) + ] + ) + output_float = [float(e) for e in output] + gt_float = [float(e) for e in in_outs["outputs"][index]] + tmp_result = tmp_result or ( + (len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float) + ) + except Exception: + pass + + if debug: + print(f"{tmp_result=} @c") + + try: + if isinstance(output[0], list): + all_ints = all( + combined_int_check(e1) and combined_int_check(e2) + for e1, e2 in zip(output[0], in_outs["outputs"][index]) + ) + if not all_ints: + output_float = [float(e) for e in output[0]] + gt_float = [float(e) for e in in_outs["outputs"][index][0]] + tmp_result = tmp_result or ( + (len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float) + ) + except Exception: + pass + + if tmp_result is True: + results.append(tmp_result) + continue + + if debug: + print(f"{tmp_result=} @d") + # try by converting the stuff into split up list + if isinstance(in_outs["outputs"][index], list): + for tmp_index, i in enumerate(in_outs["outputs"][index]): + in_outs["outputs"][index][tmp_index] = set(i.split()) + else: + in_outs["outputs"][index] = set(in_outs["outputs"][index].split()) + + if debug: + print(f"{tmp_result=} @e") + + try: + tmp_result = output == in_outs["outputs"][index] + except Exception as e: + if debug: + 
print(f"Failed check4 exception = {e}") + continue + + if tmp_result is True: + results.append(tmp_result) + continue + + if debug: + print(f"{tmp_result=} @f") + + # try by converting the output into a split up list too + if isinstance(output, list): + for tmp_index, i in enumerate(output): + output[tmp_index] = i.split() + output = list(filter(len, output)) + for tmp_index, i in enumerate(output): + output[tmp_index] = set(i) + else: + output = output.split() + output = list(filter(len, output)) + output = set(output) + + if debug: + print(f"{tmp_result=} @g") + + if tmp_result is True and debug: + print("PASSED") + + results.append(tmp_result) + if tmp_result is not True: + return results, { + "output": raw_true_output_copy, + "expected": raw_outputs, + "inputs": raw_inputs, + # "error_code": -2, + "error_message": "Wrong Answer", + } + + if debug: + nl = "\n" + if not isinstance(inputs, list): + print( + f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" + ) + else: + print( + f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" + ) + + print(f"results = {results}") + + return results, {} + + +def custom_compare_(output, ground_truth): + if isinstance(output, list): + output_1 = "\n".join(output) + if stripped_string_compare(output_1, ground_truth): + return True + + if isinstance(output, list): + output_2 = [o.lstrip().rstrip() for o in output] + output_2 = "\n".join(output_2) + if stripped_string_compare(output_2, ground_truth): + return True + + return False + + +def stripped_string_compare(s1, s2): + s1 = s1.lstrip().rstrip() + s2 = s2.lstrip().rstrip() + return s1 == s2 + + +def call_method(method, inputs): + if isinstance(inputs, list): + inputs = "\n".join(inputs) + + inputs_line_iterator = iter(inputs.split("\n")) + + # sys.setrecursionlimit(10000) + + # @patch('builtins.input', side_effect=inputs.split("\n")) + @patch("builtins.open", mock_open(read_data=inputs)) + @patch("sys.stdin", StringIO(inputs)) + @patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator)) + @patch("sys.stdin.readlines", lambda *args: inputs.split("\n")) + @patch("sys.stdin.read", lambda *args: inputs) + # @patch('sys.stdout.write', print) + def _inner_call_method(_method): + try: + return _method() + except SystemExit: + pass + finally: + pass + + return _inner_call_method(method) + + +def reliability_guard(maximum_memory_bytes=None): + """ + This disables various destructive functions and prevents the generated code + from interfering with the test (e.g. fork bomb, killing other processes, + removing filesystem files, etc.) + WARNING + This function is NOT a security sandbox. Untrusted code, including, model- + generated code, should not be blindly executed outside of one. See the + Codex paper for more information about OpenAI's code sandbox, and proceed + with caution. 
+ """ + + if maximum_memory_bytes is not None: + import resource + + resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)) + resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)) + if platform.uname().system != "Darwin": + resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)) + + faulthandler.disable() + + import builtins + + builtins.exit = None + builtins.quit = None + + import os + + os.environ["OMP_NUM_THREADS"] = "1" + + os.kill = None + os.system = None # 防止干扰repl评测 + os.putenv = None + os.remove = None + os.removedirs = None + os.rmdir = None + os.fchdir = None + os.setuid = None + os.fork = None + os.forkpty = None + os.killpg = None + os.rename = None + os.renames = None + os.truncate = None + os.replace = None + os.unlink = None + os.fchmod = None + os.fchown = None + os.chmod = None + os.chown = None + os.chroot = None + os.lchflags = None + os.lchmod = None + os.lchown = None + os.getcwd = None + os.chdir = None + + import shutil + + shutil.rmtree = None + shutil.move = None + shutil.chown = None + + import subprocess + + subprocess.Popen = None # type: ignore + + __builtins__["help"] = None + + import sys + + sys.modules["ipdb"] = None + sys.modules["joblib"] = None + sys.modules["resource"] = None + sys.modules["psutil"] = None + sys.modules["tkinter"] = None diff --git a/applications/ColossalChat/coati/distributed/reward/code_reward/utils.py b/applications/ColossalChat/coati/distributed/reward/code_reward/utils.py new file mode 100644 index 000000000..e19e9e387 --- /dev/null +++ b/applications/ColossalChat/coati/distributed/reward/code_reward/utils.py @@ -0,0 +1,61 @@ +# Code from the verl Project (https://github.com/agentica-project/rllm), +# which itself is adapted from Prime (https://github.com/PRIME-RL/PRIME) +# +# Copyright 2024 ByteDance Group + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import multiprocessing +import os +import sys +import traceback +from typing import Optional + +from .testing_util import run_test + + +def _temp_run(sample, generation, debug, result, metadata_list, timeout): + with open(os.devnull, "w") as devnull: + sys.stdout = devnull + sys.stderr = devnull + try: + res, metadata = run_test(in_outs=sample, test=generation, debug=debug, timeout=timeout) + result.append(res) + metadata_list.append(metadata) + except Exception: + # print(e) # some tracebacks are extremely long. + traceback.print_exc(10) + result.append([-1 for i in range(len(sample["inputs"]))]) + metadata_list.append({}) + + +def check_correctness(in_outs: Optional[dict], generation, timeout=10, debug=True): + """Check correctness of code generation with a global timeout. 
+ The global timeout is to catch some extreme/rare cases not handled by the timeouts + inside `run_test`""" + + manager = multiprocessing.Manager() + result = manager.list() + metadata_list = manager.list() + p = multiprocessing.Process(target=_temp_run, args=(in_outs, generation, debug, result, metadata_list, timeout)) + p.start() + p.join(timeout=timeout + 1) + if p.is_alive(): + p.kill() + # p.terminate() + if not result: + # consider that all tests failed + result = [[-1 for i in range(len(in_outs["inputs"]))]] + if debug: + print("global timeout") + return result[0], metadata_list diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index a4042ae97..5c8810fdf 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -1,7 +1,30 @@ +# Copyright 2024 ByteDance Group + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Some functions in this file are adapted from the verl project +under the Apache License 2.0: +https://github.com/volcengine/verl +""" + + +import json + import torch from latex2sympy2_extended import NormalizationConfig from math_verify import ExprExtractionConfig, LatexExtractionConfig, parse, verify +from .code_reward.utils import check_correctness as check_correctness_code from .reward_utils import extract_boxed_solution, extract_solution, validate_response_structure CANNOT_PARSE_GT_ANSWER = -1 @@ -98,15 +121,11 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): raise ValueError("no gt_answer is provided, please check your training dataset.") decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) - gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) + final_answer, processed_str = extract_solution(decoded_final_answer) format_valid = validate_response_structure(processed_str, kwargs["tags"]) - # Check format accuracy - if format_valid: - format_acc += 1 - # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid if final_answer is not None: if eval_mode or format_valid: @@ -114,6 +133,10 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): if not eval_mode: reward = reward + length_reward + # Check format accuracy + if format_valid: + format_acc += 1 + # Check if the sequence is over length if not eval_mode and res_length >= max_new_tokens: reward *= 0.0 @@ -138,7 +161,6 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] eval_mode = kwargs.get("eval_mode", False) soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) - format_score = 0.0 acc_score = 10.0 reward = torch.tensor(0.0) format_acc = torch.tensor(0.0) @@ -161,7 +183,6 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], 
skip_special_tokens=True) - gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) final_answer = extract_boxed_solution(decoded_final_answer) format_valid = final_answer is not None if "tags" in kwargs and kwargs["tags"]: @@ -169,10 +190,6 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): format_valid = format_valid and all( [decoded_final_answer.count(tags[tag]["text"]) == tags[tag]["num_occur"] for tag in tags] ) - # Check format accuracy - if format_valid: - format_acc += 1 - reward += format_score # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid if final_answer is not None: @@ -181,6 +198,10 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): if not eval_mode: reward = reward + length_reward + # Check format accuracy + if format_valid: + format_acc += 1 + # Check if the sequence is over length if not eval_mode and res_length >= max_new_tokens: reward *= 0.0 @@ -199,3 +220,78 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): "response_length": res_length, "reward": reward.item(), } + + +def code_reward_fn(input_ids, test_cases, response_idx, **kwargs): + tokenizer = kwargs["tokenizer"] + eval_mode = kwargs.get("eval_mode", False) + soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) + acc_score = 10.0 + reward = torch.tensor(0.0) + format_acc = torch.tensor(0.0) + ans_acc = torch.tensor(0.0) + s, e = response_idx[0], response_idx[1] + + length_reward = 0.0 + res_length = e.item() - s.item() + 1 + if not eval_mode: + max_new_tokens = kwargs["max_new_tokens"] + else: + max_new_tokens = -1 # for eval mode, we don't need to check the length + if not eval_mode and soft_over_length_punishment: + cache_length = kwargs["cache_length"] + if max_new_tokens - cache_length < res_length < max_new_tokens: + length_reward = ((max_new_tokens - cache_length) - res_length) / cache_length * acc_score + + # try to get code solution from completion. if the completion is pure code, this will not take effect. + decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) + + solution = decoded_final_answer.split("```python")[-1].split("```")[0] + format_valid = False + if "```python" in decoded_final_answer: + format_valid = solution is not None + + # Check format accuracy + if format_valid: + format_acc += 1 + + try: + try: + if not isinstance(test_cases, dict): + test_cases = json.loads(test_cases) + except Exception as e: + print(f"Error {e}: Cannot parse test cases.") + raise e + # Complete check on all in-out pairs first. If there is no failure, per-sample test can be skipped. 
+ try: + res, metadata = check_correctness_code(in_outs=test_cases, generation=solution, timeout=10, debug=True) + metadata = dict(enumerate(metadata))[0] + success = all(map(lambda x: x is True, res)) + if success: + ans_acc += 1 + if eval_mode or format_valid: + reward += acc_score + if not eval_mode: + reward = reward + length_reward + except Exception: + pass + + # Check if the sequence is over length + if not eval_mode and res_length >= max_new_tokens: + reward *= 0.0 + except Exception: + pass + if not eval_mode: + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) + else: + prompt = tokenizer.decode(input_ids[:s], skip_special_tokens=True) + return { + "prompt": prompt, + "prediction": decoded_final_answer, + "gold": test_cases["outputs"], + "parsed": solution, + "format_valid": format_acc.item(), + "ans_valid": ans_acc.item(), + "response_length": res_length, + "reward": reward.item(), + } diff --git a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py index ba83f7787..2f0b3778c 100644 --- a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py +++ b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py @@ -2,6 +2,7 @@ Function-based reward verification module. """ +import inspect from typing import Any, Dict, List import torch @@ -15,7 +16,8 @@ class VerifiableReward: def __call__( self, input_ids: torch.LongTensor, - gt_answer: List[torch.Tensor] = None, + gt_answer: List[str] = None, + test_cases: List[str] = None, response_idx: List[torch.Tensor] = None, ) -> torch.Tensor: # Get batch size @@ -26,18 +28,44 @@ class VerifiableReward: # Loop through reward functions for reward_fn in self.reward_fns: # Apply the reward function to the entire batch at once - reward_batch = torch.stack( - [ - reward_fn( - input_ids[i], - gt_answer=gt_answer[i], - response_idx=response_idx[i], - **self.kwargs, - ) - for i in range(bs) - ], - dim=0, - ) + if "gt_answer" in inspect.getfullargspec(reward_fn).args: + reward_batch = torch.stack( + [ + reward_fn( + input_ids[i], + gt_answer=gt_answer[i], + response_idx=response_idx[i], + **self.kwargs, + ) + for i in range(bs) + ], + dim=0, + ) + elif "test_cases" in inspect.getfullargspec(reward_fn).args: + reward_batch = torch.stack( + [ + reward_fn( + input_ids[i], + test_cases=test_cases[i], + response_idx=response_idx[i], + **self.kwargs, + ) + for i in range(bs) + ], + dim=0, + ) + else: + reward_batch = torch.stack( + [ + reward_fn( + input_ids[i], + response_idx=response_idx[i], + **self.kwargs, + ) + for i in range(bs) + ], + dim=0, + ) rewards += reward_batch return rewards diff --git a/applications/ColossalChat/requirements.txt b/applications/ColossalChat/requirements.txt index 472080101..6b9511ad5 100755 --- a/applications/ColossalChat/requirements.txt +++ b/applications/ColossalChat/requirements.txt @@ -22,3 +22,6 @@ sentencepiece==0.1.99 flash-attn tiktoken jsonlines +math_verify +latex2sympy2_extended +pyext diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index b4f565617..0536a746d 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -101,7 +101,7 @@ if __name__ == "__main__": "--reward-type", type=str, default="think_answer_tags", - choices=["think_answer_tags", "boxed"], + choices=["think_answer_tags", "boxed", "code"], help="Reward type for GRPO.", ) parser.add_argument( @@ -167,10 +167,29 @@ if 
__name__ == "__main__": if args.master_address is None: # Default settings: Using single machine - ray.init(address="local", namespace="ray-example") + ray.init( + address="local", + namespace="ray-example", + runtime_env={ + "env_vars": { + # "RAY_DEBUG_POST_MORTEM": "1" # enable post-mortem debugging with ray + "TOKENIZERS_PARALLELISM": "false" + }, + }, + ) else: # For ray distributed multi-machine training, Please change _node_ip_address to your IP address of your master node - ray.init(_node_ip_address=args.master_address, namespace="ray-example", _temp_dir=args.ray_dir) + ray.init( + _node_ip_address=args.master_address, + namespace="ray-example", + _temp_dir=args.ray_dir, + runtime_env={ + "env_vars": { + # "RAY_DEBUG_POST_MORTEM": "1" # enable post-mortem debugging with ray + "TOKENIZERS_PARALLELISM": "false" + }, + }, + ) if args.top_k is None: if args.backend == "transformers": @@ -178,6 +197,8 @@ if __name__ == "__main__": elif args.backend == "vllm": args.top_k = -1 + os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable tokenizers parallelism to avoid deadlock + inference_model_config = dict(path=args.model) train_model_config = dict(path=args.model, use_flash_attention_2=True, use_cache=False) generate_config = dict(top_k=args.top_k, top_p=args.top_p, temperature=args.temperature) @@ -288,7 +309,6 @@ if __name__ == "__main__": "max_length": args.max_prompt_tokens, "system_prompt": args.system_prompt, }, - dataloaders_config={}, inference_model_config=inference_model_config, generate_config=generate_config, num_generations=args.num_generations, From 3bed6ae9eea18424a1305b2d7b977a3c66fd2621 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Mon, 9 Jun 2025 09:37:28 +0800 Subject: [PATCH 12/34] fix bug, tested --- .gitignore | 4 ++++ applications/ColossalChat/coati/distributed/consumer.py | 6 ++---- applications/ColossalChat/coati/distributed/producer.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 0503f3c95..e603f5015 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,7 @@ applications/ColossalChat/wandb applications/ColossalChat/model applications/ColossalChat/eval applications/ColossalChat/rollouts +applications/ColossalChat/*.txt +applications/ColossalChat/*.db +applications/ColossalChat/stdin +applications/ColossalChat/*.zip diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 3ae4a1796..a7abb1588 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -132,9 +132,7 @@ class BaseConsumer: format_acc = raw_batch["format_acc"][:, :, 0] ans_acc = raw_batch["ans_acc"][:, :, 0] response_len = ( - raw_batch["response_idx"][:, :, 1] - - raw_batch["response_idx"][:, :, 0] - + 1 + raw_batch["response_idx"][:, :, 1] - raw_batch["response_idx"][:, :, 0] + 1 ).type(torch.float32) effective_group_mask = None if self.filter_range is not None and self.grpo_config.get("dynamic_batching", True): @@ -160,7 +158,7 @@ class BaseConsumer: ) if effective_group_mask is not None: print( - f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch_with_reward)} -> {torch.sum(effective_group_mask).cpu().item()} effective groups" + f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch)} -> {torch.sum(effective_group_mask).cpu().item()} effective groups" ) # mapping the effective group to the raw group for indexing effective_group_to_raw_group_mapping = {} diff --git 
a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 135c8db0e..23542f1c6 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -291,7 +291,7 @@ class BaseProducer: reward_model_output = self.reward_model( outputs["input_ids"].view((-1, outputs["input_ids"].size(-1))), gt_answer=gt_answer, - response_idx=outputs["response_idx"], + response_idx=outputs["response_idx"].view((-1, 2)), ) outputs["reward"] = ( torch.tensor([value[0] for value in reward_model_output]) From d0e12c508a4e72b73ed7f890f4850d16140fb756 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Mon, 9 Jun 2025 09:42:58 +0800 Subject: [PATCH 13/34] remove debug code --- applications/ColossalChat/coati/distributed/producer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 23542f1c6..82e4b6ac1 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -273,12 +273,10 @@ class BaseProducer: [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) bs, num_gen = outputs["input_ids"].size(0), outputs["input_ids"].size(1) - # breakpoint() if self.grpo_config["reward_fn_type"] == "code": test_cases = [] for prompt_id in range(bs): test_cases.extend([outputs["test_cases"][prompt_id]] * num_gen) - # assert all([isinstance(test_cases[i], str) for i in range(len(test_cases))]), f"{[type(test_cases[i]) for i in range(len(test_cases))]}, {test_cases}" reward_model_output = self.reward_model( outputs["input_ids"].view((-1, outputs["input_ids"].size(-1))), test_cases=test_cases, @@ -315,7 +313,6 @@ class BaseProducer: print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs = pre_send(outputs) - # breakpoint() ray_broadcast_tensor_dict( outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" ) From 9ca920c1af5076ca2816710852cf0ab4410ba3d3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Jun 2025 01:48:19 +0000 Subject: [PATCH 14/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 2e4d8f08c..0e956ca41 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -1,5 +1,5 @@ from contextlib import nullcontext -from typing import Any, Dict, Optional +from typing import Any, Optional import ray import torch From bb6f5d98fc94d857284d3816591e3e248fcb5f46 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 10 Jun 2025 13:53:19 +0800 Subject: [PATCH 15/34] move out evaluation func (#6343) Co-authored-by: Tong Li --- .../ColossalChat/coati/distributed/producer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 82e4b6ac1..854c2fcc2 100644 --- 
a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -128,6 +128,14 @@ class BaseProducer: drop_last=True, collate_fn=collate_fn_grpo, ) + if grpo_config["reward_fn_type"] == "think_answer_tags": + self.evaluation_function = math_reward_fn + elif grpo_config["reward_fn_type"] == "boxed": + self.evaluation_function = boxed_math_reward_fn + elif grpo_config["reward_fn_type"] == "code": + self.evaluation_function = code_reward_fn + else: + raise ValueError(f"Unknown evaluation function type {grpo_config['reward_fn_type']}") self.eval_dataset_config = eval_dataset_config if self.eval_dataset_config is not None: @@ -151,14 +159,6 @@ class BaseProducer: ), collate_fn=collate_fn_grpo, ) - if grpo_config["reward_fn_type"] == "think_answer_tags": - self.evaluation_function = math_reward_fn - elif grpo_config["reward_fn_type"] == "boxed": - self.evaluation_function = boxed_math_reward_fn - elif grpo_config["reward_fn_type"] == "code": - self.evaluation_function = code_reward_fn - else: - raise ValueError(f"Unknown evaluation function type {grpo_config['reward_fn_type']}") else: print("No eval dataset provided, skip eval") self.device = get_current_device() From 21d517d0fa393c526d025c7b3f3a1f1e54eda330 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 10 Jun 2025 15:00:48 +0800 Subject: [PATCH 16/34] Manually schedule resources and support auto master address assigning --- .../ColossalChat/coati/distributed/launch.py | 48 ++++++++++++++++--- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 41ba8ea55..30e7382ef 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -79,8 +79,35 @@ def launch_distributed( f"{project_name.replace(' ','_')}_run_{wandb_group_name}.jsonl", ) - procs = [] + # Attention: Ray use complex schedualing method that consider various factors including load-balancing. + # when requesting resources, it is not guaranteed that the resource comes from a node with lower node it + # this go against the design principle of our implementation, and we need to manually force the schedualing, + # allocating the producer to nodes with lower node id and the consumer to the resouces from nodes with higher + # node id. 
See the reference here: https://docs.ray.io/en/latest/ray-core/scheduling/index.html#nodeaffinityschedulingstrategy + nodes = ray.nodes() + node_info = { + node["NodeID"]: { + "num_gpus": node["Resources"].get("GPU", 0), + "address": node["NodeManagerAddress"], + } # Default to 0 if no GPUs are available + for node in nodes + } + gpu_to_node_id = [] + gpu_to_ip_address = [] + for node_id in node_info: + for idx in range(int(node_info[node_id]["num_gpus"])): + gpu_to_node_id.append(node_id) + gpu_to_ip_address.append(node_info[node_id]["address"]) + print(node_info) + + producer_procs = [] for i in range(num_producers): + node_id = gpu_to_node_id[0] + producer_ip_address = gpu_to_ip_address[0] + for _ in range(num_proc_per_producer): + gpu_to_node_id.pop(0) + gpu_to_ip_address.pop(0) + print(f"Schedual Producer P[{i}] which requires {num_proc_per_producer} GPUs on node {producer_ip_address}") producer = SimpleProducer.options(num_gpus=num_proc_per_producer).remote( producer_idx=i, num_producers=num_producers, @@ -106,20 +133,29 @@ def launch_distributed( log_rollout_interval=log_rollout_interval, rollout_log_file=rollout_log_file, ) - procs.append(producer) + producer_procs.append(producer) + ray.get([p.setup.remote() for p in producer_procs]) generate_config_consumer = copy.deepcopy(generate_config) generate_config_consumer.update( dict( backend=inference_backend, ) ) + consumer_master_ip_address = gpu_to_ip_address[0] + print(f"Use {consumer_master_ip_address} as master address for torch DDP.") + consumer_procs = [] for i in range(num_consumer_procs): + node_id = gpu_to_node_id[0] + consumer_ip_address = gpu_to_ip_address[0] + gpu_to_node_id.pop(0) + gpu_to_ip_address.pop(0) + print(f"Schedual Consumer T[{i}] which requires 1 GPUs on node {consumer_ip_address}") consumer = core_consumer.options(num_gpus=1).remote( num_producers=num_producers, num_episodes=num_episodes, rank=i, world_size=num_consumer_procs, - master_addr=master_addr, + master_addr=consumer_master_ip_address, master_port=master_port, num_update_per_episode=num_update_per_episode, num_recv_per_update=num_recv_per_update, @@ -136,6 +172,6 @@ def launch_distributed( run_name=run_name, wandb_group_name=wandb_group_name, ) - procs.append(consumer) - ray.get([p.setup.remote() for p in procs]) - ray.get([p.loop.remote() for p in procs]) + consumer_procs.append(consumer) + ray.get([p.setup.remote() for p in consumer_procs]) + ray.get([p.loop.remote() for p in (producer_procs + consumer_procs)]) From 25599246c5080cdfcc03e794495aa2c075ab048d Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 10 Jun 2025 17:00:35 +0800 Subject: [PATCH 17/34] modify readme --- .../ColossalChat/coati/distributed/README.md | 312 +++++++++++++++++- applications/ColossalChat/rl_example.py | 10 + 2 files changed, 321 insertions(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index b7bac2b2d..262ab6aab 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -1,6 +1,316 @@ -# Requirements +# Distributed RL Framework for Language Model Fine-Tuning +This repository implements a distributed Reinforcement Learning (RL) training framework designed to fine-tune large language models using algorithms such as **GRPO** and **DAPO**. It supports multi-node and multi-GPU setups, scalable rollout generation, and policy optimization using libraries like VLLM. 
Currently, we supports two Reinforcement Learning with Verifiable Reward (RLVR) tasks: solving math problems and code generation. + +**Please note that we are still under intensive development, stay tuned.** + +--- + +## 🚀 Features + +* **Distributed Training with Ray**: Scalable to multiple machines and GPUs. +* **Support for GRPO and DAPO**: Choose your preferred policy optimization algorithm. +* **Model Backends**: Support `vllm` as inference backends. +* **Rollout and Policy Decoupling**: Efficient generation and consumption of data through parallel inferencer-trainer architecture. +* **Evaluation Integration**: Easily plug in task-specific eval datasets. +* **Checkpoints and Logging**: Configurable intervals and directories. + +--- + +## 🛠 Installation + +### Prepare Develop Environment + +Install Colossalai & ColossalChat +```bash +git clone https://github.com/hpcaitech/ColossalAI.git +git checkout grpo-latest +BUILD_EXT=1 pip install -e . + +cd ./applications/ColossalChat +pip install -e . +``` + +Install vllm +```bash +pip install vllm==0.7.3 +``` + +Install Ray. +```bash +pip install ray +``` + +Install Other Dependencies ```bash pip install cupy-cuda12x python -m cupyx.tools.install_library --cuda 12.x --library nccl ``` + +Prepare Model & dataset +```bash +huggingface-cli download --local-dir-use-symlinks False Qwen/Qwen2.5-7B --local-dir /models/Qwen/Qwen2.5-7B +``` + +## Architecture Design + +
+Producer-Consumer Pattern: a classic software design pattern used for managing resources, data, or tasks between two different processes or threads. + +* Producer: inference engine which rollouts out examples and saves them into a shared buffer. +* Consumer: training framework which takes training examples from the shared buffer and train the policy model. + +Key features for Producer-Consumer Pattern: +* Buffer: Acts as a shared queue where the producer adds data and the consumer removes data. +* Concurrency: Rollout and training can work concurrently. + +## 🧠 Data Format + +Samples in the training or evaluation `.jsonl` file should the same format depends on the type of task, we currently support two RLVR tasks: solving math problems and code generation. + +### Math Data Format +```json +{ + "messages": { + "role": "user", + "content": "Simplify $\\sqrt[3]{1+8} \\cdot \\sqrt[3]{1+\\sqrt[3]{8}}$." + }, + "gt_answer": "3" +} +``` + +### Code Data Format +We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Inputs and outputs in test cases should be two lists containing only strings and matching in the number of elements. You prompt must properly instruct the LLM to generate code to read test cases from stdin and output results to stdout. +```json +{ + "messages": { + "role": "user", + "content": "Solve the following coding problem using the programming language python:\n\nMikhail walks on a Cartesian plane. He starts at the point $(0, 0)$, and in one move he can go to any of eight adjacent points. For example, ..." + }, + "test_cases": { + "inputs": [ + "3\n2 2 3\n4 3 7\n10 1 9\n" + ], + "outputs": [ + "1\n6\n-1\n" + ] + } +} +``` + +--- + +## ⚙️ Hyperparameters & Arguments + +| Argument | Description | Example | +| ---------------- | --------------------------------------- | ----------------- | +| `--model` | Model path or identifier | `/path/to/model` | +| `--dataset` | Path to training `.jsonl` | `/path/to/train_data.jsonl` | +| `--eval-dataset` | JSON of task\:eval\_dataset\_path pairs | `{"eval_1":"/path/to/eval_1.jsonl"}` | +| `--project` | Project name | `Project1` | +| `--num-episodes` | Number of training episodes | `1` | + +### Distributed Training + +| Argument | Description | Example | +| ----------------------------- | ------------------------------------- | ------- | +| `--num-trainers` | Number of trainer processes | `4` | +| `--num-inferencer` | Number of inferencer processes | `4` | +| `--inference-batch-size` | Prompts per inference step | `8` | +| `--inference-microbatch-size` | Per-GPU batch size for inference | `8` | +| `--train-batch-size` | Prompts per trainer step per dp group | `8` | +| `--train-minibatch-size` | Mini-batch size before forward pass | `8` | +| `--train-microbatch-size` | Per-GPU batch size for training | `2` | + +### Sampling + +| Argument | Description | Example | +| --------------------- | --------------------- | -------------- | +| `--backend` | Generation backend, choose from `vllm` | `vllm` | +| `--temperature` | Sampling temperature for generation | `1.0` | +| `--top-k` | Top-K sampling parameter for generation | `None` | +| `--top-p` | Top-P sampling parameter for generation | `1.0` | +| `--system-prompt` | System prompt, Optional, default to the default system prompt for each reward types. 
For more information, refer to the [**reward type**](#-constraints-and-notes) section | `Please reason step by step, and put your final answer within \\boxed{}.` | +| `--max-new-tokens` | Max generation tokens | `3584` | +| `--max-prompt-tokens` | Max prompt tokens | `512` | + +### GRPO Specific + +| Argument | Description | Example | +| ----------------- | ---------------------------- | ------------------- | +| `--algo` | Algorithm (`GRPO` or `DAPO`), for more customization refer to [GRPO Settings](#️-grpo-settings) | `GRPO` | +| `--learning-rate` | Learning rate | `1e-6` | +| `--kl-coeff` | KL penalty coefficient, if nonzero, a reference model will be used | `0.01` | +| `--reward-type` | Reward signal type (choose from 'think_answer_tags', 'boxed', 'code') For more information, refer to the [**reward type**](#-constraints-and-notes) section | `think_answer_tags` | +| `--eval-interval` | Evaluation interval in number of training steps (positive value to enable evaluation) | `10` | + +### Logging and Checkpointing + +| Argument | Description | Example | +| -------------------- | ------------------------- | ------------ | +| `--save-interval` | Training steps between checkpoints | `20` | +| `--save-dir` | Checkpoint directory | `./model` | +| `--eval-save-dir` | Evaluation save path | `./eval` | +| `--rollout-save-dir` | Rollout logs directory | `./rollouts` | + +### Miscellaneous + +| Argument | Description | Example | +| ------------------ | --------------------------------------- | ------- | +| `--ray_dir` | Custom Ray temp dir of a running Ray cluster (optional) | `None` | +| `--master_address` | Master address of a running Ray cluster | `None` | +| `--master_port` | Master port for torch DDP | `29506` | + +--- + +## ⚙️ GRPO Settings + +In addition to the two default training settings we provided--- original `GRPO` and `DAPO`, users can customize their training by changing the following hyperparameters in `grpo_config` in `rl_example.py`. + +| Argument Name | Description | Default | +| ----------------------------- | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `filter_range` | Filters out rollout group if the success rate within that group is out of this range.| `[0.01, 0.99]` | +| `dynamic_batching` | Enables dynamic batching as described in the [DAPO paper](https://arxiv.org/abs/2503.14476). | `True` | +| `clip_eps_low` | epsilon_low in DAPO in equation in [DAPO paper](https://arxiv.org/abs/2503.14476) | `0.2` | +| `clip_eps_high` | epsilon_high in DAPO equation in [DAPO paper](https://arxiv.org/abs/2503.14476) | `0.28` | +| `skip_threshold` | If ratio is above this threshold, the sample is skipped to avoid instability. | `20.0` | +| `loss_variation` | Type of loss variation. Supports `"token_level"` for token-wise policy gradient loss and `sample_level` for original GRPO loss. | `"token_level"` | +| `soft_over_length_punishment` | Whether to use soft overlength penalty in [DAPO paper](https://arxiv.org/abs/2503.14476) or not. | `True` | +| `cache_length` | `L_cache` parameter for soft overlength penalty in e.q. 13 in [DAPO paper](https://arxiv.org/abs/2503.14476) | `min(1024, int(args.max_new_tokens / 4))` | +| `filter_truncated_response` | Mask out truncated responses in loss calculation. 
| `True` | + + + +## 🔄 Constraints and Notes + +* `num_inferencer + num_trainer == NUM_GPUs` +* `num_inferencer % num_trainer == 0` +* `(num_inferencer * inference_batch_size) % (num_trainer * train_batch_size) == 0` +* `train_batch_size >= train_minibatch_size >= train_microbatch_size` +* `inference_batch_size >= inference_microbatch_size` +* Set microbatch sizes based on **VRAM capacity** +* To use tensor parallelism on inferencer + * set backend to `vllm` + * change `tensor_parallel_size` in `inference_model_config` in rl_example.py + * set `num_inferencer = NUM_INFERENCE_GPUs / tensor_parallel_size` +* To set tensor parallelism / pipeline parallelism / zero stage + * change corresponding settings in `plugin_config` in rl_example.py +* Ensure rollout generation rate matches trainer consumption: + + ``` + num_inferencer * inference_batch_size % ( + num_trainer * train_batch_size / + train_pipeline_parallelism_size / + train_tensor_parallelism_size + ) == 0 + ``` +* Model weights sync every: + + ``` + (num_inferencer * inference_batch_size) / + (num_trainer * train_batch_size / + train_pipeline_parallelism_size / + train_tensor_parallelism_size) + ``` +* Reward Type + + We currently support three reward types--- `think_answer_tags`, `boxed`, `code`, each varies in details such as how answer is extracted and the reward calculation process. Please select one from `think_answer_tags`, `boxed` for math problem solving and use `code` for code generation. The default system prompt for each reward type is as follows. Please make sure your system prompt provides information for the answer to be correctly extracted from model responses. + + * think_answer_tags + + Answer extraction: extract the content between the last ``, `` tags. + + ``` + You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning. After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the tags, i.e., 123 .\n\n + ``` + * boxed + + Answer extraction: extract the last content marked by `\\boxed{}` + ``` + Please reason step by step, and put your final answer within \\boxed{}. + ``` + * code + + Answer extraction: extract code inside ` ```python\n...``` ` + ``` + You are a helpful assistant. + ``` + generated code is +--- + +## 🧪 Example: single machine 8-GPU Zero2 Strategy + +```bash +python rl_example.py \ + --dataset /path/to/train_data.jsonl \ + --model /path/to/Qwen2.5-3B/ \ + -t 4 -i 4 \ + -b vllm \ + -ibs 2 -tbs 4 -tMbs 1 -tmbs 4 -imbs 1 \ + -rt boxed \ + -g 4 \ + -ibs 1 \ + -tbs 2 \ + -tMbs 1 \ + -tmbs 2 \ + -imbs 1 \ + -s "Please reason step by step, and put your final answer within \\boxed{}." \ + -tMbs 8 \ + -p GRPO-Train-Align-Debug \ +``` + +## 🧪 Example: multi-machine TP+PP Strategy + +### Create ray cluster on multi-machine +For example, now we have 4 nodes and their IPs are 10.0.0.3, 10.0.0.4, 10.0.0.5, 10.0.0.6. +We use 10.0.0.3 as master node. 
First we start a ray cluster on 10.0.0.3: +```bash +ray start --head --node-ip-address=10.0.0.3 +``` + +Then, for each slave node (10.0.0.4/10.0.0.5/10.0.0.6), we add to the ray cluser by following code: +```bash +ray start --address='10.0.0.3:6379' +``` + +Modify plugin_config in ./applications/ColossalChat/rl_example.py +```python +plugin_config={ + "tp_size": 4, + "pp_size": 2, + "microbatch_size": max( + 1, args.train_microbatch_size // 2 + ), # microbatch size should be set to train_microbatch_size // pp_size + "zero_stage": 1, + "max_norm": 1.0, + }, # for pp, tp +``` + +```bash +# Hint1: replace /models/Qwen/Qwen2.5-7B to your model path +# replace /datasets/train-alignment.jsonl to your dataset path +python rl_example.py + -m /path/to/Qwen2.5-Math-7B/ \ + -d /path/to/train_data.jsonl \ + --master_address '10.0.0.3' + -t 16 \ + -i 16 \ + -p GRPO-Train-Align-Debug \ + -g 2 \ + -ibs 1 \ + -tbs 2 \ + -tMbs 1 \ + -tmbs 2 \ + -imbs 1 \ + -b vllm \ + -e 2 \ + -rt boxed \ + -s "Please reason step by step, and put your final answer within \\boxed{}." +``` + +## Acknowledgement +Colossal-RL is a distributed version of ColossalChat and inspired by a few awesome open-source projects. We would like to express our gratitude to the Fuyao-ray team and the vllm-ascend team for their support throughout the development of the this project. We also thank the following awesome open-source projects and algorithms: GRPO, DAPO, TRL, Verl, OpenRLHF, StreamRL, Qwen, Logic-RL. diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 0536a746d..c60923e00 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -6,6 +6,12 @@ import ray import torch from coati.distributed.launch import launch_distributed +DEFAUT_SYSTEM_PROMPT = { + "think_answer_tags": "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning. 
After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the tags, i.e., 123 .\n\n", + "boxed": "Please reason step by step, and put your final answer within \\boxed{}.", + "code": "You are a helpful assistant.", +} + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B") @@ -295,6 +301,10 @@ if __name__ == "__main__": else: raise ValueError(f"Unsupported algorithm: {args.algo}") + if args.system_prompt is None: + # Default system prompt + args.system_prompt = DEFAUT_SYSTEM_PROMPT[args.reward_type] + launch_distributed( num_producers=args.num_inferencer, num_proc_per_producer=inference_model_config.get("tensor_parallel_size", args.producer_tensor_parallel_size), From dc29c746327d7c18fbabc5e26e2739f5178271a5 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 10 Jun 2025 17:17:41 +0800 Subject: [PATCH 18/34] update readme --- .../ColossalChat/coati/distributed/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index 262ab6aab..29bd8d413 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -1,6 +1,6 @@ # Distributed RL Framework for Language Model Fine-Tuning -This repository implements a distributed Reinforcement Learning (RL) training framework designed to fine-tune large language models using algorithms such as **GRPO** and **DAPO**. It supports multi-node and multi-GPU setups, scalable rollout generation, and policy optimization using libraries like VLLM. Currently, we supports two Reinforcement Learning with Verifiable Reward (RLVR) tasks: solving math problems and code generation. +This repository implements a distributed Reinforcement Learning (RL) training framework designed to fine-tune large language models using algorithms such as **GRPO** and **DAPO**. It supports multi-node and multi-GPU setups, scalable rollout generation, and policy optimization using libraries like VLLM. Currently, we support two Reinforcement Learning with Verifiable Reward (RLVR) tasks: solving math problems and code generation. **Please note that we are still under intensive development, stay tuned.** @@ -70,7 +70,7 @@ Key features for Producer-Consumer Pattern: ## 🧠 Data Format -Samples in the training or evaluation `.jsonl` file should the same format depends on the type of task, we currently support two RLVR tasks: solving math problems and code generation. +Samples in the training or evaluation `.jsonl` file should follow the format specific to the type of task. We currently support two RLVR tasks: solving math problems and code generation. ### Math Data Format ```json @@ -84,7 +84,7 @@ Samples in the training or evaluation `.jsonl` file should the same format depen ``` ### Code Data Format -We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Inputs and outputs in test cases should be two lists containing only strings and matching in the number of elements. You prompt must properly instruct the LLM to generate code to read test cases from stdin and output results to stdout. +We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Inputs and outputs in test cases should be two lists containing only strings and matching in the number of elements. 
Your prompt must properly instruct the LLM to generate code to read test cases from stdin and output results to stdout. ```json { "messages": { @@ -134,7 +134,7 @@ We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Input | `--temperature` | Sampling temperature for generation | `1.0` | | `--top-k` | Top-K sampling parameter for generation | `None` | | `--top-p` | Top-P sampling parameter for generation | `1.0` | -| `--system-prompt` | System prompt, Optional, default to the default system prompt for each reward types. For more information, refer to the [**reward type**](#-constraints-and-notes) section | `Please reason step by step, and put your final answer within \\boxed{}.` | +| `--system-prompt` | System prompt, optional, default to the default system prompt for each reward types. For more information, refer to the [**reward type**](#-constraints-and-notes) section | `Please reason step by step, and put your final answer within \\boxed{}.` | | `--max-new-tokens` | Max generation tokens | `3584` | | `--max-prompt-tokens` | Max prompt tokens | `512` | @@ -169,7 +169,7 @@ We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Input ## ⚙️ GRPO Settings -In addition to the two default training settings we provided--- original `GRPO` and `DAPO`, users can customize their training by changing the following hyperparameters in `grpo_config` in `rl_example.py`. +In addition to the two default training settings provided—`GRPO` and `DAPO`—users can customize their training by changing the following hyperparameters in `grpo_config` in `rl_example.py`. | Argument Name | Description | Default | | ----------------------------- | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | @@ -272,7 +272,7 @@ We use 10.0.0.3 as master node. First we start a ray cluster on 10.0.0.3: ray start --head --node-ip-address=10.0.0.3 ``` -Then, for each slave node (10.0.0.4/10.0.0.5/10.0.0.6), we add to the ray cluser by following code: +Then, for each slave node (10.0.0.4/10.0.0.5/10.0.0.6), we add to the ray cluster by following code: ```bash ray start --address='10.0.0.3:6379' ``` @@ -313,4 +313,4 @@ python rl_example.py ``` ## Acknowledgement -Colossal-RL is a distributed version of ColossalChat and inspired by a few awesome open-source projects. We would like to express our gratitude to the Fuyao-ray team and the vllm-ascend team for their support throughout the development of the this project. We also thank the following awesome open-source projects and algorithms: GRPO, DAPO, TRL, Verl, OpenRLHF, StreamRL, Qwen, Logic-RL. +Colossal-RL is a distributed version of ColossalChat and inspired by a few awesome open-source projects. We would like to express our gratitude to the following awesome open-source projects and algorithms: GRPO, DAPO, TRL, Verl, OpenRLHF, StreamRL, Qwen, Logic-RL. 
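To make the reward-type descriptions in the README above more concrete, the sketch below shows one way the three extraction rules could be implemented: the last `\boxed{}` span for `boxed`, the last answer-tag pair (assumed to be `<answer> ... </answer>`, following the `think_answer_tags` naming), and the last fenced Python block for `code`. This is only an illustrative stand-in under those assumptions, not the reward code shipped in this patch series (`math_reward_fn`, `boxed_math_reward_fn`, `code_reward_fn`), and the helper names are invented for the example.

```python
# Illustrative sketch only: re-implements the extraction rules described in the README.
# The real reward functions in coati/distributed/reward differ in detail (normalization,
# scoring, test-case execution, etc.).
import re
from typing import Optional


def extract_boxed(text: str) -> Optional[str]:
    """Return the content of the last \\boxed{...} span, tracking nested braces."""
    start = text.rfind("\\boxed{")
    if start == -1:
        return None
    depth, chars = 1, []
    for ch in text[start + len("\\boxed{") :]:
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return "".join(chars)
        chars.append(ch)
    return None  # unbalanced braces, treat as no answer


def extract_answer_tag(text: str) -> Optional[str]:
    """Return the content between the last <answer> ... </answer> pair."""
    matches = re.findall(r"<answer>(.*?)</answer>", text, flags=re.DOTALL)
    return matches[-1].strip() if matches else None


def extract_code_block(text: str) -> Optional[str]:
    """Return the code inside the last ```python ... ``` fence."""
    matches = re.findall(r"```python\n(.*?)```", text, flags=re.DOTALL)
    return matches[-1] if matches else None


if __name__ == "__main__":
    print(extract_boxed(r"So the result is \boxed{\frac{1}{2}}."))           # \frac{1}{2}
    print(extract_answer_tag("<think>steps</think><answer> 123 </answer>"))  # 123
    print(extract_code_block("```python\nprint('hello')\n```"))              # print('hello')
```

A real verifier would typically go further, e.g. normalizing the parsed expression with `math_verify` (added to `requirements.txt` earlier in this series) before comparing it to the ground-truth answer, or executing the extracted code against the provided test cases.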
From 1330b5753ba15e2613a6f85149cace4f366ed15e Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 10 Jun 2025 18:21:42 +0800 Subject: [PATCH 19/34] add ray timeout handling instruction --- applications/ColossalChat/coati/distributed/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index 29bd8d413..85055a5f3 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -47,6 +47,14 @@ pip install cupy-cuda12x python -m cupyx.tools.install_library --cuda 12.x --library nccl ``` +To support long input/output sequence length (e.g., 32K), you may need to manually change the default setting (180 seconds) for the `timeout_s` variable in your ray installation to a larger value as shown in the screenshot below. + +
+ Prepare Model & dataset ```bash huggingface-cli download --local-dir-use-symlinks False Qwen/Qwen2.5-7B --local-dir /models/Qwen/Qwen2.5-7B From 8992def757f7f27e0b6dadfaa6f66ff6caa95e89 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 11 Jun 2025 17:54:18 +0800 Subject: [PATCH 20/34] fix pp memory issue (#6344) Co-authored-by: Tong Li --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 0e956ca41..51cdcf322 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -345,7 +345,7 @@ class GRPOConsumer(BaseConsumer): criterion=_criterion, optimizer=self.optimizer, return_loss=True, - return_outputs=True, + return_outputs=False, ) loss = policy_model_outputs["loss"] From 2f02a287772519fc6eef66f08fe3ab2e85e24053 Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Thu, 12 Jun 2025 11:21:31 +0800 Subject: [PATCH 21/34] Update README.md --- applications/ColossalChat/coati/distributed/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index 85055a5f3..21647a8cc 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -247,7 +247,6 @@ In addition to the two default training settings provided—`GRPO` and `DAPO`— ``` You are a helpful assistant. ``` - generated code is --- ## 🧪 Example: single machine 8-GPU Zero2 Strategy From 51b7abe9ddaee6619e18f111fc80afeb4d217304 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 12 Jun 2025 15:06:01 +0800 Subject: [PATCH 22/34] fix num_update_per_episode --- applications/ColossalChat/coati/distributed/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 30e7382ef..dc8bf0057 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -16,7 +16,7 @@ def get_jsonl_size_fast(path: str) -> int: with open(path) as f: lines = f.readlines() lines = [line for line in lines if line.strip()] - return len(lines) - 1 + return len(lines) def get_dp_size_fast(n_procs: int, plugin_config: Dict[str, Any]) -> int: From 30a6859f7792a9026d388224f8fa50e4d6e711ac Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 13 Jun 2025 18:21:54 +0800 Subject: [PATCH 23/34] optimize pp log_softmax OOM --- .gitignore | 2 + .../coati/distributed/grpo_consumer.py | 37 +++++++++++++------ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index e603f5015..94d952295 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,5 @@ applications/ColossalChat/*.txt applications/ColossalChat/*.db applications/ColossalChat/stdin applications/ColossalChat/*.zip +applications/ColossalChat/*.prof +applications/ColossalChat/*.png diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 51cdcf322..040ca61a4 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -280,13 +280,21 @@ class 
GRPOConsumer(BaseConsumer): ) if self.booster.plugin.stage_manager.is_last_stage(): - reference_model_logits = reference_model_outputs["outputs"]["logits"] - reference_action_log_probs = calc_action_log_probs( - reference_model_logits / self.generate_config["temperature"], - input_ids_forward_micro_batch, - num_action, - self.plugin.shard_config, + reference_action_log_probs = torch.zeros( + (input_ids_forward_micro_batch.size(0), num_action), + device=input_ids_forward_micro_batch.device, ) + for i in range(reference_action_log_probs.size(0)): + # activation for log_softmax is too large if vocab size and sequence length are large + # e.g., when using 152064 vocab size with 32K seqence length and a micro batch size of 4 (for pp=4 for example), + # this activation sorely takes 152064*32000*4*4/1024/1024/1024=72.5GB + reference_action_log_probs[i, :] += calc_action_log_probs( + reference_model_outputs["outputs"]["logits"][i : i + 1] + / self.generate_config["temperature"], + input_ids_forward_micro_batch[i : i + 1], + num_action, + self.plugin.shard_config, + )[0] else: # Dummy reference logprobs for data iterator. reference_action_log_probs = None @@ -308,12 +316,19 @@ class GRPOConsumer(BaseConsumer): def _criterion(outputs, inputs): action_logits = outputs.logits - action_log_probs = calc_action_log_probs( - action_logits / self.generate_config["temperature"], - inputs["input_ids"], - num_action, - self.plugin.shard_config, + action_log_probs = torch.zeros( + (inputs["input_ids"].size(0), num_action), device=action_logits.device ) + for i in range(action_log_probs.size(0)): + # activation for log_softmax is too large if vocab size and sequence length are large + # e.g., when using 152064 vocab size with 32K seqence length and a micro batch size of 4 (for pp=4 for example), + # this activation sorely takes 152064*32000*4*4/1024/1024/1024=72.5GB + action_log_probs[i, :] += calc_action_log_probs( + action_logits[i : i + 1] / self.generate_config["temperature"], + inputs["input_ids"][i : i + 1], + num_action, + self.plugin.shard_config, + )[0] if "reference_action_log_probs" in inputs: per_token_kl = ( torch.exp(inputs["reference_action_log_probs"] - action_log_probs) From e3d56cbd860da04d1b81a7917fac10d8aaedaa9a Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 18 Jun 2025 10:24:48 +0000 Subject: [PATCH 24/34] implement memory efficient logprob --- .../coati/distributed/grpo_consumer.py | 45 ++++++------------ .../ColossalChat/coati/distributed/utils.py | 46 ++++++++++++------- 2 files changed, 43 insertions(+), 48 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 040ca61a4..5e8f329eb 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -6,7 +6,7 @@ import torch import wandb from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss -from coati.distributed.utils import calc_action_log_probs +from coati.distributed.utils import memory_efficient_logprob from coati.trainer.utils import all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer @@ -280,21 +280,12 @@ class GRPOConsumer(BaseConsumer): ) if self.booster.plugin.stage_manager.is_last_stage(): - reference_action_log_probs = torch.zeros( - (input_ids_forward_micro_batch.size(0), num_action), - device=input_ids_forward_micro_batch.device, + reference_action_log_probs = 
memory_efficient_logprob( + reference_model_outputs["outputs"]["logits"], + input_ids_forward_micro_batch, + num_action, + shard_config=self.plugin.shard_config, ) - for i in range(reference_action_log_probs.size(0)): - # activation for log_softmax is too large if vocab size and sequence length are large - # e.g., when using 152064 vocab size with 32K seqence length and a micro batch size of 4 (for pp=4 for example), - # this activation sorely takes 152064*32000*4*4/1024/1024/1024=72.5GB - reference_action_log_probs[i, :] += calc_action_log_probs( - reference_model_outputs["outputs"]["logits"][i : i + 1] - / self.generate_config["temperature"], - input_ids_forward_micro_batch[i : i + 1], - num_action, - self.plugin.shard_config, - )[0] else: # Dummy reference logprobs for data iterator. reference_action_log_probs = None @@ -316,19 +307,12 @@ class GRPOConsumer(BaseConsumer): def _criterion(outputs, inputs): action_logits = outputs.logits - action_log_probs = torch.zeros( - (inputs["input_ids"].size(0), num_action), device=action_logits.device + action_log_probs = memory_efficient_logprob( + action_logits, + inputs["input_ids"], + num_action, + shard_config=self.plugin.shard_config, ) - for i in range(action_log_probs.size(0)): - # activation for log_softmax is too large if vocab size and sequence length are large - # e.g., when using 152064 vocab size with 32K seqence length and a micro batch size of 4 (for pp=4 for example), - # this activation sorely takes 152064*32000*4*4/1024/1024/1024=72.5GB - action_log_probs[i, :] += calc_action_log_probs( - action_logits[i : i + 1] / self.generate_config["temperature"], - inputs["input_ids"][i : i + 1], - num_action, - self.plugin.shard_config, - )[0] if "reference_action_log_probs" in inputs: per_token_kl = ( torch.exp(inputs["reference_action_log_probs"] - action_log_probs) @@ -370,16 +354,15 @@ class GRPOConsumer(BaseConsumer): mean_kl.append(kl) mean_loss.append(all_reduce_mean(loss, self.plugin).data) else: - policy_model_logits = self.policy_model( input_ids=input_ids_forward_micro_batch, attention_mask=attention_mask_forward_micro_batch, ).logits - action_log_probs = calc_action_log_probs( + action_log_probs = memory_efficient_logprob( policy_model_logits / self.generate_config["temperature"], input_ids_forward_micro_batch, num_action, - self.plugin.shard_config, + shard_config=self.plugin.shard_config, ) if self.policy_loss_fn.beta > 0: @@ -388,7 +371,7 @@ class GRPOConsumer(BaseConsumer): input_ids=input_ids_forward_micro_batch, attention_mask=attention_mask_forward_micro_batch, ).logits - reference_action_log_probs = calc_action_log_probs( + reference_action_log_probs = memory_efficient_logprob( reference_model_logits / self.generate_config["temperature"], input_ids_forward_micro_batch, num_action, diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index a40ebbcfb..d46243114 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -71,31 +71,43 @@ def log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.T return per_label_logps.squeeze(-1) -def calc_action_log_probs( +def memory_efficient_logprob( logits: torch.Tensor, - sequences: torch.LongTensor, - num_actions: int, - shard_config, + inputs: torch.Tensor, + num_action: int, + chunk_size: int = 2048, + shard_config: Any = None, vocab_size: int = None, ) -> torch.Tensor: - """Calculate action log probs. 
- + """ + Calculate action log probs in a memory-efficient way by processing in chunks. Args: logits (torch.Tensor): Output tensor of Actor.forward.logits. - sequences (torch.LongTensor): Input sequences. - num_actions (int): Number of actions. - shard_config - vocab_size - - + inputs (torch.LongTensor): Input sequences. + num_action (int): Number of actions. + chunk_size (int, optional): Size of each chunk to process. Default is 2048. + shard_config: Shard configuration for distributed computation. + vocab_size (int, optional): Vocabulary size. Default is None. Returns: torch.Tensor: Action log probs. """ - # labels: torch.Tensor, # [B, S] or [B, S, Vocab_size] - # logits: torch.Tensor, # [B, S, Vocab_size] - log_probs = dist_log_prob(sequences, logits, shard_config, vocab_size, logits.dtype) - log_probs = log_probs.squeeze(-1) - return log_probs[:, -num_actions:] + action_log_probs = torch.zeros((logits.size(0), num_action), device=logits.device, dtype=logits.dtype) + context_length = logits.size(1) - num_action + for i in range(action_log_probs.size(0)): + # loop over each sample in the micro-batch + for start in range(context_length, logits.size(1), chunk_size): + end = min(start + chunk_size, logits.size(1)) + # calculate log probs in chunks to save memory + log_probs = dist_log_prob( + inputs[i : i + 1, start - 1 : end], + logits[i : i + 1, start - 1 : end], + shard_config, + vocab_size, + logits.dtype, + ) # [1, chunk_size, 1] + log_probs = log_probs.squeeze(-1) + action_log_probs[i, start - context_length : end - context_length] += log_probs[0] + return action_log_probs def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch.Tensor: From 6b06430ca418715240c10a28baf14ff3e4b35ca0 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 19 Jun 2025 01:37:52 +0000 Subject: [PATCH 25/34] fix small bug --- .../ColossalChat/coati/distributed/grpo_consumer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 5e8f329eb..d7d6221a1 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -281,7 +281,7 @@ class GRPOConsumer(BaseConsumer): if self.booster.plugin.stage_manager.is_last_stage(): reference_action_log_probs = memory_efficient_logprob( - reference_model_outputs["outputs"]["logits"], + reference_model_outputs["outputs"]["logits"] / self.generate_config["temperature"], input_ids_forward_micro_batch, num_action, shard_config=self.plugin.shard_config, @@ -308,7 +308,7 @@ class GRPOConsumer(BaseConsumer): def _criterion(outputs, inputs): action_logits = outputs.logits action_log_probs = memory_efficient_logprob( - action_logits, + action_logits / self.generate_config["temperature"], inputs["input_ids"], num_action, shard_config=self.plugin.shard_config, @@ -375,7 +375,7 @@ class GRPOConsumer(BaseConsumer): reference_model_logits / self.generate_config["temperature"], input_ids_forward_micro_batch, num_action, - self.plugin.shard_config, + shard_config=self.plugin.shard_config, ) per_token_kl = ( torch.exp(reference_action_log_probs - action_log_probs) From 8880b83791fa9d34d733f365236d8b7beabdb0ee Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 19 Jun 2025 14:02:08 +0800 Subject: [PATCH 26/34] add dp rank for multi-dp (#6351) Co-authored-by: Tong Li --- .../ColossalChat/coati/distributed/grpo_consumer.py | 11 ++++++++--- 1 file 
changed, 8 insertions(+), 3 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index d7d6221a1..8d50734a9 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -130,7 +130,10 @@ class GRPOConsumer(BaseConsumer): def setup(self): super().setup() if (not self.plugin.pp_size > 1 and self.rank == 0) or ( - self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 + self.plugin.pp_size > 1 + and self.booster.plugin.stage_manager.is_last_stage() + and self.tp_rank == 0 + and self.dp_rank == 0 ): self.wandb_run = wandb.init( project=self.project_name, @@ -222,7 +225,6 @@ class GRPOConsumer(BaseConsumer): effective_samples = all_reduce_sum(torch.sum(loss_mask), self.plugin) effective_tokens_count = torch.sum(action_mask, dim=-1) * loss_mask total_effective_tokens_count = all_reduce_sum(torch.sum(effective_tokens_count), self.plugin) - total_samples = all_reduce_sum(torch.sum(torch.ones_like(loss_mask, device=loss_mask.device)), self.plugin) self.effective_sample_count += effective_samples.item() pbar.set_postfix( { @@ -407,7 +409,10 @@ class GRPOConsumer(BaseConsumer): mean_kl.append(kl.data) mean_loss.append(loss.data) if not self.plugin.pp_size > 1 or ( - self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 + self.plugin.pp_size > 1 + and self.booster.plugin.stage_manager.is_last_stage() + and self.tp_rank == 0 + and self.dp_rank == 0 ): reward = all_reduce_mean(reward.mean(), self.plugin) format_acc = all_reduce_mean(format_acc.mean(), self.plugin) From d4ef7f57be5d2cfb8b29fd74823d38cc3b877d0a Mon Sep 17 00:00:00 2001 From: duanjunwen <935724073@qq.com> Date: Wed, 28 May 2025 10:43:13 +0800 Subject: [PATCH 27/34] [ColossalRL] Support ColossalRL on Ascend (#6324) * [fix] support npu * [feat] multinode 14B * [feat] enlarge seqlen * [fix] * [fix] ready to updated * [fix] ready to merge grpo-latest * [fix] rm comments * [feat] support msprof-analyze, add analsys result * [feat] support ColossalaiRL on Ascend * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [feat] rm comments in qwen modeling * [Doc] Drafted README.md * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [feat] fix ascend readme format * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [fix] fix readme * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [fix] fix readme * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [fix] fix Readme, rm irrelevant testcase * [fix] fix some adapt modification * [fix] rm comments in modeling qwen * [fix] rm comm, test and debug print * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com> --- .../ColossalChat/coati/distributed/README.md | 178 +++++++++--------- .../coati/distributed/consumer.py | 15 +- .../coati/distributed/inference_backend.py | 1 + .../ColossalChat/coati/distributed/launch.py | 32 +++- .../coati/distributed/producer.py | 68 
+------ applications/ColossalChat/requirements.txt | 13 +- applications/ColossalChat/rl_example.py | 12 +- colossalai/pipeline/schedule/one_f_one_b.py | 2 +- colossalai/shardformer/modeling/qwen2.py | 150 ++++++++++++++- colossalai/shardformer/policies/qwen2.py | 4 +- 10 files changed, 295 insertions(+), 180 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index 21647a8cc..d7a947d06 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -1,5 +1,81 @@ # Distributed RL Framework for Language Model Fine-Tuning +This repository implements a distributed Reinforcement Learning (RL) training framework designed to fine-tune large language models using algorithms such as **GRPO** and **DAPO**. It supports multi-node and multi-GPU setups, scalable rollout generation, and policy optimization using libraries like VLLM. + +--- + +## 🚀 Features + +* **Distributed Training with Ray**: Scalable to multiple machines and GPUs. +* **Support for GRPO and DAPO**: Choose your preferred policy optimization algorithm. +* **Model Backends**: Support `vllm` as inference backends. +* **Rollout and Policy Decoupling**: Efficient generation and consumption of data through parallel inferencer-trainer architecture. +* **Evaluation Integration**: Easily plug in task-specific eval datasets. +* **Checkpoints and Logging**: Configurable intervals and directories. + +--- + +## 🛠 Installation + +### Prepare Develop Environment + +Install Colossalai & ColossalChat +```bash +git clone https://github.com/hpcaitech/ColossalAI.git +git checkout grpo-latest-ascend +pip install -e . + +cd ./applications/ColossalChat +pip install -e . +``` +Install Fuyao Ray. +Please update CANN before install fuyao ray +```bash +# Install CANN +source /usr/local/Ascend/ascend-toolkit/set_env.sh +./Ascend-cann-kernels-910b_8.1.RC1.alpha001_linux-aarch64.run --devel + +# Clone Fuyao Ray. Fuyao Ray is not an open source project, it will be inherited in the ColossalRL images. +git clone https://gitee.com/openfuyao/ray.git +cd ray +git pull origin pull/5/head + +# Install ray +pip install ray==2.43.0 --no-cache-dir + +# Create soft-link from fuyao-ray to ray site-package +cd .. +ln -s ./ray/python/ray/ /usr/local/python3.10/lib/python3.10/site-packages/ray + +# Install Fuyao Ray +cd ray +python python/ray/setup-dev.py +``` + +Prepare Model & dataset +```bash +huggingface-cli download --local-dir-use-symlinks False Qwen/Qwen2.5-7B --local-dir /models/Qwen/Qwen2.5-7B +``` + +### Set Distributed Config +Now, we need to set distributed config for multi-node. + +First, we set host ip config. +For example. I need to configure a cluster of 4 nodes, then I do +```bash +vim /etc/hosts +``` +Then write IP node map to /etc/hosts +```bash +10.0.0.3 npu-3 +10.0.0.4 npu-4 +10.0.0.5 npu-5 +10.0.0.6 npu-6 +``` + +Set Ascend Multi-Node Config + + This repository implements a distributed Reinforcement Learning (RL) training framework designed to fine-tune large language models using algorithms such as **GRPO** and **DAPO**. It supports multi-node and multi-GPU setups, scalable rollout generation, and policy optimization using libraries like VLLM. Currently, we support two Reinforcement Learning with Verifiable Reward (RLVR) tasks: solving math problems and code generation. 
**Please note that we are still under intensive development, stay tuned.** @@ -43,73 +119,29 @@ pip install ray Install Other Dependencies ```bash -pip install cupy-cuda12x -python -m cupyx.tools.install_library --cuda 12.x --library nccl +export ATB_LLM_HCCL_ENABLE=1 +export ATB_LLM_COMM_BACKEND="hccl" +export HCCL_CONNECT_TIMEOUT=7200 +export WORLD_SIZE=32 +export HCCL_EXEC_TIMEOUT=7200 +export HCCL_SOCKET_IFNAME=eno0 +export RAY_COLLECTIVE_MEET_TIMEOUT_SECONDS=7200 ``` -To support long input/output sequence length (e.g., 32K), you may need to manually change the default setting (180 seconds) for the `timeout_s` variable in your ray installation to a larger value as shown in the screenshot below. - -
- -Prepare Model & dataset -```bash -huggingface-cli download --local-dir-use-symlinks False Qwen/Qwen2.5-7B --local-dir /models/Qwen/Qwen2.5-7B -``` - -## Architecture Design - -
-Producer-Consumer Pattern: a classic software design pattern used for managing resources, data, or tasks between two different processes or threads. - -* Producer: inference engine which rollouts out examples and saves them into a shared buffer. -* Consumer: training framework which takes training examples from the shared buffer and train the policy model. - -Key features for Producer-Consumer Pattern: -* Buffer: Acts as a shared queue where the producer adds data and the consumer removes data. -* Concurrency: Rollout and training can work concurrently. - ## 🧠 Data Format -Samples in the training or evaluation `.jsonl` file should follow the format specific to the type of task. We currently support two RLVR tasks: solving math problems and code generation. +Each data sample in the training or evaluation `.jsonl` file should follow this format: -### Math Data Format ```json { "messages": { "role": "user", - "content": "Simplify $\\sqrt[3]{1+8} \\cdot \\sqrt[3]{1+\\sqrt[3]{8}}$." + "content": "Simplify $\\sqrt[3]{1+8} \\cdot \\sqrt[3]{1+\\sqrt[3]{8}}$. Let's think step by step and output the final answer within \\boxed{}." }, "gt_answer": "3" } ``` -### Code Data Format -We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Inputs and outputs in test cases should be two lists containing only strings and matching in the number of elements. Your prompt must properly instruct the LLM to generate code to read test cases from stdin and output results to stdout. -```json -{ - "messages": { - "role": "user", - "content": "Solve the following coding problem using the programming language python:\n\nMikhail walks on a Cartesian plane. He starts at the point $(0, 0)$, and in one move he can go to any of eight adjacent points. For example, ..." - }, - "test_cases": { - "inputs": [ - "3\n2 2 3\n4 3 7\n10 1 9\n" - ], - "outputs": [ - "1\n6\n-1\n" - ] - } -} -``` - --- ## ⚙️ Hyperparameters & Arguments @@ -118,7 +150,7 @@ We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Input | ---------------- | --------------------------------------- | ----------------- | | `--model` | Model path or identifier | `/path/to/model` | | `--dataset` | Path to training `.jsonl` | `/path/to/train_data.jsonl` | -| `--eval-dataset` | JSON of task\:eval\_dataset\_path pairs | `{"eval_1":"/path/to/eval_1.jsonl"}` | +| `--eval-dataset` | JSON of task\:eval\_dataset\_path pairs | `{'eval_1':'/path/to/eval_1.jsonl'}` | | `--project` | Project name | `Project1` | | `--num-episodes` | Number of training episodes | `1` | @@ -142,7 +174,7 @@ We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Input | `--temperature` | Sampling temperature for generation | `1.0` | | `--top-k` | Top-K sampling parameter for generation | `None` | | `--top-p` | Top-P sampling parameter for generation | `1.0` | -| `--system-prompt` | System prompt, optional, default to the default system prompt for each reward types. For more information, refer to the [**reward type**](#-constraints-and-notes) section | `Please reason step by step, and put your final answer within \\boxed{}.` | +| `--system-prompt` | System prompt, default to the system prompt for `think_answer_tags` format | `Please reason step by step, and put your final answer within \\boxed{}.` | | `--max-new-tokens` | Max generation tokens | `3584` | | `--max-prompt-tokens` | Max prompt tokens | `512` | @@ -152,9 +184,9 @@ We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). 
Input | ----------------- | ---------------------------- | ------------------- | | `--algo` | Algorithm (`GRPO` or `DAPO`), for more customization refer to [GRPO Settings](#️-grpo-settings) | `GRPO` | | `--learning-rate` | Learning rate | `1e-6` | -| `--kl-coeff` | KL penalty coefficient, if nonzero, a reference model will be used | `0.01` | -| `--reward-type` | Reward signal type (choose from 'think_answer_tags', 'boxed', 'code') For more information, refer to the [**reward type**](#-constraints-and-notes) section | `think_answer_tags` | -| `--eval-interval` | Evaluation interval in number of training steps (positive value to enable evaluation) | `10` | +| `--kl-coeff` | KL penalty coefficient | `0.01` | +| `--reward-type` | Reward signal type (choose from 'think_answer_tags', 'boxed') | `think_answer_tags` | +| `--eval-interval` | Evaluation interval in number of training steps (positive value to enable evaluation) | `100` | ### Logging and Checkpointing @@ -177,7 +209,7 @@ We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Input ## ⚙️ GRPO Settings -In addition to the two default training settings provided—`GRPO` and `DAPO`—users can customize their training by changing the following hyperparameters in `grpo_config` in `rl_example.py`. +In addition to the two default training settings we provided--- original `GRPO` and `DAPO`, users can customize their training by changing the following hyperparameters in `grpo_config` in `rl_example.py`. | Argument Name | Description | Default | | ----------------------------- | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | @@ -224,29 +256,6 @@ In addition to the two default training settings provided—`GRPO` and `DAPO`— train_pipeline_parallelism_size / train_tensor_parallelism_size) ``` -* Reward Type - - We currently support three reward types--- `think_answer_tags`, `boxed`, `code`, each varies in details such as how answer is extracted and the reward calculation process. Please select one from `think_answer_tags`, `boxed` for math problem solving and use `code` for code generation. The default system prompt for each reward type is as follows. Please make sure your system prompt provides information for the answer to be correctly extracted from model responses. - - * think_answer_tags - - Answer extraction: extract the content between the last ``, `` tags. - - ``` - You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning. After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the tags, i.e., 123 .\n\n - ``` - * boxed - - Answer extraction: extract the last content marked by `\\boxed{}` - ``` - Please reason step by step, and put your final answer within \\boxed{}. - ``` - * code - - Answer extraction: extract code inside ` ```python\n...``` ` - ``` - You are a helpful assistant. - ``` --- ## 🧪 Example: single machine 8-GPU Zero2 Strategy @@ -279,7 +288,7 @@ We use 10.0.0.3 as master node. 
First we start a ray cluster on 10.0.0.3: ray start --head --node-ip-address=10.0.0.3 ``` -Then, for each slave node (10.0.0.4/10.0.0.5/10.0.0.6), we add to the ray cluster by following code: +Then, for each slave node (10.0.0.4/10.0.0.5/10.0.0.6), we add to the ray cluser by following code: ```bash ray start --address='10.0.0.3:6379' ``` @@ -320,4 +329,5 @@ python rl_example.py ``` ## Acknowledgement -Colossal-RL is a distributed version of ColossalChat and inspired by a few awesome open-source projects. We would like to express our gratitude to the following awesome open-source projects and algorithms: GRPO, DAPO, TRL, Verl, OpenRLHF, StreamRL, Qwen, Logic-RL. + +--- diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index a7abb1588..b1b791e6e 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -13,7 +13,6 @@ from colossalai.booster import Booster from colossalai.booster.plugin import HybridParallelPlugin from colossalai.initialize import launch from colossalai.nn.optimizer import HybridAdam -from colossalai.utils import get_current_device from .comm import ray_broadcast_tensor_dict from .utils import bind_batch, post_recv, unbind_batch @@ -33,6 +32,7 @@ class BaseConsumer: batch_size: int, model_config: Dict[str, Any], plugin_config: Dict[str, Any], + generate_config: Dict[str, Any], minibatch_size: int = 1, save_interval: int = 100, save_dir: str = "./model", @@ -55,8 +55,9 @@ class BaseConsumer: self.model_config = model_config self.plugin_config = plugin_config - self.device = get_current_device() + self.device = "npu" self.lr_scheduler = None + self.generate_config = generate_config def setup(self) -> None: launch(self.rank, self.world_size, self.master_addr, self.master_port, local_rank=0) @@ -73,24 +74,28 @@ class BaseConsumer: self.booster = Booster(plugin=self.plugin) self.dp_rank = dist.get_rank(self.plugin.dp_group) self.tp_rank = dist.get_rank(self.plugin.tp_group) + self.sp_rank = dist.get_rank(self.plugin.sp_group) self.pp_rank = dist.get_rank(self.plugin.pp_group) self.dp_size = dist.get_world_size(self.plugin.dp_group) self.tp_size = dist.get_world_size(self.plugin.tp_group) + self.sp_size = dist.get_world_size(self.plugin.sp_group) self.pp_size = dist.get_world_size(self.plugin.pp_group) # Init Hybrid ray process group for i in range(self.num_producers): - cc.init_collective_group(self.world_size + 1, self.rank + 1, group_name=f"sync_data_{i}") + cc.init_collective_group(self.world_size + 1, self.rank + 1, backend="hccl", group_name=f"sync_data_{i}") if self.pp_size > 1: # use hybrid tp + pp if self.tp_rank == 0 and self.dp_rank == 0: cc.init_collective_group( - self.num_producers + 1, self.num_producers, group_name=f"sync_model_{self.pp_rank}" + self.num_producers + 1, self.num_producers, backend="hccl", group_name=f"sync_model_{self.pp_rank}" ) else: if self.rank == 0: - cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model") + cc.init_collective_group( + self.num_producers + 1, self.num_producers, backend="hccl", group_name="sync_model" + ) self.buffer = [] self.recv_cnt = 0 diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 34827e4e2..056c57b1d 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ 
b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -210,6 +210,7 @@ class VLLMInferenceBackend(BaseInferenceBackend): self.model_config = model_config self.tokenizer = tokenizer self.num_generations = num_generations + self.max_length = generate_config["max_tokens"] @torch.no_grad() def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index dc8bf0057..1a3481af9 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -79,15 +79,11 @@ def launch_distributed( f"{project_name.replace(' ','_')}_run_{wandb_group_name}.jsonl", ) - # Attention: Ray use complex schedualing method that consider various factors including load-balancing. - # when requesting resources, it is not guaranteed that the resource comes from a node with lower node it - # this go against the design principle of our implementation, and we need to manually force the schedualing, - # allocating the producer to nodes with lower node id and the consumer to the resouces from nodes with higher - # node id. See the reference here: https://docs.ray.io/en/latest/ray-core/scheduling/index.html#nodeaffinityschedulingstrategy nodes = ray.nodes() node_info = { node["NodeID"]: { - "num_gpus": node["Resources"].get("GPU", 0), + # "num_gpus": node["Resources"].get("GPU", 0), + "num_gpus": node["Resources"].get("NPU", 0), "address": node["NodeManagerAddress"], } # Default to 0 if no GPUs are available for node in nodes @@ -95,12 +91,12 @@ def launch_distributed( gpu_to_node_id = [] gpu_to_ip_address = [] for node_id in node_info: - for idx in range(int(node_info[node_id]["num_gpus"])): + for idx in range(int(node_info[node_id]["num_gpus"])): # use num_gpus instead of num_npus gpu_to_node_id.append(node_id) gpu_to_ip_address.append(node_info[node_id]["address"]) - print(node_info) producer_procs = [] + for i in range(num_producers): node_id = gpu_to_node_id[0] producer_ip_address = gpu_to_ip_address[0] @@ -108,7 +104,17 @@ def launch_distributed( gpu_to_node_id.pop(0) gpu_to_ip_address.pop(0) print(f"Schedual Producer P[{i}] which requires {num_proc_per_producer} GPUs on node {producer_ip_address}") - producer = SimpleProducer.options(num_gpus=num_proc_per_producer).remote( + + producer = SimpleProducer.options( + # num_cpus=1, + # num_cpus=num_proc_per_producer, + num_gpus=0, + resources={"NPU": num_proc_per_producer}, + scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=node_id, + soft=False, + ), + ).remote( producer_idx=i, num_producers=num_producers, num_consumer_procs=num_consumer_procs, @@ -150,7 +156,13 @@ def launch_distributed( gpu_to_node_id.pop(0) gpu_to_ip_address.pop(0) print(f"Schedual Consumer T[{i}] which requires 1 GPUs on node {consumer_ip_address}") - consumer = core_consumer.options(num_gpus=1).remote( + consumer = core_consumer.options( + resources={"NPU": 1}, + scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=node_id, + soft=False, + ), + ).remote( num_producers=num_producers, num_episodes=num_episodes, rank=i, diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 854c2fcc2..739fc0f0b 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ 
b/applications/ColossalChat/coati/distributed/producer.py @@ -12,15 +12,13 @@ from coati.dataset.loader import RawConversationDataset, collate_fn_grpo from coati.distributed.reward.reward_fn import boxed_math_reward_fn, code_reward_fn, math_reward_fn from coati.distributed.reward.verifiable_reward import VerifiableReward from ray.util.collective import allreduce -from ray.util.collective.types import Backend, ReduceOp +from ray.util.collective.types import ReduceOp from torch.utils.data import DataLoader, DistributedSampler from transformers import AutoTokenizer -from colossalai.utils import get_current_device - from .comm import ray_broadcast_tensor_dict from .inference_backend import BACKEND_MAP -from .utils import pre_send, safe_append_to_jsonl_file +from .utils import safe_append_to_jsonl_file try: from vllm import SamplingParams @@ -161,13 +159,8 @@ class BaseProducer: ) else: print("No eval dataset provided, skip eval") - self.device = get_current_device() - self.reward_model = VerifiableReward( - reward_fns=[self.evaluation_function], # multiple reward functions can be added here - tokenizer=self.tokenizer, - tags=self.response_format_tags, - **reward_model_kwargs, - ) + + self.device = "npu" # init backend if backend in BACKEND_MAP: @@ -179,17 +172,15 @@ class BaseProducer: def setup(self) -> None: cc.init_collective_group( - world_size=self.num_producers, - rank=self.producer_idx, - backend=Backend.NCCL, - group_name="producer_group", + 1 + self.num_consumer_procs, 0, backend="hccl", group_name=f"sync_data_{self.producer_idx}" ) - cc.init_collective_group(1 + self.num_consumer_procs, 0, group_name=f"sync_data_{self.producer_idx}") if self.consumer_pp_size > 1: for i in range(self.consumer_pp_size): - cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name=f"sync_model_{i}") + cc.init_collective_group( + self.num_producers + 1, self.producer_idx, backend="hccl", group_name=f"sync_model_{i}" + ) else: - cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name="sync_model") + cc.init_collective_group(self.num_producers + 1, self.producer_idx, backend="hccl", group_name="sync_model") def rollout(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: raise NotImplementedError @@ -272,47 +263,6 @@ class BaseProducer: outputs["temperature"] = torch.tensor( [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) - bs, num_gen = outputs["input_ids"].size(0), outputs["input_ids"].size(1) - if self.grpo_config["reward_fn_type"] == "code": - test_cases = [] - for prompt_id in range(bs): - test_cases.extend([outputs["test_cases"][prompt_id]] * num_gen) - reward_model_output = self.reward_model( - outputs["input_ids"].view((-1, outputs["input_ids"].size(-1))), - test_cases=test_cases, - response_idx=outputs["response_idx"].view((-1, 2)), - ) - else: - gt_answer = [] - for prompt_id in range(bs): - gt_answer.extend([outputs["gt_answer"][prompt_id]] * num_gen) - reward_model_output = self.reward_model( - outputs["input_ids"].view((-1, outputs["input_ids"].size(-1))), - gt_answer=gt_answer, - response_idx=outputs["response_idx"].view((-1, 2)), - ) - outputs["reward"] = ( - torch.tensor([value[0] for value in reward_model_output]) - .to(outputs["input_ids"].device) - .view((bs, num_gen, 1)) - ) - outputs["format_acc"] = ( - torch.tensor([value[1] for value in reward_model_output]) - .to(outputs["input_ids"].device) - .view((bs, num_gen, 1)) - ) - 
outputs["ans_acc"] = ( - torch.tensor([value[2] for value in reward_model_output]) - .to(outputs["input_ids"].device) - .view((bs, num_gen, 1)) - ) - if "gt_answer" in outputs: - outputs.pop("gt_answer") - if "test_cases" in outputs: - outputs.pop("test_cases") - - print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") - outputs = pre_send(outputs) ray_broadcast_tensor_dict( outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" ) diff --git a/applications/ColossalChat/requirements.txt b/applications/ColossalChat/requirements.txt index 6b9511ad5..e1b8291ab 100755 --- a/applications/ColossalChat/requirements.txt +++ b/applications/ColossalChat/requirements.txt @@ -1,9 +1,9 @@ -transformers==4.39.3 +transformers==4.47.0 tqdm datasets==2.14.7 loralib colossalai>=0.4.7 -torch>=2.1.0 +torch==2.5.1 langchain tokenizers fastapi @@ -22,6 +22,9 @@ sentencepiece==0.1.99 flash-attn tiktoken jsonlines -math_verify -latex2sympy2_extended -pyext +math-verify==0.7.0 + +# The following packages be built into the image. +# torch_npu==2.5.1 +# fuyao-ray==2.43.0 +# vllm-ascend==0.7.3 diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index c60923e00..0ba66c486 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -206,7 +206,9 @@ if __name__ == "__main__": os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable tokenizers parallelism to avoid deadlock inference_model_config = dict(path=args.model) - train_model_config = dict(path=args.model, use_flash_attention_2=True, use_cache=False) + train_model_config = dict( + path=args.model, use_flash_attention_2=False, use_cache=False, attn_implementation="eager" + ) generate_config = dict(top_k=args.top_k, top_p=args.top_p, temperature=args.temperature) if args.backend == "transformers": @@ -325,12 +327,12 @@ if __name__ == "__main__": train_model_config=train_model_config, grpo_config=grpo_config, plugin_config={ - "tp_size": args.tensor_parallel_size, - "pp_size": args.pipeline_parallel_size, + "tp_size": 2, + "pp_size": 2, "microbatch_size": max( - 1, args.train_microbatch_size // args.pipeline_parallel_size + 1, args.train_microbatch_size // 2 ), # microbatch size should be set to train_microbatch_size // pp_size - "zero_stage": args.zero_stage, + "zero_stage": 1, "max_norm": 1.0, }, # for pp, tp inference_backend=args.backend, diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py index dcffa858c..1f8582a5b 100644 --- a/colossalai/pipeline/schedule/one_f_one_b.py +++ b/colossalai/pipeline/schedule/one_f_one_b.py @@ -92,7 +92,7 @@ class OneForwardOneBackwardSchedule(PipelineSchedule): assert ( self.num_microbatches >= self.stage_manager.num_stages - ), "Number of microbatch should be larger than number of stages" + ), f"Number of microbatch should be larger than number of stages" if self.forward_only: self.num_microbatches = (self.batch_size - 1) // self.microbatch_size + 1 diff --git a/colossalai/shardformer/modeling/qwen2.py b/colossalai/shardformer/modeling/qwen2.py index 27571309e..332563684 100644 --- a/colossalai/shardformer/modeling/qwen2.py +++ b/colossalai/shardformer/modeling/qwen2.py @@ -2,6 +2,7 @@ import math from typing import List, Optional, Tuple, Union import torch +import torch_npu from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers.modeling_outputs import ( @@ -143,14 +144,8 @@ class 
Qwen2PipelineForwards: # for the other stages, hidden_states is the output of the previous stage if shard_config.enable_flash_attention: # in this case, attention_mask is a dict rather than a tensor - mask_shape = (batch_size, 1, seq_length, seq_length_with_past) - attention_mask = ColoAttention.prepare_attn_kwargs( - mask_shape, - hidden_states.dtype, - hidden_states.device, - q_padding_mask=attention_mask, - is_causal=True, - ) + (batch_size, 1, seq_length, seq_length_with_past) + attention_mask = None else: if self._attn_implementation == "flash_attention_2": # 2d mask is passed through the layers @@ -488,6 +483,144 @@ class Qwen2PipelineForwards: return {"hidden_states": hidden_states} +def get_qwen2_flash_attention_npu_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None): + def forward( + self: Qwen2Attention, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if sp_mode is not None: + assert sp_mode in ["all_to_all", "split_gather", "ring"], "Invalid sp_mode" + assert (sp_size is not None) and ( + sp_group is not None + ), "Must specify sp_size and sp_group for sequence parallel" + + bsz, q_len, _ = hidden_states.size() + # sp: modify sp_len when sequence parallel mode is ring + if sp_mode in ["split_gather", "ring"]: + q_len *= sp_size + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + # sp: all-to-all comminucation when introducing sequence parallel + if sp_mode == "all_to_all": + query_states = all_to_all_comm(query_states, sp_group, fp8_communication=shard_config.fp8_communication) + key_states = all_to_all_comm(key_states, sp_group, fp8_communication=shard_config.fp8_communication) + value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication) + bsz, q_len, _ = query_states.size() + + query_states = query_states.view(bsz, q_len, self.num_heads, -1).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, -1).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, -1).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + # Because the input can be padded, the absolute sequence length depends on the max position id. 
+ cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if shard_config.enable_flash_attention: + scale = 1.0 / math.sqrt(query_states.shape[-1]) + attn_output = torch_npu.npu_fusion_attention( + query_states, + key_states, + value_states, + head_num=query_states.size(1), + input_layout="BNSD", + sparse_mode=1, + atten_mask=None, + scale=scale, + pre_tockens=65536, + next_tockens=65536, + ) + attn_output = attn_output[0] + else: + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + attn_output = attn_output.transpose(1, 2).contiguous() + if sp_mode == "all_to_all": + attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = all_to_all_comm( + attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication + ) + else: + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + return forward + + def get_qwen2_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None): def forward( self: Qwen2Attention, @@ -824,7 +957,6 @@ def 
get_lm_forward_with_dist_cross_entropy(shard_config: ShardConfig): output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - force_sp_output_gather=False, ) hidden_states = outputs[0] diff --git a/colossalai/shardformer/policies/qwen2.py b/colossalai/shardformer/policies/qwen2.py index 0adcdfdbd..823527df6 100644 --- a/colossalai/shardformer/policies/qwen2.py +++ b/colossalai/shardformer/policies/qwen2.py @@ -19,7 +19,7 @@ from colossalai.shardformer.layer import ( from ..modeling.qwen2 import ( Qwen2PipelineForwards, get_lm_forward_with_dist_cross_entropy, - get_qwen2_flash_attention_forward, + get_qwen2_flash_attention_npu_forward, get_qwen2_model_forward_for_flash_attn, ) @@ -304,7 +304,7 @@ class Qwen2Policy(Policy): if self.shard_config.enable_flash_attention or self.shard_config.enable_sequence_parallelism: self.append_or_create_method_replacement( description={ - "forward": get_qwen2_flash_attention_forward(self.shard_config, sp_mode, sp_size, sp_group), + "forward": get_qwen2_flash_attention_npu_forward(self.shard_config, sp_mode, sp_size, sp_group), }, policy=policy, target_key=attn_cls, From 2dd59c0ddd40a05bf95ae4e5e52e092b8b2d33bd Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 28 May 2025 11:35:35 +0800 Subject: [PATCH 28/34] [Ascend] Update README (#6331) * update readme * [fix] add vllm & vllm-ascend installation --------- Co-authored-by: Tong Li Co-authored-by: duanjunwen <935724073@qq.com> --- .../ColossalChat/coati/distributed/README.md | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index d7a947d06..de3f5a9ed 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -2,6 +2,8 @@ This repository implements a distributed Reinforcement Learning (RL) training framework designed to fine-tune large language models using algorithms such as **GRPO** and **DAPO**. It supports multi-node and multi-GPU setups, scalable rollout generation, and policy optimization using libraries like VLLM. +**Please note that we are still under intensive development, stay tuned.** + --- ## 🚀 Features @@ -28,6 +30,15 @@ pip install -e . cd ./applications/ColossalChat pip install -e . ``` + +Install vllm and vllm-ascend +```bash +apt update -y +apt install -y libnuma-dev +pip install vllm==0.7.3 +pip install vllm-ascend==0.7.3 --extra-index https://download.pytorch.org/whl/cpu/ +``` + Install Fuyao Ray. Please update CANN before install fuyao ray ```bash @@ -128,6 +139,23 @@ export HCCL_SOCKET_IFNAME=eno0 export RAY_COLLECTIVE_MEET_TIMEOUT_SECONDS=7200 ``` + +## Architecture Design + +
+<!-- architecture diagram image -->
+
+Producer-Consumer Pattern: a classic software design pattern used for managing resources, data, or tasks between two different processes or threads. + +* Producer: inference engine which rollouts out examples and saves them into a shared buffer. +* Consumer: training framework which takes training examples from the shared buffer and train the policy model. + +Key features for Producer-Consumer Pattern: +* Buffer: Acts as a shared queue where the producer adds data and the consumer removes data. +* Concurrency: Rollout and training can work concurrently. + ## 🧠 Data Format Each data sample in the training or evaluation `.jsonl` file should follow this format: @@ -329,5 +357,4 @@ python rl_example.py ``` ## Acknowledgement - ---- +Colossal-RL is a distributed version of ColossalChat and inspired by a few awesome open-source projects. We would like to express our gratitude to the Fuyao-ray team and the vllm-ascend team for their support throughout the development of the this project. We also thank the following awesome open-source projects and algorithms: GRPO, DAPO, TRL, Verl, OpenRLHF, StreamRL, Qwen, Logic-RL. From 2783ddecf52321249c4a4cccb118b542e46b10cc Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 28 May 2025 13:13:21 +0800 Subject: [PATCH 29/34] update to conform to json format --- applications/ColossalChat/coati/distributed/README.md | 2 +- applications/ColossalChat/rl_example.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index de3f5a9ed..ca6a49fb0 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -178,7 +178,7 @@ Each data sample in the training or evaluation `.jsonl` file should follow this | ---------------- | --------------------------------------- | ----------------- | | `--model` | Model path or identifier | `/path/to/model` | | `--dataset` | Path to training `.jsonl` | `/path/to/train_data.jsonl` | -| `--eval-dataset` | JSON of task\:eval\_dataset\_path pairs | `{'eval_1':'/path/to/eval_1.jsonl'}` | +| `--eval-dataset` | JSON of task\:eval\_dataset\_path pairs | `{"eval_1":"/path/to/eval_1.jsonl"}` | | `--project` | Project name | `Project1` | | `--num-episodes` | Number of training episodes | `1` | diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 0ba66c486..784ee36b0 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -23,7 +23,7 @@ if __name__ == "__main__": default=None, help="Evaluation dataset for each task, please use json format to specify the dataset for each task. \ For example: {'task1':'data_eval_task1.jsonl', 'task2':'data_eval_task2.jsonl'}, the jsonl file should be in the same format as the training dataset. 
\ - The key is the task name, and the value is the path to the jsonl file", + The key is the task name, and the value is the path to the jsonl file, please replace sinple quotes with double quotes to conform to json format.", ) parser.add_argument("-p", "--project", type=str, default="GRPO", help="Project name.") parser.add_argument("-e", "--num-episodes", type=int, default=1, help="Number of episodes to train.") From 1304be44ae52037aa8e9f7b32d6c0b9d0e7a446f Mon Sep 17 00:00:00 2001 From: duanjunwen <935724073@qq.com> Date: Wed, 4 Jun 2025 09:51:31 +0800 Subject: [PATCH 30/34] [Hotfix] fix requirsments (#6338) * [fix] fix colossalai ascend requirments * [fix] fix colossalai chat requirements * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [fix] fix requirments --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- applications/ColossalChat/requirements.txt | 6 ++---- requirements/requirements.txt | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/applications/ColossalChat/requirements.txt b/applications/ColossalChat/requirements.txt index e1b8291ab..df579f2a7 100755 --- a/applications/ColossalChat/requirements.txt +++ b/applications/ColossalChat/requirements.txt @@ -2,7 +2,6 @@ transformers==4.47.0 tqdm datasets==2.14.7 loralib -colossalai>=0.4.7 torch==2.5.1 langchain tokenizers @@ -15,11 +14,10 @@ packaging autoflake==2.2.1 black==23.9.1 tensorboard -six==1.16.0 +six==1.17.0 datasets ninja==1.11.1 -sentencepiece==0.1.99 -flash-attn +sentencepiece==0.2.0 tiktoken jsonlines math-verify==0.7.0 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index f357c45fd..9c110a1f4 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -8,7 +8,7 @@ click fabric contexttimer ninja -torch>=2.2.0,<=2.4.1 +torch==2.5.1 safetensors einops pydantic @@ -16,11 +16,11 @@ ray sentencepiece google protobuf -transformers==4.39.3 +transformers==4.47.0 peft>=0.7.1,<=0.13.2 bitsandbytes>=0.39.0 rpyc==6.0.0 fastapi -uvicorn==0.29.0 +uvicorn galore_torch diffusers==0.29.0 From be5acb02d91311148f71093e0ac4dd0bec1a1b20 Mon Sep 17 00:00:00 2001 From: xysheng-colossal Date: Mon, 23 Jun 2025 11:49:13 +0800 Subject: [PATCH 31/34] [feat][npu] Merge form grpo-latest (#6346) * move prompt-level-filtering to buffer side * move prompt-level-filtering to buffer side * remove redundant code and fix bugs * fix metric calculation * fix missing tags parameter * address conversation * add overlength sample count (#6332) Co-authored-by: Tong Li * address conversation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix typ and parameter description * [feat] Update requriments and set return logits False --------- Co-authored-by: YeAnbang Co-authored-by: Tong Li Co-authored-by: Tong Li Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../coati/distributed/consumer.py | 23 +++++--- .../coati/distributed/grpo_consumer.py | 55 ++++++++++++++++++- .../ColossalChat/coati/distributed/launch.py | 6 +- .../coati/distributed/producer.py | 17 +++++- applications/ColossalChat/requirements.txt | 2 +- applications/ColossalChat/rl_example.py | 14 +++-- colossalai/shardformer/modeling/qwen2.py | 20 +++---- requirements/requirements.txt | 8 ++- 8 files changed, 106 insertions(+), 39 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py 
b/applications/ColossalChat/coati/distributed/consumer.py index b1b791e6e..9926f0cdf 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -128,16 +128,21 @@ class BaseConsumer: # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), # we need to calculate the metrics before filtering here for logging # [batch_size, num_generations, ...] -> [batch_size * num_generations, ...] - raw_batch = { + raw_batch_with_reward = self.calculate_reward( + {k: v.view(-1, v.size(-1)) if k != "temperature" else v for k, v in raw_batch.items()} + ) + raw_batch_with_reward = { k: v.view(-1, self.num_generations, v.size(-1)) if k != "temperature" else v - for k, v in raw_batch.items() + for k, v in raw_batch_with_reward.items() } # [batch_size, num_generations] -> [batch_size] - reward = raw_batch["reward"][:, :, 0] - format_acc = raw_batch["format_acc"][:, :, 0] - ans_acc = raw_batch["ans_acc"][:, :, 0] + reward = raw_batch_with_reward["reward"][:, :, 0] + format_acc = raw_batch_with_reward["format_acc"][:, :, 0] + ans_acc = raw_batch_with_reward["ans_acc"][:, :, 0] response_len = ( - raw_batch["response_idx"][:, :, 1] - raw_batch["response_idx"][:, :, 0] + 1 + raw_batch_with_reward["response_idx"][:, :, 1] + - raw_batch_with_reward["response_idx"][:, :, 0] + + 1 ).type(torch.float32) effective_group_mask = None if self.filter_range is not None and self.grpo_config.get("dynamic_batching", True): @@ -146,8 +151,8 @@ class BaseConsumer: effective_group_mask = torch.logical_and( group_ans_acc_mean > self.filter_range[0], group_ans_acc_mean < self.filter_range[1] ) - raw_batch = unbind_batch(raw_batch) # List[Dict[str, torch.Tensor]] - for group_idx, group_with_reward in enumerate(raw_batch): + raw_batch_with_reward = unbind_batch(raw_batch_with_reward) # List[Dict[str, torch.Tensor]] + for group_idx, group_with_reward in enumerate(raw_batch_with_reward): self.buffer.append( [ ( @@ -163,7 +168,7 @@ class BaseConsumer: ) if effective_group_mask is not None: print( - f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch)} -> {torch.sum(effective_group_mask).cpu().item()} effective groups" + f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch_with_reward)} -> {torch.sum(effective_group_mask).cpu().item()} effective groups" ) # mapping the effective group to the raw group for indexing effective_group_to_raw_group_mapping = {} diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 8d50734a9..cbe15c496 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -1,12 +1,14 @@ from contextlib import nullcontext -from typing import Any, Optional +from typing import Any, Dict, Optional import ray import torch import wandb from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss -from coati.distributed.utils import memory_efficient_logprob +from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn +from coati.distributed.reward.verifiable_reward import VerifiableReward +from coati.distributed.utils import calc_action_log_probs from coati.trainer.utils import all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer @@ -117,7 +119,20 @@ class GRPOConsumer(BaseConsumer): "either max_tokens (vllm) or 
max_new_tokens (transformers) must be set in generate_config." ) # Initialize verifiable reward. - grpo_config.get("response_format_tags", None) + response_format_tags = grpo_config.get("response_format_tags", None) + reward_model_kwargs = { + k: v + for k, v in grpo_config.items() + if k in ["soft_over_length_punishment", "max_new_tokens", "cache_length"] + } + self.reward_model = VerifiableReward( + reward_fns=[ + math_reward_fn if grpo_config.get("reward_fn_type") == "think_answer_tags" else boxed_math_reward_fn + ], + tokenizer=self.tokenizer, + tags=response_format_tags, + **reward_model_kwargs, + ) self.global_step = 0 self.lr_scheduler = CosineAnnealingWarmupLR( @@ -486,6 +501,40 @@ class GRPOConsumer(BaseConsumer): else: return None + def calculate_reward(self, rollout: Dict[str, Any]) -> Dict[str, Any]: + """ + Calculate the group reward for the given rollout group. + + Args: + rollout_group (Dict[str, Any]): + a group of samples generated by the model from the same prompt + contain the following keys: + "input_ids": torch.Tensor, [num_of_generation, prompt_length + response_length] + "attention_mask": torch.Tensor, [num_of_generation, prompt_length + response_length] + "action_mask": torch.Tensor, [num_of_generation, response_length] + "action_log_probs": torch.Tensor, [num_of_generation, response_length] + "response_idx": int, torch.Tensor, [num_of_generation, 2] + "gt_answer": torch.Tensor, [num_of_generation, 128] + "temperature": torch.Tensor, [] (scalar) + + Returns: + Dict[str, Any]: The new group data with calculated reward. + """ + reward_model_output = self.reward_model( + rollout["input_ids"], + gt_answer=rollout["gt_answer"], + response_idx=rollout["response_idx"], + ) + # [num_of_generation] + reward = torch.tensor([value[0] for value in reward_model_output]).to(rollout["input_ids"].device) + format_acc = torch.tensor([value[1] for value in reward_model_output]).to(rollout["input_ids"].device) + ans_acc = torch.tensor([value[2] for value in reward_model_output]).to(rollout["input_ids"].device) + + rollout["reward"] = reward.view((-1, 1)) + rollout["format_acc"] = format_acc.view((-1, 1)) + rollout["ans_acc"] = ans_acc.view((-1, 1)) + return rollout + def state_dict(self): self.policy_model._force_wait_all_gather() model = self.policy_model.unwrap() diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 1a3481af9..cb8ccb172 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -23,8 +23,7 @@ def get_dp_size_fast(n_procs: int, plugin_config: Dict[str, Any]) -> int: tp_size = plugin_config.get("tp_size", 1) pp_size = plugin_config.get("pp_size", 1) ep_size = plugin_config.get("ep_size", 1) - sp_size = plugin_config.get("sp_size", 1) - return n_procs // (tp_size * pp_size * ep_size * sp_size) + return n_procs // (tp_size * pp_size * ep_size) def launch_distributed( @@ -130,7 +129,8 @@ def launch_distributed( consumer_plugin_config=plugin_config, eval_dataset_config=eval_dataset_config, eval_interval=eval_interval, - grpo_config=grpo_config, + evaluation_function_type=grpo_config["reward_fn_type"], + response_format_tags=grpo_config["response_format_tags"], eval_save_dir=eval_save_dir, eval_generation_config=eval_generation_config, project_name=project_name, diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 739fc0f0b..01527a7e5 100644 --- 
a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -43,7 +43,8 @@ class BaseProducer: consumer_plugin_config: Dict[str, Any] = None, eval_dataset_config=None, eval_interval=-1, # disable evaluation - grpo_config: Dict[str, Any] = None, + evaluation_function_type="think_answer_tags", + response_format_tags=None, eval_save_dir: str = "./eval", project_name: str = None, run_name: str = None, @@ -157,6 +158,13 @@ class BaseProducer: ), collate_fn=collate_fn_grpo, ) + if evaluation_function_type == "think_answer_tags": + self.evaluation_function = math_reward_fn + elif evaluation_function_type == "boxed": + self.evaluation_function = boxed_math_reward_fn + else: + raise ValueError(f"Unknown evaluation function type {evaluation_function_type}") + self.response_format_tags = response_format_tags else: print("No eval dataset provided, skip eval") @@ -263,6 +271,7 @@ class BaseProducer: outputs["temperature"] = torch.tensor( [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) + print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") ray_broadcast_tensor_dict( outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" ) @@ -334,7 +343,8 @@ class SimpleProducer(BaseProducer): consumer_plugin_config=None, eval_dataset_config=None, eval_interval=-1, # disable evaluation - grpo_config: Dict[str, Any] = None, + evaluation_function_type="think_answer_tags", + response_format_tags=None, eval_save_dir: str = "./eval", eval_generation_config={}, project_name: str = None, @@ -358,7 +368,8 @@ class SimpleProducer(BaseProducer): consumer_plugin_config, eval_dataset_config=eval_dataset_config, eval_interval=eval_interval, - grpo_config=grpo_config, + evaluation_function_type=evaluation_function_type, + response_format_tags=response_format_tags, eval_save_dir=eval_save_dir, project_name=project_name, run_name=run_name, diff --git a/applications/ColossalChat/requirements.txt b/applications/ColossalChat/requirements.txt index df579f2a7..3d913ebeb 100755 --- a/applications/ColossalChat/requirements.txt +++ b/applications/ColossalChat/requirements.txt @@ -1,4 +1,3 @@ -transformers==4.47.0 tqdm datasets==2.14.7 loralib @@ -26,3 +25,4 @@ math-verify==0.7.0 # torch_npu==2.5.1 # fuyao-ray==2.43.0 # vllm-ascend==0.7.3 +# transformers==4.47.0 diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 784ee36b0..9f6e895e4 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -240,7 +240,7 @@ if __name__ == "__main__": ) generate_config.update( dict( - max_tokens=args.max_new_tokens, # max new tokens + max_tokens=args.max_new_tokens + args.max_prompt_tokens, # max new tokens ignore_eos=True if args.reward_type == "think_answer_tags" else False, include_stop_str_in_output=True, stop=[""] if args.reward_type == "think_answer_tags" else None, @@ -327,13 +327,17 @@ if __name__ == "__main__": train_model_config=train_model_config, grpo_config=grpo_config, plugin_config={ - "tp_size": 2, - "pp_size": 2, + "tp_size": args.tensor_parallel_size, + "pp_size": args.pipeline_parallel_size, "microbatch_size": max( - 1, args.train_microbatch_size // 2 + 1, args.train_microbatch_size // args.pipeline_parallel_size ), # microbatch size should be set to train_microbatch_size // pp_size - "zero_stage": 1, + "zero_stage": args.zero_stage, "max_norm": 1.0, + 
"enable_flash_attention": True, + "sp_size": args.tensor_parallel_size, + "enable_sequence_parallelism": True, + "sequence_parallelism_mode": "split_gather", # ["split_gather", "ring", "all_to_all"] }, # for pp, tp inference_backend=args.backend, master_addr="localhost", diff --git a/colossalai/shardformer/modeling/qwen2.py b/colossalai/shardformer/modeling/qwen2.py index 332563684..de838185d 100644 --- a/colossalai/shardformer/modeling/qwen2.py +++ b/colossalai/shardformer/modeling/qwen2.py @@ -132,7 +132,12 @@ class Qwen2PipelineForwards: else: position_ids = position_ids.view(-1, seq_length).long() - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + if ( + not shard_config.enable_flash_attention + and attention_mask is not None + and self._attn_implementation == "flash_attention_2" + and use_cache + ): is_padding_right = attention_mask[:, -1].sum().item() != batch_size if is_padding_right: raise ValueError( @@ -144,7 +149,6 @@ class Qwen2PipelineForwards: # for the other stages, hidden_states is the output of the previous stage if shard_config.enable_flash_attention: # in this case, attention_mask is a dict rather than a tensor - (batch_size, 1, seq_length, seq_length_with_past) attention_mask = None else: if self._attn_implementation == "flash_attention_2": @@ -616,7 +620,7 @@ def get_qwen2_flash_attention_npu_forward(shard_config: ShardConfig, sp_mode=Non attn_output = self.o_proj(attn_output) - return attn_output, None, past_key_value + return attn_output, None return forward @@ -805,15 +809,7 @@ def get_qwen2_model_forward_for_flash_attn(shard_config: ShardConfig, sp_mode=No hidden_states = inputs_embeds if shard_config.enable_flash_attention: - # in this case, attention_mask is a dict rather than a tensor - mask_shape = (batch_size, 1, seq_length, seq_length_with_past) - attention_mask = ColoAttention.prepare_attn_kwargs( - mask_shape, - hidden_states.dtype, - hidden_states.device, - q_padding_mask=attention_mask, - is_causal=True, - ) + attention_mask = None else: attention_mask = _prepare_4d_causal_attention_mask( attention_mask, diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 9c110a1f4..e459e28d1 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -8,15 +8,12 @@ click fabric contexttimer ninja -torch==2.5.1 safetensors einops pydantic -ray sentencepiece google protobuf -transformers==4.47.0 peft>=0.7.1,<=0.13.2 bitsandbytes>=0.39.0 rpyc==6.0.0 @@ -24,3 +21,8 @@ fastapi uvicorn galore_torch diffusers==0.29.0 + +# The following packages be built into the image. 
+# torch==2.5.1 +# ray +# transformers==4.47.0 From 7891cc7ddbf877f96e715603c2a097887440f727 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Jun 2025 06:40:09 +0000 Subject: [PATCH 32/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 1 - applications/ColossalChat/coati/distributed/producer.py | 1 - 2 files changed, 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index cbe15c496..a48f87224 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -8,7 +8,6 @@ from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn from coati.distributed.reward.verifiable_reward import VerifiableReward -from coati.distributed.utils import calc_action_log_probs from coati.trainer.utils import all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 01527a7e5..9fdaed058 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -10,7 +10,6 @@ import tqdm import wandb from coati.dataset.loader import RawConversationDataset, collate_fn_grpo from coati.distributed.reward.reward_fn import boxed_math_reward_fn, code_reward_fn, math_reward_fn -from coati.distributed.reward.verifiable_reward import VerifiableReward from ray.util.collective import allreduce from ray.util.collective.types import ReduceOp from torch.utils.data import DataLoader, DistributedSampler From bd8b57156bb4de3fb40ff7edb06fac72757825a7 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Wed, 25 Jun 2025 09:48:50 +0800 Subject: [PATCH 33/34] fix pp --- colossalai/shardformer/policies/qwen2.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/colossalai/shardformer/policies/qwen2.py b/colossalai/shardformer/policies/qwen2.py index 823527df6..0c6d43524 100644 --- a/colossalai/shardformer/policies/qwen2.py +++ b/colossalai/shardformer/policies/qwen2.py @@ -302,6 +302,15 @@ class Qwen2Policy(Policy): ) if self.shard_config.enable_flash_attention or self.shard_config.enable_sequence_parallelism: + if not self.shard_config.enable_tensor_parallelism: + decoder_attribute_replacement = { + "self_attn.hidden_size": self.model.config.hidden_size, + "self_attn.num_heads": self.model.config.num_attention_heads, + } + if getattr(self.model.config, "num_key_value_heads", False): + decoder_attribute_replacement["self_attn.num_key_value_heads"] = self.model.config.num_key_value_heads + policy[Qwen2DecoderLayer] = ModulePolicyDescription(attribute_replacement=decoder_attribute_replacement) + self.append_or_create_method_replacement( description={ "forward": get_qwen2_flash_attention_npu_forward(self.shard_config, sp_mode, sp_size, sp_group), From a14bd0b4112186878ecf90d93bea4852ed4ae626 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Jun 2025 01:49:52 +0000 Subject: [PATCH 34/34] [pre-commit.ci] auto fixes from pre-commit.com 
hooks for more information, see https://pre-commit.ci --- colossalai/shardformer/policies/qwen2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/colossalai/shardformer/policies/qwen2.py b/colossalai/shardformer/policies/qwen2.py index 0c6d43524..d2798a23e 100644 --- a/colossalai/shardformer/policies/qwen2.py +++ b/colossalai/shardformer/policies/qwen2.py @@ -308,9 +308,11 @@ class Qwen2Policy(Policy): "self_attn.num_heads": self.model.config.num_attention_heads, } if getattr(self.model.config, "num_key_value_heads", False): - decoder_attribute_replacement["self_attn.num_key_value_heads"] = self.model.config.num_key_value_heads + decoder_attribute_replacement["self_attn.num_key_value_heads"] = ( + self.model.config.num_key_value_heads + ) policy[Qwen2DecoderLayer] = ModulePolicyDescription(attribute_replacement=decoder_attribute_replacement) - + self.append_or_create_method_replacement( description={ "forward": get_qwen2_flash_attention_npu_forward(self.shard_config, sp_mode, sp_size, sp_group),