diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py
index c6ae7be2d..6372e2c3a 100644
--- a/applications/ColossalChat/coati/distributed/consumer.py
+++ b/applications/ColossalChat/coati/distributed/consumer.py
@@ -72,6 +72,8 @@ class BaseConsumer:
             self.plugin = HybridParallelPlugin(**plugin_config)
             self.booster = Booster(plugin=self.plugin)
             self.dp_rank = dist.get_rank(self.plugin.dp_group)
+            self.tp_rank = dist.get_rank(self.plugin.tp_group)
+            self.dp_size = dist.get_world_size(self.plugin.dp_group)

         self.buffer = []
@@ -132,6 +134,8 @@ class BaseConsumer:
                     ray_broadcast_tensor_dict(
                         state_dict, src=self.num_producers, device=self.device, group_name="sync_model"
                     )
+                    del state_dict
+                    torch.cuda.empty_cache()


 @ray.remote
diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py
index a282439cb..68a01528e 100644
--- a/applications/ColossalChat/coati/distributed/grpo_consumer.py
+++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py
@@ -109,7 +109,7 @@ class GRPOConsumer(BaseConsumer):
         super().setup()
         if self.use_wandb and (
             (not self.plugin.pp_size > 1 and self.rank == 0)
-            or (self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage())
+            or (self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0)
         ):
             # Initialize wandb.
             name = f"{self.generate_config['backend']}_bs_{self.batch_size*self.dp_size}_temp_{self.generate_config['temperature']:.01f}_top_p_{self.generate_config['top_p']:.02f}"
@@ -282,10 +282,10 @@ class GRPOConsumer(BaseConsumer):

                 if self.booster.plugin.stage_manager.is_last_stage():
                     if len(kl) > 0:
-                        kl = all_reduce_mean(torch.mean(torch.stack(kl)).to(loss.device), self.plugin)
+                        kl = all_reduce_mean(torch.mean(torch.stack(kl)).to(loss.device), self.plugin).data
                         mean_kl.append(kl)
-                    loss = all_reduce_mean(loss, self.plugin)
-                    mean_loss.append(loss.data)
+                    mean_loss.append(all_reduce_mean(loss, self.plugin).data)
+                torch.cuda.empty_cache()
             else:
                 policy_model_logits = self.policy_model(
@@ -336,7 +336,7 @@ class GRPOConsumer(BaseConsumer):
                 mean_kl.append(kl.data)
                 mean_loss.append(loss.data)
         if not self.plugin.pp_size > 1 or (
-            self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage()
+            self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0
         ):
             reward = all_reduce_mean(reward.mean(), self.plugin)
             format_reward = all_reduce_mean(format_reward.mean(), self.plugin)
@@ -355,11 +355,11 @@ class GRPOConsumer(BaseConsumer):
             self.optimizer.step()
             self.optimizer.zero_grad()
             if not self.plugin.pp_size > 1 or (
-                self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage()
+                self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0
             ):
                 loss_scalar = self.accum_loss.item()
                 if (not self.plugin.pp_size > 1 and self.rank == 0) or (
-                    self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage()
+                    self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0
                 ):
                     print(
                         "Loss:",
diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py
index f87f12ed2..6c43ccd19 100644
--- a/applications/ColossalChat/rl_example.py
+++ b/applications/ColossalChat/rl_example.py
@@ -121,7 +121,7 @@ if __name__ == "__main__":
         #   plugin_config={},  # for zero
         plugin_config={
             "pp_size": 2,
-            "tp_size": 1,
+            "tp_size": 2,
             "microbatch_size": args.train_microbatch_size // 2,
             "zero_stage": 0,
             "max_norm": 1.0,