From 53834b74b95b9bc28c11e4824dd557a8783c199e Mon Sep 17 00:00:00 2001
From: YeAnbang
Date: Thu, 20 Feb 2025 18:24:04 +0800
Subject: [PATCH] fix num_train_step update

---
 applications/ColossalChat/coati/trainer/dpo.py  | 4 ++--
 applications/ColossalChat/coati/trainer/grpo.py | 4 ++--
 applications/ColossalChat/coati/trainer/kto.py  | 2 +-
 applications/ColossalChat/coati/trainer/orpo.py | 2 +-
 applications/ColossalChat/coati/trainer/ppo.py  | 4 ++--
 applications/ColossalChat/coati/trainer/rm.py   | 2 +-
 applications/ColossalChat/coati/trainer/sft.py  | 2 +-
 7 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/applications/ColossalChat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py
index cde13d41e..2854c81da 100755
--- a/applications/ColossalChat/coati/trainer/dpo.py
+++ b/applications/ColossalChat/coati/trainer/dpo.py
@@ -380,9 +380,9 @@ class DPOTrainer(SLTrainer):
                         self.accumulative_meter.get("accuracy"),
                         global_step,
                     )
-                    self.num_train_step += 1
                     self.accumulative_meter.reset()
-
+                    self.num_train_step += 1
+
                 if self.save_dir is not None and self.num_train_step > 0 and self.num_train_step % self.save_interval == 0:
                     # save checkpoint
                     self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
diff --git a/applications/ColossalChat/coati/trainer/grpo.py b/applications/ColossalChat/coati/trainer/grpo.py
index 08710b196..6fae5c1fe 100755
--- a/applications/ColossalChat/coati/trainer/grpo.py
+++ b/applications/ColossalChat/coati/trainer/grpo.py
@@ -231,7 +231,6 @@ class GRPOTrainer(OLTrainer):
         experience:
             sequences: [batch_size, prompt_length + response_length] --- ............
         """
-        self.num_train_step += 1
         self.actor.train()
         num_actions = experience.action_log_probs.size(1)
         # policy loss
@@ -294,7 +293,7 @@ class GRPOTrainer(OLTrainer):
             self.temperature_annealing_scheduler.step_forward()
 
         # preparing logging model output and corresponding rewards.
-        if self.num_train_step % 10 == 1:
+        if self.num_train_step % 10 == 0:
             response_text = self.experience_maker.tokenizer.batch_decode(
                 experience.sequences, skip_special_tokens=True
             )
@@ -327,6 +326,7 @@ class GRPOTrainer(OLTrainer):
             self.writer.add_scalar("approx_kl", self.accumulative_meter.get("kl"), global_step)
             self.writer.add_scalar("advantages", self.accumulative_meter.get("advantages"), global_step)
         self.accumulative_meter.reset()
+        self.num_train_step += 1
 
     def _learn(self, update_step: int):
         """
diff --git a/applications/ColossalChat/coati/trainer/kto.py b/applications/ColossalChat/coati/trainer/kto.py
index 2d7e2fa85..6dd1ed407 100755
--- a/applications/ColossalChat/coati/trainer/kto.py
+++ b/applications/ColossalChat/coati/trainer/kto.py
@@ -256,7 +256,7 @@ class KTOTrainer(SLTrainer):
                     self.coordinator.print_on_master(
                         f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
                     )
-                self.num_train_step += 1
+            self.num_train_step += 1
 
         step_bar.close()
 
diff --git a/applications/ColossalChat/coati/trainer/orpo.py b/applications/ColossalChat/coati/trainer/orpo.py
index 0224c8f34..1de9f527c 100644
--- a/applications/ColossalChat/coati/trainer/orpo.py
+++ b/applications/ColossalChat/coati/trainer/orpo.py
@@ -233,7 +233,7 @@ class ORPOTrainer(SLTrainer):
                     self.coordinator.print_on_master(
                         f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
                     )
-                self.num_train_step += 1
+            self.num_train_step += 1
 
         step_bar.close()
 
diff --git a/applications/ColossalChat/coati/trainer/ppo.py b/applications/ColossalChat/coati/trainer/ppo.py
index 331425174..54630a6b7 100755
--- a/applications/ColossalChat/coati/trainer/ppo.py
+++ b/applications/ColossalChat/coati/trainer/ppo.py
@@ -220,7 +220,6 @@ class PPOTrainer(OLTrainer):
         experience:
             sequences: [batch_size, prompt_length + response_length] --- ............
         """
-        self.num_train_step += 1
         self.actor.train()
         self.critic.train()
         num_actions = experience.action_log_probs.size(1)
@@ -294,7 +293,7 @@ class PPOTrainer(OLTrainer):
             self.critic_scheduler.step()
 
         # preparing logging model output and corresponding rewards.
-        if self.num_train_step % 10 == 1:
+        if self.num_train_step % 10 == 0:
             response_text = self.experience_maker.tokenizer.batch_decode(
                 experience.sequences, skip_special_tokens=True
             )
@@ -336,6 +335,7 @@ class PPOTrainer(OLTrainer):
             self.writer.add_scalar("value", self.accumulative_meter.get("value"), self.num_train_step)
             self.writer.add_scalar("advantages", self.accumulative_meter.get("advantages"), self.num_train_step)
         self.accumulative_meter.reset()
+        self.num_train_step += 1
 
     def _learn(self, update_step: int):
         """
diff --git a/applications/ColossalChat/coati/trainer/rm.py b/applications/ColossalChat/coati/trainer/rm.py
index 991167a91..f52b22e8e 100755
--- a/applications/ColossalChat/coati/trainer/rm.py
+++ b/applications/ColossalChat/coati/trainer/rm.py
@@ -193,7 +193,7 @@ class RewardModelTrainer(SLTrainer):
                     self.coordinator.print_on_master(
                         f"Saved checkpoint at epoch {epoch} step {(i + 1)/self.accumulation_steps} at folder {self.save_dir}"
                     )
-                self.num_train_step += 1
+            self.num_train_step += 1
         step_bar.close()
 
     def _eval(self, epoch):
diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py
index fe7f4978b..b121369d0 100755
--- a/applications/ColossalChat/coati/trainer/sft.py
+++ b/applications/ColossalChat/coati/trainer/sft.py
@@ -152,9 +152,9 @@ class SFTTrainer(SLTrainer):
                 if self.writer:
                     self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), global_step)
                     self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], global_step)
-                    self.num_train_step += 1
                 self.accumulative_meter.reset()
                 step_bar.update()
+            self.num_train_step += 1
 
             # Save checkpoint
             if (
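
Reviewer note (not part of the patch): the common thread across these trainers is that
the `self.num_train_step += 1` increment now runs after the logging and
`accumulative_meter.reset()` block (or, in grpo.py/ppo.py, at the end of the training
step instead of its start), and the `if self.num_train_step % 10 == 1:` logging checks
become `% 10 == 0` to match the resulting 0-based count. A minimal, self-contained
sketch of that ordering follows; the names `run`, `train_one_step`, `log_every`, and
`save_every` are hypothetical and are not taken from the repository:

    def train_one_step() -> float:
        # Stand-in for one forward/backward/optimizer step; returns a loss value.
        return 0.0

    def run(total_steps: int, log_every: int = 10, save_every: int = 100) -> None:
        num_train_step = 0  # counts completed steps, 0-based
        for _ in range(total_steps):
            loss = train_one_step()
            # Step-indexed logic sees the not-yet-incremented count, mirroring the
            # `% 10 == 0` and `% save_interval == 0` checks in the patch above.
            if num_train_step % log_every == 0:
                print(f"step {num_train_step}: loss={loss:.4f}")
            if num_train_step > 0 and num_train_step % save_every == 0:
                print(f"step {num_train_step}: save checkpoint")
            num_train_step += 1  # increment once, at the end of the step

    if __name__ == "__main__":
        run(total_steps=3)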