diff --git a/applications/Chat/README.md b/applications/Chat/README.md
index 9441a733a..2a9c916d4 100644
--- a/applications/Chat/README.md
+++ b/applications/Chat/README.md
@@ -251,7 +251,7 @@ trainer = SFTTrainer(model=model,
     eval_dataloader=eval_dataloader,
     batch_size=args.batch_size,
     max_epochs=args.max_epochs,
-    accimulation_steps = args.accimulation_steps
+    accumulation_steps = args.accumulation_steps
 )

 trainer.fit()
@@ -278,7 +278,7 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
     --save_path /path/to/Coati-7B \
     --dataset /path/to/data.json \
     --batch_size 1 \
-    --accimulation_steps 8 \
+    --accumulation_steps 8 \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1 \
@@ -296,7 +296,7 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
     --save_path /path/to/Coati-7B \
     --dataset /path/to/data.json \
     --batch_size 1 \
-    --accimulation_steps 8 \
+    --accumulation_steps 8 \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1 \
@@ -313,7 +313,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
     --save_path /path/to/Coati-7B \
     --dataset /path/to/data.json \
     --batch_size 1 \
-    --accimulation_steps 8 \
+    --accumulation_steps 8 \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1 \
diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py
index 0c09f4151..63fde5395 100644
--- a/applications/Chat/coati/trainer/sft.py
+++ b/applications/Chat/coati/trainer/sft.py
@@ -41,10 +41,10 @@ class SFTTrainer(Trainer):
         train_dataloader: DataLoader,
         eval_dataloader: DataLoader = None,
         max_epochs: int = 2,
-        accimulation_steps: int = 8,
+        accumulation_steps: int = 8,
         callbacks: List[Callback] = [],
     ) -> None:
-        if accimulation_steps > 1 and isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3:
+        if accumulation_steps > 1 and isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3:
             raise ValueError("Accumulation steps are not supported in stage 3 of ColossalAI")
         super().__init__(strategy, max_epochs, callbacks=callbacks)
         self.train_dataloader = train_dataloader
@@ -52,8 +52,8 @@ class SFTTrainer(Trainer):
         self.model = model
         self.optimizer = optim

-        self.accimulation_steps = accimulation_steps
-        num_update_steps_per_epoch = len(train_dataloader) // self.accimulation_steps
+        self.accumulation_steps = accumulation_steps
+        num_update_steps_per_epoch = len(train_dataloader) // self.accumulation_steps
         max_steps = math.ceil(self.max_epochs * num_update_steps_per_epoch)

         self.scheduler = get_scheduler("cosine",
@@ -67,7 +67,7 @@ class SFTTrainer(Trainer):
             wandb.watch(self.model)
         total_loss = 0
         # epoch_bar = tqdm(range(self.epochs), desc='Epochs', disable=not is_rank_0())
-        step_bar = tqdm(range(len(self.train_dataloader) // self.accimulation_steps * self.max_epochs),
+        step_bar = tqdm(range(len(self.train_dataloader) // self.accumulation_steps * self.max_epochs),
                         desc=f'steps',
                         disable=not is_rank_0())
         for epoch in range(self.max_epochs):
@@ -85,20 +85,20 @@ class SFTTrainer(Trainer):
                 if loss >= 2.5 and is_rank_0():
                     logger.warning(f"batch_id:{batch_id}, abnormal loss: {loss}")

-                loss = loss / self.accimulation_steps
+                loss = loss / self.accumulation_steps

                 self.strategy.backward(loss, self.model, self.optimizer)
                 total_loss += loss.item()

                 # gradient accumulation
-                if (batch_id + 1) % self.accimulation_steps == 0:
+                if (batch_id + 1) % self.accumulation_steps == 0:
                     self.strategy.optimizer_step(self.optimizer)
                     self.optimizer.zero_grad()
                     self.scheduler.step()

                     if is_rank_0() and use_wandb:
                         wandb.log({
-                            "loss": total_loss / self.accimulation_steps,
+                            "loss": total_loss / self.accumulation_steps,
                             "lr": self.scheduler.get_last_lr()[0],
                             "epoch": epoch,
                             "batch_id": batch_id
diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md
index e76007147..3e85bfe2d 100644
--- a/applications/Chat/examples/README.md
+++ b/applications/Chat/examples/README.md
@@ -62,7 +62,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
     --save_path /path/to/Coati-7B \
     --dataset /path/to/data.json \
     --batch_size 4 \
-    --accimulation_steps 8 \
+    --accumulation_steps 8 \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1 \
diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py
index fcc65e244..9bd0ebc12 100644
--- a/applications/Chat/examples/community/peft/train_peft_sft.py
+++ b/applications/Chat/examples/community/peft/train_peft_sft.py
@@ -154,7 +154,7 @@ def train(args):
                          eval_dataloader=eval_dataloader,
                          batch_size=args.batch_size,
                          max_epochs=args.max_epochs,
-                         accimulation_steps=args.accimulation_steps)
+                         accumulation_steps=args.accumulation_steps)

     trainer.fit(logger=logger, log_interval=args.log_interval)

@@ -183,7 +183,7 @@ if __name__ == '__main__':
     parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
     parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log")
     parser.add_argument('--lr', type=float, default=5e-6)
-    parser.add_argument('--accimulation_steps', type=int, default=8)
+    parser.add_argument('--accumulation_steps', type=int, default=8)
     parser.add_argument('--enable_peft_lora', action='store_true', default=False)
     parser.add_argument("--is_short_text", action='store_true', default=False)
     args = parser.parse_args()
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index 96914644d..da499f068 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -159,7 +159,7 @@ def train(args):
                          train_dataloader=train_dataloader,
                          eval_dataloader=eval_dataloader,
                          max_epochs=args.max_epochs,
-                         accimulation_steps=args.accimulation_steps)
+                         accumulation_steps=args.accumulation_steps)

     trainer.fit(logger=logger, use_wandb=args.use_wandb)

@@ -189,7 +189,7 @@ if __name__ == '__main__':
     parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
     parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log")
     parser.add_argument('--lr', type=float, default=5e-6)
-    parser.add_argument('--accimulation_steps', type=int, default=8)
+    parser.add_argument('--accumulation_steps', type=int, default=8)
     parser.add_argument('--use_wandb', default=False, action='store_true')
     parser.add_argument('--grad_checkpoint', default=False, action='store_true')
     args = parser.parse_args()
diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh
index 73710d1b1..c880f8582 100755
--- a/applications/Chat/examples/train_sft.sh
+++ b/applications/Chat/examples/train_sft.sh
@@ -6,7 +6,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
     --save_path /path/to/Coati-7B \
     --dataset /path/to/data.json \
     --batch_size 4 \
-    --accimulation_steps 8 \
+    --accumulation_steps 8 \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1 \