Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-24 19:17:30 +00:00
Develop/experiments (#59)
* Add gradient accumulation, fix lr scheduler
* fix FP16 optimizer and adapted torch amp with tensor parallel (#18)
* fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes
* fixed trainer
* Revert "fixed trainer". This reverts commit 2e0b0b7699.
* improved consistency between trainer, engine and schedule (#23)
  Co-authored-by: 1SAA <c2h214748@gmail.com>
* Split conv2d, class token, positional embedding in 2d; fix random number in ddp; fix convergence in cifar10, Imagenet1000
* Integrate 1d tensor parallel in Colossal-AI (#39)
* fixed 1D and 2D convergence (#38)
* optimized 2D operations
* fixed 1D ViT convergence problem
* Feature/ddp (#49)
* remove redundancy func in setup (#19) (#20)
* use env to control the language of doc (#24) (#25)
* Support TP-compatible Torch AMP and Update trainer API (#27)
* Add gradient accumulation, fix lr scheduler
* fix FP16 optimizer and adapted torch amp with tensor parallel (#18)
* fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes
* fixed trainer
* Revert "fixed trainer". This reverts commit 2e0b0b7699.
* improved consistency between trainer, engine and schedule (#23)
  Co-authored-by: 1SAA <c2h214748@gmail.com>
  Co-authored-by: 1SAA <c2h214748@gmail.com>
  Co-authored-by: ver217 <lhx0217@gmail.com>
* add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29)
* add explanation for ViT example (#35) (#36)
* support torch ddp
* fix loss accumulation
* add log for ddp
* change seed
* modify timing hook
  Co-authored-by: Frank Lee <somerlee.9@gmail.com>
  Co-authored-by: 1SAA <c2h214748@gmail.com>
  Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* Feature/pipeline (#40)
* remove redundancy func in setup (#19) (#20)
* use env to control the language of doc (#24) (#25)
* Support TP-compatible Torch AMP and Update trainer API (#27)
* Add gradient accumulation, fix lr scheduler
* fix FP16 optimizer and adapted torch amp with tensor parallel (#18)
* fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes
* fixed trainer
* Revert "fixed trainer". This reverts commit 2e0b0b7699.
* improved consistency between trainer, engine and schedule (#23)
  Co-authored-by: 1SAA <c2h214748@gmail.com>
  Co-authored-by: 1SAA <c2h214748@gmail.com>
  Co-authored-by: ver217 <lhx0217@gmail.com>
* add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29)
* add explanation for ViT example (#35) (#36)
* optimize communication of pipeline parallel
* fix grad clip for pipeline
  Co-authored-by: Frank Lee <somerlee.9@gmail.com>
  Co-authored-by: 1SAA <c2h214748@gmail.com>
  Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* optimized 3d layer to fix slow computation; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51)
* Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset
* update api for better usability (#58)

Co-authored-by: 1SAA <c2h214748@gmail.com>
Co-authored-by: ver217 <lhx0217@gmail.com>
Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com>
@@ -48,8 +48,10 @@ class DelayerScheduler(_LRScheduler):
        if self.finished:
            if epoch is None:
                self.after_scheduler.step(None)
                self._last_lr = self.after_scheduler.get_last_lr()
            else:
                self.after_scheduler.step(epoch - self.delay_epochs)
                self._last_lr = self.after_scheduler.get_last_lr()
        else:
            return super(DelayerScheduler, self).step(epoch)
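
The step() body above defers to the wrapped after_scheduler once the delay phase has finished and refreshes the cached learning rate from get_last_lr(). A minimal sketch of that delegation pattern using plain torch schedulers (ToyDelayer is a hypothetical stand-in for illustration, not the ColossalAI class):

    # Hypothetical toy wrapper illustrating the "delay, then delegate" pattern.
    import torch
    from torch.optim import SGD
    from torch.optim.lr_scheduler import ExponentialLR

    class ToyDelayer:
        def __init__(self, delay_epochs, after_scheduler):
            self.delay_epochs = delay_epochs
            self.after_scheduler = after_scheduler
            self.finished = False
            self._epoch = 0

        def step(self):
            self._epoch += 1
            if self._epoch > self.delay_epochs:
                self.finished = True
                self.after_scheduler.step()                          # delegate
                self._last_lr = self.after_scheduler.get_last_lr()   # stay in sync

    model = torch.nn.Linear(4, 4)
    opt = SGD(model.parameters(), lr=0.1)
    sched = ToyDelayer(delay_epochs=3, after_scheduler=ExponentialLR(opt, gamma=0.9))
    for _ in range(6):
        opt.step()
        sched.step()   # LR stays at 0.1 for 3 epochs, then decays by 0.9 per epoch
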
@@ -66,6 +68,7 @@ class WarmupScheduler(_LRScheduler):
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
    """

    def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1):
        self.warmup_epochs = int(warmup_epochs)
        self.after_scheduler = after_scheduler
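
Based only on the __init__ signature shown above, constructing the warmup wrapper would look roughly like this; the colossalai import path is an assumption and may differ between versions:

    # Hedged usage sketch; the import path below is assumed, not taken from the hunk.
    import torch
    from torch.optim.lr_scheduler import CosineAnnealingLR
    from colossalai.nn.lr_scheduler.delayed import WarmupScheduler  # assumed path

    model = torch.nn.Linear(8, 8)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    cosine = CosineAnnealingLR(optimizer, T_max=90)
    scheduler = WarmupScheduler(optimizer, warmup_epochs=10, after_scheduler=cosine)
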
@@ -85,8 +88,10 @@ class WarmupScheduler(_LRScheduler):
        if self.finished:
            if epoch is None:
                self.after_scheduler.step(None)
                self._last_lr = self.after_scheduler.get_last_lr()
            else:
                self.after_scheduler.step(epoch - self.warmup_epochs)
                self._last_lr = self.after_scheduler.get_last_lr()
        else:
            return super().step(epoch)
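
The `self._last_lr = self.after_scheduler.get_last_lr()` refresh keeps the wrapper's reported learning rate consistent after the call is delegated. A self-contained illustration of why that refresh matters, using plain torch (`cached_last_lr` stands in for the wrapper's `_last_lr` field):

    # Without refreshing after delegation, the cached value goes stale.
    import torch
    from torch.optim.lr_scheduler import ExponentialLR

    opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1.0)
    inner = ExponentialLR(opt, gamma=0.5)

    cached_last_lr = inner.get_last_lr()   # [1.0]
    opt.step()
    inner.step()                           # the delegated step halves the LR
    cached_last_lr = inner.get_last_lr()   # refresh -> [0.5]; a stale cache would
                                           # still report [1.0]
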
@@ -136,7 +141,9 @@ class WarmupDelayerScheduler(_LRScheduler):
        if self.finished:
            if epoch is None:
                self.after_scheduler.step(None)
                self._last_lr = self.after_scheduler.get_last_lr()
            else:
                self.after_scheduler.step(epoch - self.warmup_epochs)
                self._last_lr = self.after_scheduler.get_last_lr()
        else:
            return super().step(epoch)
@@ -12,7 +12,6 @@ class MultiStepLR(_MultiStepLR):
    number of epoch reaches one of the milestones. Notice that such decay can
    happen simultaneously with other changes to the learning rate from outside
    this scheduler. When last_epoch=-1, sets initial lr as lr.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: number of total training steps
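
The milestone-based decay documented here is the behaviour of torch's MultiStepLR, which the `_MultiStepLR` base presumably refers to; a plain-torch illustration (the ColossalAI wrapper additionally takes total_steps):

    # Milestone decay with torch's own MultiStepLR.
    import torch
    from torch.optim.lr_scheduler import MultiStepLR

    opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
    scheduler = MultiStepLR(opt, milestones=[30, 80], gamma=0.1)
    # lr = 0.1 for epochs 0-29, 0.01 for epochs 30-79, 0.001 from epoch 80 on
    for epoch in range(100):
        opt.step()
        scheduler.step()
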
@@ -34,7 +33,6 @@ class MultiStepLR(_MultiStepLR):
@LR_SCHEDULERS.register_module
class MultiStepWarmupLR(WarmupScheduler):
    """Multi-step learning rate scheduler with warmup.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: number of total training steps
@@ -12,28 +12,21 @@ class OneCycleLR(_OneCycleLR):
    than the initial learning rate.
    This policy was initially described in the paper `Super-Convergence:
    Very Fast Training of Neural Networks Using Large Learning Rates`_.

    The 1cycle learning rate policy changes the learning rate after every batch.
    `step` should be called after a batch has been used for training.

    This scheduler is not chainable.

    Note also that the total number of steps in the cycle can be determined in one
    of two ways (listed in order of precedence):

    #. A value for total_steps is explicitly provided.
    #. A number of epochs (epochs) and a number of steps per epoch
       (steps_per_epoch) are provided.
       In this case, the number of total steps is inferred by
       total_steps = epochs * steps_per_epoch

    You must either provide a value for total_steps or provide a value for both
    epochs and steps_per_epoch.

    The default behaviour of this scheduler follows the fastai implementation of 1cycle, which
    claims that "unpublished work has shown even better results by using only two phases". To
    mimic the behaviour of the original paper instead, set ``three_phase=True``.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: number of total training steps
@@ -71,7 +64,6 @@ class OneCycleLR(_OneCycleLR):
    number of *batches* computed, not the total number of epochs computed.
    When last_epoch=-1, the schedule is started from the beginning, defaults to -1
    :type last_epoch: int, optional

    .. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
        https://arxiv.org/abs/1708.07120
    """
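
Because the 1cycle policy changes the learning rate after every batch, step() belongs inside the batch loop, and total_steps is inferred as epochs * steps_per_epoch when both are given. A sketch with torch's OneCycleLR, which the `_OneCycleLR` base above presumably is:

    # Per-batch stepping; total_steps is inferred as epochs * steps_per_epoch.
    import torch
    from torch.optim.lr_scheduler import OneCycleLR

    model = torch.nn.Linear(8, 2)
    opt = torch.optim.SGD(model.parameters(), lr=0.01)
    epochs, steps_per_epoch = 10, 100
    scheduler = OneCycleLR(opt, max_lr=0.1, epochs=epochs, steps_per_epoch=steps_per_epoch)

    for epoch in range(epochs):
        for batch in range(steps_per_epoch):
            # forward / backward ...
            opt.step()
            scheduler.step()   # once per batch, not once per epoch
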
@@ -7,7 +7,6 @@ from .delayed import WarmupScheduler
@LR_SCHEDULERS.register_module
class PolynomialLR(_LRScheduler):
    """Polynomial learning rate scheduler.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: number of total training steps
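
For reference, a common polynomial-decay rule such a scheduler computes is lr_t = (base_lr - end_lr) * (1 - t / total_steps)^power + end_lr; the names end_lr and power below are assumptions for illustration, not taken from the hunk:

    # Hedged sketch of the usual polynomial-decay formula.
    def poly_lr(base_lr, end_lr, power, step, total_steps):
        step = min(step, total_steps)
        return (base_lr - end_lr) * (1 - step / total_steps) ** power + end_lr

    # e.g. base_lr=0.1, end_lr=0.0, power=2.0, total_steps=100:
    # step 0 -> 0.1, step 50 -> 0.025, step 100 -> 0.0
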
@@ -43,7 +42,6 @@ class PolynomialLR(_LRScheduler):
@LR_SCHEDULERS.register_module
class PolynomialWarmupLR(WarmupScheduler):
    """Polynomial learning rate scheduler with warmup.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: number of total training steps
@@ -10,7 +10,6 @@ from colossalai.registry import LR_SCHEDULERS
class LambdaLR(_LambdaLR):
    """Sets the learning rate of each parameter group to the initial lr
    times a given function. When last_epoch=-1, sets initial lr as lr.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: number of total training steps
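
The "initial lr times a given function" rule is torch's LambdaLR behaviour (the `_LambdaLR` base); for example:

    # LambdaLR scales the *initial* lr by lr_lambda(epoch).
    import torch
    from torch.optim.lr_scheduler import LambdaLR

    opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
    scheduler = LambdaLR(opt, lr_lambda=lambda epoch: 0.95 ** epoch)
    # epoch 0: 0.1, epoch 1: 0.095, epoch 2: 0.09025, ...
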
@@ -33,7 +32,6 @@ class LambdaLR(_LambdaLR):
class MultiplicativeLR(_MultiplicativeLR):
    """Multiply the learning rate of each parameter group by the factor given
    in the specified function. When last_epoch=-1, sets initial lr as lr

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: number of total training steps
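
By contrast, MultiplicativeLR applies the returned factor to the current learning rate at every step, so a constant factor gives geometric decay (again shown with the torch base class):

    # MultiplicativeLR multiplies the *current* lr by lr_lambda(epoch) each step.
    import torch
    from torch.optim.lr_scheduler import MultiplicativeLR

    opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
    scheduler = MultiplicativeLR(opt, lr_lambda=lambda epoch: 0.95)
    # after successive scheduler.step() calls: 0.1 -> 0.095 -> 0.09025 -> ...
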
@@ -58,7 +56,6 @@ class StepLR(_StepLR):
    step_size epochs. Notice that such decay can happen simultaneously with
    other changes to the learning rate from outside this scheduler. When
    last_epoch=-1, sets initial lr as lr

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: number of total training steps
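
The fixed-interval decay described above is the behaviour of torch's StepLR (the `_StepLR` base):

    # StepLR decays the lr by gamma every step_size epochs.
    import torch
    from torch.optim.lr_scheduler import StepLR

    opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
    scheduler = StepLR(opt, step_size=30, gamma=0.1)
    # lr = 0.1 for epochs 0-29, 0.01 for 30-59, 0.001 for 60-89, ...
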
@@ -82,7 +79,6 @@ class StepLR(_StepLR):
class ExponentialLR(_ExponentialLR):
    """Decays the learning rate of each parameter group by gamma every epoch.
    When last_epoch=-1, sets initial lr as lr

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: number of total training steps
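
Decaying by gamma every epoch means lr_t = lr_0 * gamma^t, as with torch's ExponentialLR (the `_ExponentialLR` base):

    # ExponentialLR: lr_t = lr_0 * gamma ** t.
    import torch
    from torch.optim.lr_scheduler import ExponentialLR

    opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
    scheduler = ExponentialLR(opt, gamma=0.9)
    # epoch 0: 0.1, epoch 1: 0.09, epoch 2: 0.081, ...
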