Mirror of https://github.com/hpcaitech/ColossalAI.git
Support TP-compatible Torch AMP and Update trainer API (#27)
* Add gradient accumulation, fix lr scheduler
* fix FP16 optimizer and adapted torch amp with tensor parallel (#18)
* fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes
* fixed trainer
* Revert "fixed trainer"
This reverts commit 2e0b0b7699.
* improved consistency between trainer, engine and schedule (#23)
Co-authored-by: 1SAA <c2h214748@gmail.com>
Co-authored-by: ver217 <lhx0217@gmail.com>
@@ -1,2 +1,10 @@
-from .builder import *
+from .builder import (build_schedule, build_lr_scheduler, build_model, build_optimizer, build_optimizer_wrapper,
+                      build_layer, build_loss, build_hooks, build_dataset, build_transform, build_data_sampler,
+                      build_gradient_handler)
+from .pipeline import ModelInitializer
 
+__all__ = [
+    'build_schedule', 'build_lr_scheduler', 'build_model', 'build_optimizer', 'build_optimizer_wrapper',
+    'build_layer', 'build_loss', 'build_hooks', 'build_dataset', 'build_transform', 'build_data_sampler',
+    'build_gradient_handler', 'ModelInitializer'
+]
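The switch from a wildcard re-export to explicit imports plus `__all__` pins down exactly which builder entry points the package exposes. A minimal sketch of the effect on downstream imports, assuming nothing beyond the names listed above:

    # The explicit export list means these names can be imported directly,
    # and a wildcard import of colossalai.builder is now limited to the same set.
    from colossalai.builder import build_lr_scheduler, build_optimizer_wrapper, ModelInitializer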
@@ -181,18 +181,6 @@ def build_transform(config):
     return build_from_registry(config, TRANSFORMS)
 
 
-def build_pipe_alloc_policy(config):
-    """Returns a pipeline allocation policy object constructed from `config`.
-
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: A pipeline allocation policy object
-    :rtype:
-    """
-    return build_from_registry(config, PIPE_ALLOC_POLICY)
-
-
 def build_data_sampler(config, dataset):
     """Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler`
     constructed from `config`.
@@ -235,7 +223,7 @@ def build_optimizer_wrapper(config, optimizer, model=None):
     return OPTIMIZER_WRAPPERS.get_module(mod_type)(optimizer, **config_)
 
 
-def build_lr_scheduler(config, optimizer, total_steps, num_steps_per_epoch):
+def build_lr_scheduler(config, optimizer):
     """Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
     constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`.
 
@@ -254,9 +242,16 @@ def build_lr_scheduler(config, optimizer, total_steps, num_steps_per_epoch):
     """
     config_ = config.copy()
     mod_type = config_.pop('type')
-    # warmup epochs will overwrite warmup steps
-    if 'warmup_epochs' in config_:
-        warmup_epochs = config_.pop('warmup_epochs')
-        config_['warmup_steps'] = int(num_steps_per_epoch * warmup_epochs)
-    return LR_SCHEDULERS.get_module(mod_type)(optimizer, total_steps, num_steps_per_epoch=num_steps_per_epoch,
-                                              **config_)
+    return LR_SCHEDULERS.get_module(mod_type)(optimizer, **config_)
 
 
 def build_schedule(config):
     """Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`.
 
     :param config: A python dict or a :class:`colossalai.context.Config` object
         containing information used in the construction of the return object
     :type config: dict or :class:`colossalai.context.Config`
     :return: An object of :class:`colossalai.engine.schedule.BaseSchedule`
     :rtype: :class:`colossalai.engine.schedule.BaseSchedule`
     """
     return build_from_registry(config, SCHEDULE)
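Taken together, the last two hunks remove the builder's implicit step bookkeeping: build_lr_scheduler no longer receives total_steps or num_steps_per_epoch and no longer converts warmup_epochs into warmup_steps, so step-based settings now have to travel inside the scheduler config itself. A rough usage sketch under that reading; the scheduler type name and the keyword arguments are illustrative placeholders, not names fixed by this commit:

    import torch
    from colossalai.builder import build_lr_scheduler

    model = torch.nn.Linear(8, 8)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    # Old call shape (before this commit): the builder filled in the step counts
    # and converted warmup_epochs to warmup_steps on the caller's behalf:
    #   build_lr_scheduler({'type': 'PolynomialLR', 'warmup_epochs': 2},
    #                      optimizer, total_steps, num_steps_per_epoch)

    # New call shape: only the optimizer is passed, so any step-based settings
    # are written into the config directly. 'PolynomialLR', 'total_steps' and
    # 'warmup_steps' are hypothetical keys used purely for illustration.
    scheduler_config = {'type': 'PolynomialLR', 'total_steps': 1000, 'warmup_steps': 100}
    lr_scheduler = build_lr_scheduler(scheduler_config, optimizer)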