Support TP-compatible Torch AMP and update trainer API (#27)

* Add gradient accumulation; fix LR scheduler

* Fix FP16 optimizer and adapt Torch AMP to work with tensor parallelism (#18)

* Fix compatibility bugs between Torch AMP and tensor parallelism, along with some minor fixes

* fixed trainer

* Revert "fixed trainer"

This reverts commit 2e0b0b7699.

* Improve consistency between trainer, engine and schedule (#23)

Co-authored-by: 1SAA <c2h214748@gmail.com>

Co-authored-by: 1SAA <c2h214748@gmail.com>
Co-authored-by: ver217 <lhx0217@gmail.com>
Author: Frank Lee
Date: 2021-11-18 19:45:06 +08:00
Committed by: GitHub
Parent: 2b05de4c64
Commit: 3defa32aee
80 changed files with 2194 additions and 1584 deletions
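As background for the mixed-precision and gradient-accumulation features named in the commit message above, the sketch below shows a plain PyTorch training step using only the standard torch.cuda.amp API. It is illustrative only and does not reflect ColossalAI's own engine, trainer, or tensor-parallel integration added in this commit.

    import torch
    from torch.cuda.amp import GradScaler, autocast

    def train_one_epoch(model, dataloader, optimizer, accumulation_steps=4):
        """Vanilla AMP + gradient accumulation loop (illustrative only)."""
        scaler = GradScaler()
        optimizer.zero_grad()
        for i, (data, target) in enumerate(dataloader):
            with autocast():                       # forward pass in mixed precision
                loss = model(data, target) / accumulation_steps
            scaler.scale(loss).backward()          # scaled backward to avoid FP16 underflow
            if (i + 1) % accumulation_steps == 0:
                scaler.step(optimizer)             # unscales gradients, then steps the optimizer
                scaler.update()
                optimizer.zero_grad()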


@@ -1,2 +1,10 @@
-from .builder import *
+from .builder import (build_schedule, build_lr_scheduler, build_model, build_optimizer, build_optimizer_wrapper,
+                      build_layer, build_loss, build_hooks, build_dataset, build_transform, build_data_sampler,
+                      build_gradient_handler)
 from .pipeline import ModelInitializer
+
+__all__ = [
+    'build_schedule', 'build_lr_scheduler', 'build_model', 'build_optimizer', 'build_optimizer_wrapper',
+    'build_layer', 'build_loss', 'build_hooks', 'build_dataset', 'build_transform', 'build_data_sampler',
+    'build_gradient_handler', 'ModelInitializer'
+]
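The exported helpers follow the registry pattern visible in the builder diff below: a config dict whose 'type' key names a registered class, with the remaining keys forwarded as keyword arguments. A minimal usage sketch, assuming a scheduler is registered under the hypothetical name used here:

    import torch
    from colossalai.builder import build_lr_scheduler

    model = torch.nn.Linear(16, 16)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    # 'type' selects the class from the LR scheduler registry;
    # every other key is passed through as a keyword argument.
    lr_config = {'type': 'CosineAnnealingLR', 'T_max': 1000}   # hypothetical registry entry
    lr_scheduler = build_lr_scheduler(lr_config, optimizer)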


@@ -181,18 +181,6 @@ def build_transform(config):
     return build_from_registry(config, TRANSFORMS)
 
 
-def build_pipe_alloc_policy(config):
-    """Returns a pipeline allocation policy object constructed from `config`.
-
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: A pipeline allocation policy object
-    :rtype:
-    """
-    return build_from_registry(config, PIPE_ALLOC_POLICY)
-
-
 def build_data_sampler(config, dataset):
     """Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler`
     constructed from `config`.
@@ -235,7 +223,7 @@ def build_optimizer_wrapper(config, optimizer, model=None):
     return OPTIMIZER_WRAPPERS.get_module(mod_type)(optimizer, **config_)
 
 
-def build_lr_scheduler(config, optimizer, total_steps, num_steps_per_epoch):
+def build_lr_scheduler(config, optimizer):
     """Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
     constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`.
 
@@ -254,9 +242,16 @@ def build_lr_scheduler(config, optimizer, total_steps, num_steps_per_epoch):
     """
     config_ = config.copy()
    mod_type = config_.pop('type')
-    # warmup epochs will overwrite warmup steps
-    if 'warmup_epochs' in config_:
-        warmup_epochs = config_.pop('warmup_epochs')
-        config_['warmup_steps'] = int(num_steps_per_epoch * warmup_epochs)
-    return LR_SCHEDULERS.get_module(mod_type)(optimizer, total_steps, num_steps_per_epoch=num_steps_per_epoch,
-                                              **config_)
+    return LR_SCHEDULERS.get_module(mod_type)(optimizer, **config_)
+
+
+def build_schedule(config):
+    """Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :return: An object of :class:`colossalai.engine.schedule.BaseSchedule`
+    :rtype: :class:`colossalai.engine.schedule.BaseSchedule`
+    """
+    return build_from_registry(config, SCHEDULE)
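For call sites, the change in this hunk means the builder no longer receives total_steps or num_steps_per_epoch and no longer converts warmup_epochs into warmup_steps; any step-based values now have to be computed by the caller and placed directly in the scheduler config. A hedged sketch of the migration (the scheduler name and config keys are placeholders; whether a given scheduler accepts them depends on its own constructor):

    steps_per_epoch = len(train_dataloader)       # assumed available at the call site
    total_steps = steps_per_epoch * num_epochs

    # Old API (removed in this commit): step counts were passed to the builder,
    # which also translated warmup_epochs into warmup_steps.
    # lr_scheduler = build_lr_scheduler(
    #     {'type': 'SomeWarmupLR', 'warmup_epochs': 2},
    #     optimizer, total_steps, steps_per_epoch)

    # New API: everything the scheduler needs goes into the config dict.
    lr_scheduler = build_lr_scheduler(
        {'type': 'SomeWarmupLR',
         'total_steps': total_steps,
         'warmup_steps': 2 * steps_per_epoch},
        optimizer)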