Support TP-compatible Torch AMP and Update trainer API (#27)

* Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b7699. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com>
2025-09-10 05:20:33 +00:00 · 2021-11-18 19:45:06 +08:00
parent 2b05de4c64
commit 3defa32aee
80 changed files with 2194 additions and 1584 deletions
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -27,7 +27,7 @@ def sync_model_param_in_dp(model):
    :param model: A pyTorch nn.model on whose parameters you check the consistency
    '''
    
-    if gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 2:
+    if gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1:
        for param in model.parameters():
            ranks = gpc.get_ranks_in_group(ParallelMode.DATA)
            dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.DATA))