mirror of https://github.com/hpcaitech/ColossalAI.git
Optimize pipeline schedule (#94)
* add pipeline shared module wrapper and update load batch
* added model parallel process group for amp and clip grad (#86)
* added model parallel process group for amp and clip grad
* update amp and clip with model parallel process group
* remove pipeline_prev/next group (#88)
* micro batch offload
* optimize pipeline gpu memory usage
* pipeline can receive tensor shape (#93)
* optimize pipeline gpu memory usage
* fix grad accumulation step counter
* rename classes and functions

Co-authored-by: Frank Lee <somerlee.9@gmail.com>
@@ -0,0 +1,41 @@
#!/usr/bin/env python

import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from colossalai.core import global_context as gpc
from colossalai.registry import GRADIENT_HANDLER
from ._base_gradient_handler import BaseGradientHandler
from collections import defaultdict


@GRADIENT_HANDLER.register_module
class PipelineSharedModuleGradientHandler(BaseGradientHandler):
    """A helper class to handle all-reduce operations in sub parallel groups.
    An all-reduce collective communication is performed in
    :func:`handle_gradient` among all sub pipeline parallel groups.
    For better performance, it bucketizes the gradients of all parameters
    of the same type to improve the efficiency of communication.
    """

    def handle_gradient(self):
        """A method running an all-reduce operation in sub pipeline parallel groups.
        """
        if gpc.pipeline_parallel_size > 1:
            # bucketize and all-reduce
            buckets = defaultdict(lambda: defaultdict(list))
            # Pack the buckets: group parameters by their shared-module process
            # group, then by tensor type, so each bucket can be reduced in one call.
            for param in self._model.parameters():
                group = getattr(param, 'pipeline_shared_module_pg', None)
                if param.requires_grad and param.grad is not None and group is not None:
                    tp = param.data.type()
                    buckets[group][tp].append(param)

            # For each bucket, all-reduce and copy all-reduced grads.
            for group, group_buckets in buckets.items():
                for tp, bucket in group_buckets.items():
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
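For reference, the sketch below reproduces the bucketized all-reduce pattern from handle_gradient outside of ColossalAI, run as a single process (gloo backend, world size 1) so it can be executed locally. The toy Linear module, the environment setup, and the demo function are illustrative stand-ins and not ColossalAI APIs; in the handler above, the bucket key and the communication group come from each parameter's pipeline_shared_module_pg attribute rather than dist.group.WORLD.

# Minimal, standalone sketch of the flatten -> all_reduce -> unflatten -> copy_
# sequence used by the gradient handler above. Illustrative only: the model,
# process-group setup, and demo() helper are assumptions for this example.
import os
from collections import defaultdict

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors


def demo():
    # Single-process group purely so the example is runnable without a cluster.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group(backend='gloo', rank=0, world_size=1)

    # A toy module standing in for a pipeline-shared module; produce some grads.
    model = torch.nn.Linear(4, 4)
    model(torch.randn(2, 4)).sum().backward()

    # Bucket gradients by tensor type, mirroring the handler's inner dict.
    buckets = defaultdict(list)
    for param in model.parameters():
        if param.requires_grad and param.grad is not None:
            buckets[param.data.type()].append(param)

    # Flatten each bucket, all-reduce it in one collective call,
    # then scatter the summed result back into the original grad buffers.
    for tp, bucket in buckets.items():
        grads = [param.grad.data for param in bucket]
        coalesced = _flatten_dense_tensors(grads)
        dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=dist.group.WORLD)
        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
            buf.copy_(synced)

    dist.destroy_process_group()


if __name__ == '__main__':
    demo()

In a training loop, a handler like this would typically be invoked after the backward pass and before the optimizer step, so that parameters shared across pipeline stages (for example, tied embeddings on the first and last stages) end up with identical, summed gradients.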