Feature/zero (#279)

* add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yet another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unit test * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unit tests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com>
2025-09-24 19:17:30 +00:00 · 2022-03-01 18:17:01 +08:00
parent 08eccfe681
commit 5a560a060a
40 changed files with 3912 additions and 6493 deletions
--- a/colossalai/engine/gradient_handler/_zero_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_zero_gradient_handler.py
@@ -13,4 +13,4 @@ class ZeROGradientHandler(BaseGradientHandler):
    def handle_gradient(self):
        """A method running an all-reduce operation in a data parallel group.
        """
-        self._optimizer.allreduce_gradients()
+        self._optimizer.sync_grad()
--- a/colossalai/engine/ophooks/__init__.py
+++ b/colossalai/engine/ophooks/__init__.py
@@ -1,9 +1,10 @@
 from ._base_ophook import BaseOpHook
 from ._memtracer_ophook import MemTracerOpHook
+from ._shard_param_ophook import ShardParamHook
 import torch
 from typing import List

-__all__ = ["BaseOpHook", "MemTracerOpHook", "register_ophooks_recursively"]
+__all__ = ["BaseOpHook", "MemTracerOpHook", "register_ophooks_recursively", "ShardParamHook"]


 # apply torch.autograd.Function that calls a backward_function to tensors in output
--- a/colossalai/engine/ophooks/_memtracer_ophook.py
+++ b/colossalai/engine/ophooks/_memtracer_ophook.py
@@ -4,7 +4,6 @@ from concurrent.futures import ThreadPoolExecutor
 from colossalai.registry import OPHOOKS
 from colossalai.logging import get_dist_logger
 from time import sleep, time
-import psutil
 import pickle


--- a/colossalai/engine/ophooks/_shard_param_ophook.py
+++ b/colossalai/engine/ophooks/_shard_param_ophook.py
@@ -0,0 +1,41 @@
+import torch
+from . import BaseOpHook
+from colossalai.registry import OPHOOKS
+
+@OPHOOKS.register_module
+class ShardParamHook(BaseOpHook):
+    """
+    A hook to process sharded params before and after each FWD and BWD operator executes: `ca_attr.gather()` is called on every module parameter pre-op and `ca_attr.shard()` post-op.
+    """
+    def __init__(self):
+        super().__init__()
+
+    def niter(self):
+        return self._niter  # NOTE(review): `_niter` is never assigned in this class; presumably set by BaseOpHook — confirm
+
+    def pre_fwd_exec(self, module: torch.nn.Module, *args):
+        for param in module.parameters():  # parameters() also yields params of nested submodules (recurse defaults to True)
+            assert hasattr(param, 'ca_attr')  # each param must already carry a `ca_attr` handle; note asserts vanish under `python -O`
+            param.ca_attr.gather()  # presumably makes the full (unsharded) param available for forward — confirm
+
+    def post_fwd_exec(self, module: torch.nn.Module, *args):
+        for param in module.parameters():
+            assert hasattr(param, 'ca_attr')  # same precondition as pre_fwd_exec
+            param.ca_attr.shard()  # presumably returns the param to its sharded form after forward — confirm
+
+    def pre_bwd_exec(self, module: torch.nn.Module, input, output):
+        for param in module.parameters():  # `input` and `output` are accepted but not used here
+            assert hasattr(param, 'ca_attr')
+            param.ca_attr.gather()  # mirrors pre_fwd_exec for the backward pass
+
+    def post_bwd_exec(self, module: torch.nn.Module, input):
+        for param in module.parameters():  # `input` is accepted but not used here
+            assert hasattr(param, 'ca_attr')
+            param.ca_attr.shard()  # mirrors post_fwd_exec for the backward pass
+
+    def pre_iter(self):
+        pass  # no per-iteration setup is performed by this hook
+
+    def post_iter(self):
+        pass  # no per-iteration teardown is performed by this hook
+
--- a/colossalai/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_pipeline_schedule.py
@@ -12,8 +12,7 @@ from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.utils import switch_virtual_pipeline_parallel_rank
 from colossalai.utils.cuda import get_current_device
-from colossalai.zero import (ZeroRedundancyOptimizer_Level_2,
-                             ZeroRedundancyOptimizer_Level_3)
+from colossalai.zero import ShardedOptimizer, ShardedModel

 from ._base_schedule import BaseSchedule

@@ -91,9 +90,10 @@ class PipelineSchedule(BaseSchedule):
        return self._move_to_device(data), self._move_to_device(label)

    def pre_processing(self, engine):
-        if isinstance(engine.optimizer, (ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3)):
+        # TODO: remove this after testing new zero with pipeline parallelism
+        if isinstance(engine.optimizer, ShardedOptimizer) or isinstance(engine.model, ShardedModel):
            raise TypeError(
-                "Pipeline schedule is currently not compatible with ZeRO Level 2 and Level 3"
+                "Pipeline schedule is currently not compatible with ZeRO"
            )
        model = engine.model
        if isinstance(model, NaiveAMPModel):