diff --git a/colossalai/engine/gradient_handler/_zero_gradient_handler.py b/colossalai/engine/gradient_handler/_zero_gradient_handler.py index b303bcb39..7be3f8fb3 100644 --- a/colossalai/engine/gradient_handler/_zero_gradient_handler.py +++ b/colossalai/engine/gradient_handler/_zero_gradient_handler.py @@ -13,4 +13,4 @@ class ZeROGradientHandler(BaseGradientHandler): def handle_gradient(self): """A method running an all-reduce operation in a data parallel group. """ - self._optimizer.allreduce_gradients() + self._optimizer.sync_grad() diff --git a/colossalai/engine/ophooks/__init__.py b/colossalai/engine/ophooks/__init__.py index abfe0a581..b1130dc5d 100644 --- a/colossalai/engine/ophooks/__init__.py +++ b/colossalai/engine/ophooks/__init__.py @@ -1,9 +1,10 @@ from ._base_ophook import BaseOpHook from ._memtracer_ophook import MemTracerOpHook +from ._shard_param_ophook import ShardParamHook import torch from typing import List -all = ["BaseOpHook", "MemTracerOpHook", "register_ophooks_recursively"] +all = ["BaseOpHook", "MemTracerOpHook", "register_ophooks_recursively", "ShardParamHook"] # apply torch.autograd.Function that calls a backward_function to tensors in output diff --git a/colossalai/engine/ophooks/_memtracer_ophook.py b/colossalai/engine/ophooks/_memtracer_ophook.py index 3f5671230..a5bfde056 100644 --- a/colossalai/engine/ophooks/_memtracer_ophook.py +++ b/colossalai/engine/ophooks/_memtracer_ophook.py @@ -4,7 +4,6 @@ from concurrent.futures import ThreadPoolExecutor from colossalai.registry import OPHOOKS from colossalai.logging import get_dist_logger from time import sleep, time -import psutil import pickle diff --git a/colossalai/engine/ophooks/_shard_param_ophook.py b/colossalai/engine/ophooks/_shard_param_ophook.py new file mode 100644 index 000000000..5bee3f9a4 --- /dev/null +++ b/colossalai/engine/ophooks/_shard_param_ophook.py @@ -0,0 +1,41 @@ +import torch +from . import BaseOpHook +from colossalai.registry import OPHOOKS + +@OPHOOKS.register_module +class ShardParamHook(BaseOpHook): + """ + A hook to process sharded params before and after the FWD and BWD operators execute.
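+ Every parameter of the wrapped module is expected to carry a ``ca_attr`` handle (see ShardParam below): the hook gathers the full payload before an operator's FWD/BWD runs and re-shards it afterwards.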
+ """ + def __init__(self): + super().__init__() + + def niter(self): + return self._niter + + def pre_fwd_exec(self, module: torch.nn.Module, *args): + for param in module.parameters(): + assert hasattr(param, 'ca_attr') + param.ca_attr.gather() + + def post_fwd_exec(self, module: torch.nn.Module, *args): + for param in module.parameters(): + assert hasattr(param, 'ca_attr') + param.ca_attr.shard() + + def pre_bwd_exec(self, module: torch.nn.Module, input, output): + for param in module.parameters(): + assert hasattr(param, 'ca_attr') + param.ca_attr.gather() + + def post_bwd_exec(self, module: torch.nn.Module, input): + for param in module.parameters(): + assert hasattr(param, 'ca_attr') + param.ca_attr.shard() + + def pre_iter(self): + pass + + def post_iter(self): + pass + diff --git a/colossalai/engine/schedule/_pipeline_schedule.py b/colossalai/engine/schedule/_pipeline_schedule.py index 5bab0d524..24256ccd0 100644 --- a/colossalai/engine/schedule/_pipeline_schedule.py +++ b/colossalai/engine/schedule/_pipeline_schedule.py @@ -12,8 +12,7 @@ from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger from colossalai.utils import switch_virtual_pipeline_parallel_rank from colossalai.utils.cuda import get_current_device -from colossalai.zero import (ZeroRedundancyOptimizer_Level_2, - ZeroRedundancyOptimizer_Level_3) +from colossalai.zero import ShardedOptimizer, ShardedModel from ._base_schedule import BaseSchedule @@ -91,9 +90,10 @@ class PipelineSchedule(BaseSchedule): return self._move_to_device(data), self._move_to_device(label) def pre_processing(self, engine): - if isinstance(engine.optimizer, (ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3)): + # TODO: remove this after testing new zero with pipeline parallelism + if isinstance(engine.optimizer, ShardedOptimizer) or isinstance(engine.model, ShardedModel): raise TypeError( - "Pipeline schedule is currently not compatible with ZeRO Level 2 and Level 3" + "Pipeline schedule is currently not compatible with ZeRO" ) model = engine.model if isinstance(model, NaiveAMPModel): diff --git a/colossalai/initialize.py b/colossalai/initialize.py index 9329dc052..f68947f4c 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -2,30 +2,31 @@ # -*- encoding: utf-8 -*- import argparse -import pprint import os -from colossalai.nn.optimizer.colossalai_optimizer import ColossalaiOptimizer +import pprint +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Tuple, Union + import torch import torch.nn as nn - -from pathlib import Path -from typing import Iterable, Union, Optional, Tuple, List, Dict - -from colossalai.amp import convert_to_amp, AMP_TYPE -from colossalai.context import Config, ParallelMode, ConfigException -from colossalai.core import global_context as gpc -from colossalai.engine import Engine -from colossalai.logging import get_dist_logger -from colossalai.utils import (accumulate_gradient, get_current_device, - sync_model_param, is_using_ddp, is_using_pp, is_using_sequence) -from colossalai.zero import convert_to_zero, ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3 -from colossalai.builder.builder import build_gradient_handler -from torch.optim.optimizer import Optimizer -from torch.optim.lr_scheduler import _LRScheduler -from torch.utils.data import DataLoader from torch.nn.modules.loss import _Loss from torch.nn.parallel import DistributedDataParallel as DDP +from torch.optim.lr_scheduler import _LRScheduler +from 
torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader + +from colossalai.amp import AMP_TYPE, convert_to_amp +from colossalai.builder.builder import build_gradient_handler +from colossalai.context import Config, ConfigException, ParallelMode +from colossalai.core import global_context as gpc +from colossalai.engine import Engine from colossalai.global_variables import moe_env +from colossalai.logging import get_dist_logger +from colossalai.nn.optimizer.colossalai_optimizer import ColossalaiOptimizer +from colossalai.utils import (accumulate_gradient, get_current_device, + is_using_ddp, is_using_pp, is_using_sequence, + sync_model_param) +from colossalai.zero import convert_to_zero, ShardedOptimizer def get_default_parser(): @@ -332,8 +333,7 @@ def initialize(model: Union[nn.Module, List[nn.Module]], # 1. if optimizer is ZERO, then use zero grad handler # 2. if dp size is larger than 1 and pipeline is not used, use pytorch ddp # 3. if using pipeline and dp size larger than 1, use data parallel grad handler - if isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2, - ZeroRedundancyOptimizer_Level_3)): + if isinstance(optimizer, ShardedOptimizer): gradient_handler_cfg = [dict(type='ZeROGradientHandler')] if verbose: logger.info( @@ -348,7 +348,8 @@ def initialize(model: Union[nn.Module, List[nn.Module]], "added even though not specified in the configuration", ranks=[0]) elif is_using_sequence(): - model = DDP(model, process_group=gpc.get_group(ParallelMode.SEQUENCE_DP), device_ids=[torch.cuda.current_device()]) + model = DDP(model, process_group=gpc.get_group(ParallelMode.SEQUENCE_DP), + device_ids=[torch.cuda.current_device()]) if verbose: logger.info( 'Model is using torch.nn.parallel.DistributedDataParallel for Sequence Parallelism', ranks=[0]) @@ -393,7 +394,7 @@ def initialize(model: Union[nn.Module, List[nn.Module]], gradient_handlers = [build_gradient_handler(cfg, model, optimizer) for cfg in gradient_handler_cfg] # check if optimizer is ColossalaiOptimizer - if not isinstance(optimizer, (ColossalaiOptimizer, ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3)): + if not isinstance(optimizer, (ColossalaiOptimizer, ShardedOptimizer)): optimizer = ColossalaiOptimizer(optim=optimizer) # gradient accumulation diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py index c769022a5..645b58131 100644 --- a/colossalai/utils/__init__.py +++ b/colossalai/utils/__init__.py @@ -1,9 +1,13 @@ from .activation_checkpoint import checkpoint -from .common import (clip_grad_norm_fp32, conditional_context, copy_tensor_parallel_attributes, count_zeros_fp32, - free_port, is_dp_rank_0, is_model_parallel_parameter, is_no_pp_or_last_stage, is_tp_rank_0, - is_using_ddp, is_using_pp, is_using_sequence, model_branch_context, multi_tensor_applier, - param_is_not_tensor_parallel_duplicate, print_rank_0, switch_virtual_pipeline_parallel_rank, - sync_model_param) + +from .common import (clip_grad_norm_fp32, conditional_context, + copy_tensor_parallel_attributes, count_zeros_fp32, + free_port, is_dp_rank_0, is_model_parallel_parameter, + is_moe_parallel_parameter, is_no_pp_or_last_stage, + is_tp_rank_0, is_using_ddp, is_using_pp, + is_using_sequence, multi_tensor_applier, + param_is_not_tensor_parallel_duplicate, print_rank_0, + switch_virtual_pipeline_parallel_rank, sync_model_param) from .cuda import empty_cache, get_current_device, set_to_cuda, synchronize from .data_sampler import DataParallelSampler, get_dataloader from .gradient_accumulation import 
accumulate_gradient @@ -12,9 +16,9 @@ from .timer import MultiTimer, Timer __all__ = [ 'checkpoint', 'free_port', 'print_rank_0', 'sync_model_param', 'is_dp_rank_0', 'is_tp_rank_0', - 'is_no_pp_or_last_stage', 'is_using_ddp', 'is_using_pp', 'is_using_sequence', 'model_branch_context', - 'conditional_context', 'is_model_parallel_parameter', 'clip_grad_norm_fp32', 'count_zeros_fp32', - 'copy_tensor_parallel_attributes', 'param_is_not_tensor_parallel_duplicate', 'get_current_device', 'synchronize', - 'empty_cache', 'set_to_cuda', 'report_memory_usage', 'Timer', 'MultiTimer', 'multi_tensor_applier', - 'accumulate_gradient', 'DataParallelSampler', 'get_dataloader', 'switch_virtual_pipeline_parallel_rank' + 'is_no_pp_or_last_stage', 'is_using_ddp', 'is_using_pp', 'is_using_sequence', 'conditional_context', + 'is_model_parallel_parameter', 'clip_grad_norm_fp32', 'count_zeros_fp32', 'copy_tensor_parallel_attributes', + 'param_is_not_tensor_parallel_duplicate', 'get_current_device', 'synchronize', 'empty_cache', 'set_to_cuda', + 'report_memory_usage', 'Timer', 'MultiTimer', 'multi_tensor_applier', 'accumulate_gradient', 'DataParallelSampler', + 'get_dataloader', 'switch_virtual_pipeline_parallel_rank', 'is_moe_parallel_parameter' ] diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index 942801018..6427e4c8a 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -2,9 +2,12 @@ # -*- encoding: utf-8 -*- import random import socket +from typing import List, Union import torch from torch._six import inf +from torch.nn.parameter import Parameter + try: import colossal_C @@ -14,7 +17,8 @@ except: from contextlib import contextmanager import torch.distributed as dist -from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_ATTRIBUTES +from colossalai.constants import (IS_TENSOR_PARALLEL, NUM_PARTITIONS, + TENSOR_PARALLEL_ATTRIBUTES) from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.global_variables import moe_env @@ -134,6 +138,10 @@ def _calc_lp(grads, norm_type): norm += grad_norm**norm_type return norm +def _move_norm_to_cuda(norm: Union[float, torch.Tensor]) -> Union[float, torch.Tensor]: + if torch.is_tensor(norm) and norm.device.type != 'cuda': + norm = norm.to(torch.cuda.current_device()) + return norm # ======== Gradient Clipping ========= @@ -163,17 +171,27 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # - grad should not be none # - parameter should not be shared # - should not be a replica due to tensor model parallelism - params = [] + params: List[Parameter] = [] + has_zero_shared_param: bool = False for param in parameters: if param.grad is not None: # Make sure the grads are in fp32 - assert param.grad.type() == 'torch.cuda.FloatTensor', \ - f'expected gradient to be dtype torch.cuda.FloatTensor, but got {param.grad.type()}' + assert param.grad.dtype == torch.float, \ + f'expected gradient to be dtype torch.float, but got {param.grad.type()}' + if hasattr(param, 'zero_is_sharded'): + has_zero_shared_param = True params.append(param) + + if len(params) == 0: + return 0.0 # Norm parameters. max_norm = float(max_norm) norm_type = float(norm_type) + # Parameters can be on CPU or CUDA + # If parameters are on CPU, disable CUDA kernerls + enable_cuda_kernels = params[0].grad.device.type == 'cuda' + # Calculate norm. 
if norm_type == inf: total_norm = max(p.grad.data.abs().max() for p in params) @@ -184,28 +202,49 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.MODEL), async_op=False) + if has_zero_shared_param: + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.MAX, + group=gpc.get_group(ParallelMode.DATA), + async_op=False) total_norm = total_norm_cuda[0].item() else: tensor_parallel_grads = [] no_tensor_parallel_grads = [] moe_parallel_grads = [] # used to collect moe tensor parallel gradients + zero_sharded_grads = [] for p in params: if is_model_parallel_parameter(p): reductor = (gpc.get_world_size(ParallelMode.TENSOR) / getattr(p, NUM_PARTITIONS))**(1 / norm_type) tensor_parallel_grads.append(p.grad.data / reductor) elif is_moe_parallel_parameter(p): moe_parallel_grads.append(p.grad.data) + elif hasattr(p, 'zero_is_sharded'): + zero_sharded_grads.append(p.grad.data) else: no_tensor_parallel_grads.append(p.grad.data) - if norm_type == 2.0: - tensor_parallel_norm = _calc_l2_norm(tensor_parallel_grads)**norm_type - no_tensor_parallel_norm = _calc_l2_norm(no_tensor_parallel_grads)**norm_type - moe_parallel_norm = _calc_l2_norm(moe_parallel_grads)**norm_type + if norm_type == 2.0 and enable_cuda_kernels: + tensor_parallel_norm = _calc_l2_norm( + tensor_parallel_grads) ** norm_type + no_tensor_parallel_norm = _calc_l2_norm( + no_tensor_parallel_grads) ** norm_type + moe_parallel_norm = _calc_l2_norm( + moe_parallel_grads) ** norm_type + zero_sharded_norm = _calc_l2_norm(zero_sharded_grads) ** norm_type else: tensor_parallel_norm = _calc_lp(tensor_parallel_grads, norm_type) no_tensor_parallel_norm = _calc_lp(no_tensor_parallel_grads, norm_type) moe_parallel_norm = _calc_lp(moe_parallel_grads, norm_type) + zero_sharded_norm = _calc_lp(zero_sharded_grads, norm_type) + + # If grads are on CPU, the norms is also on CPU. Cast them to CUDA tensors + if not enable_cuda_kernels: + tensor_parallel_norm = _move_norm_to_cuda(tensor_parallel_norm) + no_tensor_parallel_norm = _move_norm_to_cuda(no_tensor_parallel_norm) + moe_parallel_norm = _move_norm_to_cuda(moe_parallel_norm) + zero_sharded_norm = _move_norm_to_cuda(zero_sharded_norm) + # Sum across all model-parallel GPUs. if gpc.is_initialized(ParallelMode.TENSOR) and len(tensor_parallel_grads) > 0: dist.all_reduce(tensor_parallel_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR)) @@ -213,20 +252,32 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): if len(moe_parallel_grads) > 0: dist.all_reduce(moe_parallel_norm, group=gpc.get_group(ParallelMode.MOE_MODEL)) no_tensor_parallel_norm += moe_parallel_norm + # Sum across all zero sharded GPUs + if len(zero_sharded_grads) > 0: + dist.all_reduce(zero_sharded_norm, group=gpc.get_group(ParallelMode.DATA)) + no_tensor_parallel_norm += zero_sharded_norm total_norm = tensor_parallel_norm + no_tensor_parallel_norm if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.PIPELINE)) - total_norm = total_norm**(1.0 / norm_type) - if type(total_norm) == 'torch.cuda.FloatTensor': + dist.all_reduce(total_norm, + op=dist.ReduceOp.SUM, + group=gpc.get_group(ParallelMode.PIPELINE)) + total_norm = total_norm ** (1.0 / norm_type) + if torch.is_tensor(total_norm): total_norm = total_norm.item() # Scale. 
clip_coeff = max_norm / (total_norm + 1.0e-6) if clip_coeff < 1.0: - grads = [p.grad.detach() for p in params] - dummy_overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(colossal_C.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff) - + if enable_cuda_kernels: + grads = [p.grad.detach() for p in params] + dummy_overflow_buf = torch.cuda.IntTensor([0]) + multi_tensor_applier(colossal_C.multi_tensor_scale, + dummy_overflow_buf, + [grads, grads], + clip_coeff) + else: + for p in params: + p.grad.detach().mul_(clip_coeff) return total_norm diff --git a/colossalai/zero/__init__.py b/colossalai/zero/__init__.py index 02c210c0b..708650cd8 100644 --- a/colossalai/zero/__init__.py +++ b/colossalai/zero/__init__.py @@ -1,13 +1,12 @@ import torch import torch.nn as nn -from torch.optim import Optimizer from colossalai.amp.naive_amp import NaiveAMPModel -from colossalai.utils import is_no_pp_or_last_stage -from colossalai.core import global_context as gpc from colossalai.context.parallel_mode import ParallelMode - -from .zero_redundancy_optimizer_level_2 import ZeroRedundancyOptimizer_Level_2 -from .zero_redundancy_optimizer_level_3 import ZeroRedundancyOptimizer_Level_3 +from colossalai.core import global_context as gpc +from torch.optim import Optimizer +from .sharded_model import ShardedModel +from .sharded_optim import ShardedOptimizer def convert_to_zero(model: nn.Module, @@ -29,82 +28,14 @@ def convert_to_zero(model: nn.Module, :return: (model, optimizer) :rtype: Tuple """ - import deepspeed - assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided' - model = NaiveAMPModel(model, output_to_fp32=False) - - if level == 2: - optimizer = ZeroRedundancyOptimizer_Level_2(init_optimizer=optimizer, **zero_config) + assert 1 <= level <= 3, 'Only ZERO Optimizer Level 1-3 are provided' + if level in [1, 2]: + if level == 2: + assert zero_config['partition_grad'], 'ZeRO Optimizer requires partition_grad to be True' + model = NaiveAMPModel(model, output_to_fp32=True) + optimizer = ShardedOptimizer(model.parameters(), *zero_config) else: - optimizer = ZeroRedundancyOptimizer_Level_3(init_optimizer=optimizer, module=model, **zero_config) + model = ShardedModel(module=model, **zero_config) return model, optimizer - -def zero3_model_context(dtype=torch.half): - """A context to enable massive model construction for training with - ZeRO-3. Models are automatically partitioned (or, sharded) across the - system and converted to half precision. Note that the config of ZeRO-3 will be loaded automatically from `gpc.config`. - - Args: - dtype (``dtype``, optional): Can be used to change the data type of the parameters. - Supported options are ``torch.half`` and ``torch.float``. Defaults to ``torch.half`` - - This context accelerates model initialization and enables models that - are too large to allocate in their entirety in CPU memory. It has the - following effects: - - #. allocates tensors to either GPU or CPU memory or NVMe - #. converts floating point tensors to half precision - #. immediately partitions tensors among the group of data-parallel devices - #. (*optional*) replaces ``torch.nn.functional.linear`` with a more - memory-efficient implementation - - These modifications allow for models that exceed the size of local CPU/GPU - memory/NVMe, but fit within the total NVMe capacity (*i.e.*, aggregate CPU - or GPU memory or NVMe) across all nodes.
Consider initializing a model with one - trillion parameters, whose weights occupy two terabytes (TB) in half - precision. The initial CPU allocation in full precision requires 4TB of - memory *per process*, and so a system with 8 GPUs per node would need 32TB of - CPU memory due to data-parallel redundancies. Instead, by immediately - partitioning tensors we remove the redundancies. The result is that - regardless of the number of GPUs, we still only require the original 4TB. This - allows for a linear increase in model size with the aggregate system memory. - For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion - parameter model with 4 nodes and 32 GPUs. - - Important: If the fp16 weights of the model can't fit onto a single GPU memory - this feature must be used. - - Examples - -------- - - #. Allocate a model and partition it among all processes: - - .. code-block:: python - - with zero3_model_context(): - model = MyLargeModel() - - """ - assert dtype == torch.half or dtype == torch.float, f'Invalid dtype, except torch.half or torch.float, got {dtype}' - import deepspeed - ds_config = { - "train_micro_batch_size_per_gpu": 1, - "gradient_accumulation_steps": 1, - "zero_optimization": { - "offload_param": getattr(gpc.config.zero, 'offload_param_config', None), - "offload_optimizer": getattr(gpc.config.zero, 'offload_optimizer_config'), - }, - "aio": getattr(gpc.config.zero, 'aio_config', None) - } - remote_device = getattr(ds_config['zero_optimization']['offload_param'], 'device', None) - pin_memory = getattr(ds_config['zero_optimization']['offload_param'], 'pin_memory', False) - return deepspeed.zero.Init(data_parallel_group=gpc.get_group(ParallelMode.DATA), - remote_device=remote_device, - config_dict_or_path=ds_config, - pin_memory=pin_memory, - dtype=dtype) - - -__all__ = ['convert_to_zero', 'ZeroRedundancyOptimizer_Level_2', - 'ZeroRedundancyOptimizer_Level_3', 'zero3_model_context'] +__all__ = ['convert_to_zero', 'ShardedModel', 'ShardedOptimizer'] diff --git a/colossalai/zero/loss_scaler.py b/colossalai/zero/loss_scaler.py deleted file mode 100644 index ebaaf2549..000000000 --- a/colossalai/zero/loss_scaler.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright 2019 The Microsoft DeepSpeed Team -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Taken and modified for DeepSpeed from: -# https://github.com/NVIDIA/Megatron-LM/blob/master/fp16/loss_scaler.py -# Commit: 93ab4bea59dc5cbf97c079d313741866af4deac9 - - -INITIAL_LOSS_SCALE = 'init_scale' -SCALE_WINDOW = 'scale_window' -DELAYED_SHIFT = 'delayed_shift' -MIN_LOSS_SCALE = 'min_scale' - - -# item() is a recent addition, so this helps with backward compatibility. 
-def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - return t[0] - - -class LossScalerBase: - """LossScalarBase - Base class for a loss scaler - """ - - def __init__(self, cur_scale): - self.cur_scale = cur_scale - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def update_scale(self, overflow): - pass - - def backward(self, loss, retain_graph=False): - scaled_loss = loss * self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - - -class LossScaler(LossScalerBase): - """ - Class that manages a static loss scale. This class is intended to interact with - :class:`FP16_Optimizer`, and should not be directly manipulated by the user. - Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to - :class:`FP16_Optimizer`'s constructor. - Args: - scale (float, optional, default=1.0): The loss scale. - """ - - def __init__(self, scale=1): - super(LossScaler, self).__init__(scale) - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - return False - - -class DynamicLossScaler(LossScalerBase): - """ - Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` - indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of - :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` - operates, because the default options can be changed using the - the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. - Loss scaling is designed to combat the problem of underflowing gradients encountered at long - times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss - scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are - encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has - occurred. - :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, - and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. - If a certain number of iterations occur without overflowing gradients detected, - :class:`DynamicLossScaler` increases the loss scale once more. - In this way :class:`DynamicLossScaler` attempts to "ride the edge" of - always using the highest loss scale possible without incurring overflow. - Args: - init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` - scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is - encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive - iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. - scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before - increasing the loss scale. 
- """ - - def __init__(self, - init_scale=2 ** 32, - scale_factor=2., - scale_window=1000, - min_scale=1, - delayed_shift=1, - consecutive_hysteresis=False): - super(DynamicLossScaler, self).__init__(init_scale) - self.cur_iter = 0 - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - self.min_scale = min_scale - self.delayed_shift = delayed_shift - self.cur_hysteresis = delayed_shift - self.consecutive_hysteresis = consecutive_hysteresis - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params): - for p in params: - if p.grad is not None and self._has_inf_or_nan(p.grad.data): - return True - - return False - - # `x` is a torch.Tensor - @staticmethod - def _has_inf_or_nan(x): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. - if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum in [float('inf'), -float('inf')] or cpu_sum != cpu_sum: - return True - return False - - # `overflow` is boolean indicating whether the gradient overflowed - def update_scale(self, overflow): - if overflow: - # self.cur_scale /= self.scale_factor - if self.delayed_shift == 1 or self.cur_hysteresis == 1: - self.cur_scale = max( - self.cur_scale / self.scale_factor, self.min_scale) - else: - self.cur_hysteresis -= 1 - self.last_overflow_iter = self.cur_iter - else: - if self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: - if not self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - self.cur_scale *= self.scale_factor - self.cur_iter += 1 diff --git a/colossalai/zero/shard_param/__init__.py b/colossalai/zero/shard_param/__init__.py new file mode 100644 index 000000000..bd7f5e46b --- /dev/null +++ b/colossalai/zero/shard_param/__init__.py @@ -0,0 +1,3 @@ +from .shard_param import ShardParam + +__all__ = ['ShardParam'] \ No newline at end of file diff --git a/colossalai/zero/shard_param/shard_param.py b/colossalai/zero/shard_param/shard_param.py new file mode 100644 index 000000000..aafe78384 --- /dev/null +++ b/colossalai/zero/shard_param/shard_param.py @@ -0,0 +1,63 @@ +from enum import Enum +from optparse import Option +import torch +from colossalai.zero.sharded_model._zero3_utils import get_shard +from colossalai.context.parallel_mode import ParallelMode +from colossalai.core import global_context as gpc +import torch.distributed as dist + +class TensorType(Enum): + GRAD = 1 + DATA = 2 + +class ShardParam(object): + r""" + A wrapper to torch.nn.Parameter. Shard a param + on different processes. 
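+ ``shard()`` keeps only the local chunk of the flattened payload (computed with ``get_shard``), while ``gather()`` all-gathers the chunks from the process group and restores the original shape on the local rank.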
+ """ + def __init__(self, + param: torch.nn.Parameter, + tensor_type: TensorType = TensorType.DATA, + process_group = None, + ) -> None: + self.process_group = process_group or gpc.get_group(ParallelMode.DATA) + self.world_size = dist.get_world_size(self.process_group) + self.local_rank = dist.get_rank(self.process_group) + self._param_payload = param.data if tensor_type == TensorType.DATA else param.grad + self._payload_numel = None + self._origin_shape = param.shape + self._origin_numel = param.numel() + self.is_shared = False + + def payload(self, target_device : torch.device): + return self._param_payload.to(target_device) + + def shard(self): + r""" + Distribute the payload of the param to all processes. + """ + if self.is_shared: + return + self._param_payload, _ = get_shard(self._param_payload, self.local_rank, self.world_size) + self.is_shared = True + + def gather(self): + r""" + Collect the payload of the param from all processes and restore the full tensor on the local rank. + """ + if not self.is_shared: + return + + buffer_list = [] + payload_numel = self._param_payload.numel() + for i in range(self.world_size): + if i == self.local_rank: + buffer_list.append(self._param_payload.cuda()) + else: + buffer_list.append(torch.zeros(payload_numel, dtype=self._param_payload.dtype).cuda()) + + torch.distributed.all_gather(buffer_list, buffer_list[self.local_rank], group=self.process_group, async_op=False) + self._param_payload = torch.narrow(torch.cat(buffer_list), 0, 0, self._origin_numel).view(self._origin_shape) + self.is_shared = False + diff --git a/colossalai/zero/sharded_model/__init__.py b/colossalai/zero/sharded_model/__init__.py new file mode 100644 index 000000000..dffd7f21a --- /dev/null +++ b/colossalai/zero/sharded_model/__init__.py @@ -0,0 +1,4 @@ +from .sharded_model import ShardedModel +from .sharded_model_v2 import ShardedModelV2 + +__all__ = ['ShardedModel', 'ShardedModelV2'] \ No newline at end of file diff --git a/colossalai/zero/sharded_model/_zero3_utils.py b/colossalai/zero/sharded_model/_zero3_utils.py new file mode 100644 index 000000000..b10534c9a --- /dev/null +++ b/colossalai/zero/sharded_model/_zero3_utils.py @@ -0,0 +1,124 @@ + +from collections import OrderedDict +from typing import Any, Callable, Dict, List, Tuple, Union + +import torch +import torch.nn.functional as F + + +def get_gradient_predivide_factor(world_size: int) -> float: + factor: int = 1 + while world_size % factor == 0 and world_size / factor > factor: + factor *= 2 + return float(factor) + + +def get_shard(tensor: torch.Tensor, rank: int, world_size: int) -> Tuple[torch.Tensor, int]: + """Return the local shard of a full tensor.""" + # Shard using torch.chunk to match all-gather/reduce-scatter. + chunks = list(torch.flatten(tensor).chunk(world_size)) + while len(chunks) < world_size: + chunks.append(chunks[0].new_empty(0)) + + # Determine number of padding elements. + num_to_pad = chunks[0].numel() - chunks[rank].numel() + assert num_to_pad >= 0, num_to_pad + + shard = chunks[rank].clone() + if num_to_pad > 0: + shard = F.pad(shard, [0, num_to_pad]) + return shard, num_to_pad + + +def free_storage(data: torch.Tensor) -> None: + """Free underlying storage of a Tensor.""" + if data.storage().size() > 0: + # Since we're modifying the Tensor's Storage directly, make sure the Tensor + # is the sole occupant of the Storage.
+ assert data.storage_offset() == 0 + data.storage().resize_(0) + + +@torch.no_grad() +def alloc_storage(data: torch.Tensor, size: torch.Size) -> None: + """Allocate storage for a tensor.""" + if data.storage().size() == size.numel(): # no need to reallocate + return + assert data.storage().size() == 0 + data.storage().resize_(size.numel()) + + +def cast_trensor_to_fp16(tensor: torch.Tensor) -> torch.Tensor: + if tensor.dtype is torch.float32: + out = tensor.half() + if tensor.is_leaf: + out.requires_grad = tensor.requires_grad + return out + return tensor + + +def cast_trensor_to_fp32(tensor: torch.Tensor) -> torch.Tensor: + if tensor.dtype is torch.float16: + out = tensor.float() + if tensor.is_leaf: + out.requires_grad = tensor.requires_grad + return out + return tensor + + +def apply_to_tensors(x: Any, fn: Callable): + if torch.is_tensor(x): + return fn(x) + elif isinstance(x, list): + return [apply_to_tensors(t, fn) for t in x] + elif isinstance(x, tuple): + return tuple(apply_to_tensors(t, fn) for t in x) + elif isinstance(x, dict): + return {key: apply_to_tensors(val, fn) for key, val in x.items()} + else: + return x + + +def cast_float_arguments(fn: Callable, *args: Any, **kwargs: Any) -> Tuple[Any, Any]: + return apply_to_tensors(args, fn), apply_to_tensors(kwargs, fn) + + +def chunk_and_pad(tensor: torch.Tensor, num_chunks: int) -> List[torch.Tensor]: + """Chunk a given Tensor into num_chunks parts and add any necessary padding.""" + chunks = list(torch.flatten(tensor).chunk(num_chunks)) + # torch.chunk may return fewer than num_chunks chunks, pad accordingly. + num_pad_for_partial_chunk = chunks[0].numel() - chunks[-1].numel() + if num_pad_for_partial_chunk > 0: + chunks[-1] = F.pad(chunks[-1], [0, num_pad_for_partial_chunk]) + if len(chunks) < num_chunks: + chunks.extend([torch.zeros_like(chunks[0]) for _ in range(num_chunks - len(chunks))]) + return chunks + + +def assert_in_engine(cond: Any, s: Any) -> None: + """Used in backward context to make sure error is printed.""" + if not cond: + print(s) + raise AssertionError + + +def replace_state_dict_prefix( + state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], old_prefix: str, new_prefix: str +) -> None: + """ + Replace all keys that match a given old_prefix with a new_prefix (in-place). 
+ + Usage:: + + state_dict = {"layer.xyz": torch.tensor(1)} + replace_state_dict_prefix(state_dict, "layer.", "module.layer.") + assert state_dict == {"module.layer.xyz": torch.tensor(1)} + """ + if old_prefix == new_prefix: + raise ValueError("old_prefix and new_prefix must be distinct") + for key in list(state_dict.keys()): + if not key.startswith(old_prefix): + continue + new_key = new_prefix + key[len(old_prefix):] + state_dict[new_key] = state_dict[key] + del state_dict[key] diff --git a/colossalai/zero/sharded_model/param_manager.py b/colossalai/zero/sharded_model/param_manager.py new file mode 100644 index 000000000..7670ab3a4 --- /dev/null +++ b/colossalai/zero/sharded_model/param_manager.py @@ -0,0 +1,385 @@ +import os +from typing import Dict, List, Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed import ProcessGroup +from torch.nn.parameter import Parameter + +from ._zero3_utils import alloc_storage, free_storage, get_shard + +# TODO: Remove the toggle-enable_nccl_base_collectives in the future +if os.getenv("ENABLE_NCCL_BASE_COLLECTIVES", "1") == "0": + enable_nccl_base_collectives = False +else: + enable_nccl_base_collectives = True + +# TODO: add flatten params + + +class Zero3ParameterManager: + def __init__(self, + module: nn.Module, + process_group: Optional[ProcessGroup], + mixed_precision: bool = False, + flatten_parameters: bool = True, + compute_dtype: Optional[torch.dtype] = None, + compute_device: Optional[torch.device] = None, + offload_config: Optional[dict] = None + ) -> None: + """Manage parameter shards. We manage several attributes on each Parameter instance: + ``zero_is_sharded``: ``True`` if the Parameter is sharded or ``False`` + if the Parameter is intentionally not sharded (in which case we + will all-reduce grads for this param). + ``zero_orig_size``: the size of the original Parameter (before sharding) + ``zero_shard_padding``: the padding size. All paddings are right padding. + ``zero_fp32_shard``: a single shard of the parameters in full precision + (typically FP32, but this is dependent on the dtype of the model + as it's passed in by the user). This can be on CPU or GPU + depending on the value of *``offload_config``*. + ``zero_fp16_shard``: This will be a single shard of the parameters in FP16, used for all-gather. + This can be in FP16 or FP32 depending on the value of *``compute_dtype``* and + if params are offloaded to CPU. + ``zero_full_param_padded``: the full weight (padded to be evenly + divisible by ``world_size``), used for computation in the + forward and backward pass. This will be resized in place and + only materialized (via all-gather) as needed. + ``zero_cpu_grad``: the gradient saved on CPU. It's set only when using CPU offload. 
+ + :param module: original module + :type module: nn.Module + :param process_group: typically data parallel process group, defaults to None + :type process_group: Optional[ProcessGroup], optional + :param mixed_precision: whether to use mixed precision mode, defaults to False + :type mixed_precision: bool, optional + :param flatten_parameters: whether to flatten parameters, useless now, defaults to True + :type flatten_parameters: bool, optional + :param compute_dtype: the dtype of parameters when computing, defaults to None + :type compute_dtype: Optional[torch.dtype], optional + :param compute_device: the device of parameters when computing, defaults to None + :type compute_device: Optional[torch.device], optional + :param offload_config: offload config, defaults to None + :type offload_config: Optional[dict], optional + """ + self.process_group = process_group + self.shard_idx = process_group.rank() + self.num_shards = process_group.size() + self.mixed_precision = mixed_precision + self.compute_dtype = compute_dtype + self.compute_device = compute_device + self.offload_config = offload_config + + self._cpu_offload = offload_config.get('device', None) == 'cpu' if offload_config else False + + self.params: List[Parameter] = [] + for param in module.parameters(): + if not hasattr(param, 'zero_is_sharded'): + self.params.append(param) + + self._has_params = len(self.params) > 0 + self._has_sharded_params = False + # Flag to indicate if the full params are gathered. + self.has_full_params: bool = False + + self._shard_params() + # Maybe no need, reserve to prevent bugs + # self.delete_fp32_shards() + + self._streams: Dict[str, torch.cuda.Stream] = {} + + def _shard_params(self) -> None: + for p in self.params: + assert not hasattr(p, "zero_is_sharded") + assert p.is_floating_point() + if self.mixed_precision: + assert p.dtype == torch.float32 + + # If world_size is 1, then we all-reduce grads instead of sharding. + p.zero_is_sharded = self.num_shards > 1 + p.zero_orig_size = p.data.size() + + if not p.zero_is_sharded: + p.zero_shard_padding = 0 + continue + + # Replace p.data with the relevant shard. + orig_data = p.data + p.data, p.zero_shard_padding = get_shard(p.data, self.shard_idx, self.num_shards) + free_storage(orig_data) + + @torch.no_grad() + def reset_param_attr(self, p: Parameter, training: bool) -> None: + """This should be called by ``ZeroRedundancyLevel3Model._lazy_init()`` + """ + assert hasattr(p, 'zero_is_sharded') and hasattr(p, 'zero_orig_size') + if hasattr(p, 'zero_fp32_shard'): + return + + # A single shard of the parameters in full precision. + p.zero_fp32_shard = p.data + + if self.mixed_precision: + assert p.zero_fp32_shard.dtype == torch.float32 + + if self._cpu_offload: + assert p.zero_fp32_shard.device == torch.device('cpu') + # If we plan to keep the FP32 parameters on CPU, then pinning + # memory allows us to later use non-blocking transfers when moving + # the FP32 param shard to compute_device. + p.zero_fp32_shard = p.zero_fp32_shard.pin_memory() + p.data = p.zero_fp32_shard + + if self.mixed_precision or self._cpu_offload: + + # In mixed precision mode, we maintain a reduced precision + # (typically FP16) parameter shard on compute_device for performing + # the computation in the forward/backward pass. We resize the + # storage to size 0 at init (here) and re-materialize (by copying + # from _fp32_shard) as needed. If offloading params to CPU, the + # dtype of the fp16 shard will depend on the *`compute_dtype`*. 
+ p.zero_fp16_shard = torch.zeros_like( + p.zero_fp32_shard, device=self.compute_device, dtype=self.compute_dtype) + free_storage(p.zero_fp16_shard) + + if self.mixed_precision: + assert p.zero_fp32_shard.dtype == torch.float32 + + if not self.mixed_precision and not self._cpu_offload: + # use _fp32_shard if you are not in using mixed precision or + # offloading params and grads to CPU. + p.zero_fp16_shard = None + + # We also maintain a full-sized parameter of type self.compute_dtype + # (FP16 for mixed_precision or FP32 otherwise). We resize the + # storage to size 0 at init (here) and only materialize as needed. The + # storage may contain padding elements so that it is evenly divisible by + # world_size, although these padding elements will be removed before the + # relevant computation. + if p.zero_is_sharded: + p.zero_full_param_padded = torch.zeros( + p.data.numel() * self.num_shards, device=self.compute_device, dtype=self.compute_dtype + ) + free_storage(p.zero_full_param_padded) + + if self._cpu_offload and training: + p.zero_cpu_grad = torch.zeros_like(p.data, device='cpu').pin_memory() + + def setup_streams(self, streams): + self._streams = streams + + @torch.no_grad() + def rebuild_full_params(self, force_full_precision: bool = False) -> Optional[List[Tuple[torch.Tensor, bool]]]: + """ + Gather all shards of params. + + Note, this is idempotent if full params are already gathered. Callers + assume the idempotency. So please keep it that way. + + Args: + force_full_precision (bool, Optional): by default params will be gathered + in ``compute_dtype`` (e.g., FP16), unless *force_full_precision* is + ``True``, in which case they will be gathered in full precision + (e.g., FP32), possibly in fresh storage. The parameter that's being + rebuilt will end up in full precision as well. + + Returns: + A list of tuples, where the first element is the full-sized param + and the second element is a bool indicating if it's safe for the + caller to free the full-sized param. This will be ``None`` if + ``force_full_precision=False`` and the full params are already gathered. + """ + # Store tensor and free flag + output_tensors: List[Tuple[torch.Tensor, bool]] = [] + + def update_p_data(custom_output_tensor: Optional[torch.Tensor] = None) -> None: + """ + Helper function to update p.data pointer. + + Args: + custom_output_tensor (torch.Tensor, Optional): if not None, this + tensor contains the data we just gathered. + """ + if custom_output_tensor is not None: + assert p.zero_is_sharded + p.data = custom_output_tensor + output_tensors.append((p.data, True)) + elif not p.zero_is_sharded: + if (self.mixed_precision or self._cpu_offload) and not force_full_precision: + assert p.zero_fp16_shard is not None + p.data = p.zero_fp16_shard + output_tensors.append((p.data, True)) + else: + # Here p.data == p._fp32_shard, so it's not safe to free. + output_tensors.append((p.data, False)) + else: + p.data = p.zero_full_param_padded + output_tensors.append((p.data, True)) + # Trim any padding and reshape to match original size. + p.data = p.data[: p.zero_orig_size.numel()].view(p.zero_orig_size) + + if self._has_sharded_params: + # self.has_full_params flag can be out of sync if a shared param is + # sharded by another ZeroRedundancyLevel3Model instance. An example is that in eval case + # with reshard_after_forward=False but the sharing instance has + # reshard_after_forward=True. 
Then, on the second forward, the + # other instance can shard the shared param and but this instance + # can mistakenly think the full param is already gathered from the + # has_full_params flag. + # + # Therefore, we update the flag accordingly here. + self.has_full_params = not any(p.zero_full_param_padded.storage().size() == 0 for p in self.params) + + # Early exit if we already have full params and don't need full precision. + if self.has_full_params and not force_full_precision: + for p in self.params: + update_p_data() + return output_tensors + + self.has_full_params = True + + with torch.cuda.stream(self._streams["all_gather"]): + if (self.mixed_precision or self._cpu_offload) and not force_full_precision: + self.use_fp16_shards() + + if self._cpu_offload and force_full_precision: + # If the compute_dtype and storage dtype are the same, + # use pinned memory. Otherwise move p.data to the compute + # device. + if self.params[0].dtype == self.compute_dtype: + self.use_fp16_shards() + else: + for p in self.params: + p.data = p.data.to(self.compute_device) + + for p in self.params: + if not p.zero_is_sharded: # e.g., when world_size == 1 + update_p_data() + else: + # Skip if already built. Only shared param can be rebuilt multiple times. + # A corner case is p.zero_orig_size = (1,), which means the shape equality is + # not a perfect check. But we assume we don't share a param with shape (1,). + # if p.data.shape == p.zero_orig_size and hasattr(p, "zero_is_shared") and p.zero_is_shared: + # continue + # If self._cpu_offload and force_full_precision, we need to cast + # the FP32 CPU param to CUDA for the all-gather. + p_data = p.data.to(p.zero_full_param_padded.device, non_blocking=True) + + p_size = p.zero_full_param_padded.size() + assert p_size.numel() % self.num_shards == 0 + if self.mixed_precision and force_full_precision: + # Allocate fresh tensor in full precision since we are in + # mixed precision and full precision rebuild is asked. + output_tensor = p_data.new_zeros(p_size) + else: + if p.zero_full_param_padded.storage().size() != p_size.numel(): + # Allocate based on full size from all shards. + alloc_storage(p.zero_full_param_padded, size=p_size) + output_tensor = p.zero_full_param_padded + + # Fill output_tensor with (p.data for each shard in self.world_size) + if hasattr(dist, "_all_gather_base") and enable_nccl_base_collectives: + # New version of PyTorch has all_gather_base, which is faster than chunk and then all_gather. + dist._all_gather_base(output_tensor, p_data, group=self.process_group) + else: + chunks = list(output_tensor.chunk(self.num_shards)) + dist.all_gather(chunks, p_data, group=self.process_group) + + # Set p.data = output_tensor (with padding trimmed) + update_p_data(output_tensor) + + if (self.mixed_precision or self._cpu_offload) and not force_full_precision: + self.free_fp16_shards([p]) + + if self._cpu_offload and (self.params[0].dtype == self.compute_dtype): + self.free_fp16_shards([p]) + + torch.cuda.current_stream().wait_stream(self._streams["all_gather"]) + return output_tensors + + @torch.no_grad() + def use_full_params(self) -> None: + """ + Switch p.data pointers to use the full params. + + Note: this assumes full params are already gathered. + + Note: this might be called after full_params is already in used. So please + make sure it is idempotent in that case. 
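+ It only flips ``p.data`` pointers: sharded params point to the gathered ``zero_full_param_padded`` (trimmed to the original size), while unsharded params under mixed precision or CPU offload point to ``zero_fp16_shard``.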
+ """ + assert self.has_full_params + for p in self.params: + if not p.zero_is_sharded: + if self.mixed_precision or self._cpu_offload: + assert p.zero_fp16_shard is not None + assert p.zero_fp16_shard.storage().size() != 0 + p.data = p.zero_fp16_shard + else: + assert p.zero_full_param_padded.storage().size() != 0, f"{p.zero_orig_size} {id(self)}" + p.data = p.zero_full_param_padded[: p.zero_orig_size.numel()].view(p.zero_orig_size) + + @torch.no_grad() + def use_fp16_shards(self, params: Optional[List[Parameter]] = None) -> None: + """Cast FP32 param shard to FP16 for a list of params.""" + if params is None: + params = self.params + with torch.cuda.stream(self._streams["fp32_to_fp16"]): + for p in params: + assert p.zero_fp16_shard is not None + alloc_storage(p.zero_fp16_shard, size=p.zero_fp32_shard.size()) + p.zero_fp16_shard.copy_( + # If _cpu_offload is True, this will be non-blocking + # because _fp32_shard is pinned, otherwise it's a no-op. + p.zero_fp32_shard.to(p.zero_fp16_shard.device, non_blocking=True) + ) + p.data = p.zero_fp16_shard + torch.cuda.current_stream().wait_stream(self._streams["fp32_to_fp16"]) + + @torch.no_grad() + def use_fp32_shards(self, params: Optional[List[Parameter]] = None) -> None: + """Use FP32 shard for a list of params.""" + if params is None: + params = self.params + for p in params: + p.data = p.zero_fp32_shard + + @torch.no_grad() + def free_full_params(self, params: Optional[List[Parameter]] = None) -> None: + """Free up storage for full parameters.""" + if params is None: + params = self.params + self.has_full_params = False + current_stream = torch.cuda.current_stream() + for p in params: + if not p.zero_is_sharded: # e.g., world_size == 1 + if self.mixed_precision or self._cpu_offload: + self.free_fp16_shards([p]) + continue + # Don't let PyTorch reuse this memory until all work in the current + # stream is complete. + p.zero_full_param_padded.record_stream(current_stream) + # There may be external references to the Tensor Storage that we + # can't modify, such as references that are created by + # ctx.save_for_backward in the forward pass. Thus when we + # unshard parameters, we should reuse the original Tensor + # Storage object and unshard it in-place. For now, just resize + # the Storage to 0 to save memory. + free_storage(p.zero_full_param_padded) + + @torch.no_grad() + def free_fp16_shards(self, params: Optional[List[Parameter]] = None) -> None: + """Free storage for FP16 shards for a list of params.""" + if params is None: + params = self.params + current_stream = torch.cuda.current_stream() + for p in params: + if p.zero_fp16_shard is not None: + # zero_fp16_shard is allocated in "fp32_to_fp16" stream, so we can't + # free it until the work in the current stream completes. + p.zero_fp16_shard.record_stream(current_stream) + free_storage(p.zero_fp16_shard) + + def delete_fp32_shards(self) -> None: + for p in self.params: + if hasattr(p, 'zero_fp32_shard'): + del p.zero_fp32_shard # reset _init_param_attr diff --git a/colossalai/zero/sharded_model/reduce_scatter.py b/colossalai/zero/sharded_model/reduce_scatter.py new file mode 100644 index 000000000..25f76daf5 --- /dev/null +++ b/colossalai/zero/sharded_model/reduce_scatter.py @@ -0,0 +1,204 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+ +import functools +import os +from typing import Callable, Dict, List, Optional, Tuple + +import torch +import torch.distributed as dist +from torch import Tensor +from torch.distributed import ProcessGroup + +# TODO: Remove the toggle-enable_nccl_base_collectives when github open issue #801 is resolved. +if os.getenv("ENABLE_NCCL_BASE_COLLECTIVES", "1") == "0": + enable_nccl_base_collectives = False +else: + enable_nccl_base_collectives = True + + +class Bucket: + def __init__(self, shard_size: int, dtype: torch.dtype, device: torch.device, group: ProcessGroup): + self.buffer = torch.zeros((group.size(), shard_size), dtype=dtype, device=device) + self.group = group + self.offset = 0 + self.callbacks: List[Callable] = [] + self.output_shard = torch.zeros_like(self.buffer[0]) + + def flush(self) -> None: + """Flush content of the bucket.""" + if self.offset == 0: + assert len(self.callbacks) == 0 + return + # reduce-scatter bucket + if hasattr(dist, "_reduce_scatter_base") and enable_nccl_base_collectives: + dist._reduce_scatter_base( + self.output_shard[: self.offset], self.buffer[:, : self.offset].contiguous(), group=self.group + ) + else: + dist.reduce_scatter( + self.output_shard[: self.offset], list(self.buffer[:, : self.offset].unbind(0)), group=self.group + ) + # execute post-reduction callbacks + for callback_fn in self.callbacks: + callback_fn() + # reuse input bucket but allocate a fresh output shard + self.buffer[:, : self.offset].zero_() + self.offset = 0 + self.callbacks.clear() + self.output_shard = torch.zeros_like(self.buffer[0]) + + def alloc(self) -> None: + """Setup the buffers if they are not allocated. + + Using ``setup`` and ``teardown``, we can ensure that the bucket + buffers are only allocated during the backward pass, hence saving more + memory to other parts of the training process, such as the forward pass + for activation memory. + """ + for tensor in [self.buffer, self.output_shard]: + if tensor.storage().size() == 0: + tensor.storage().resize_(tensor.size().numel()) + + def free(self) -> None: + """Tear down the bucket by freeing the memory""" + assert self.offset == 0 and self.callbacks == [], "Incorrect call of teardown" + for tensor in [self.buffer, self.output_shard]: + tensor.storage().resize_(0) + + def append(self, tensor_list: List[Tensor], callback_fn: Callable): + # copy data from input_list into bucket + tensor_size = tensor_list[0].numel() + stacked_input = torch.stack(tensor_list).view(self.group.size(), tensor_size) + offset = self.offset + self.buffer[:, offset: offset + tensor_size].copy_(stacked_input) + self.offset += tensor_size + + # callback will be given the reduced result + if callback_fn is not None: + result_view = self.output_shard[offset: offset + tensor_size].view_as(tensor_list[0]) + self.callbacks.append(functools.partial(callback_fn, result_view)) + + +class ReduceScatterBucketer: + """ + Helper for bucketing multiple reduce-scatter operations on small tensors + into larger reduce-scatter ops to improve communication efficiency. 
+ + Usage:: + + bucketer = ReduceScatterBucketer() + bucketer.reduce_scatter_async( + small_tensors, callback_fn=lambda result: print("small") + ) + bucketer.reduce_scatter_async( + big_tensors, callback_fn=lambda result: print("big") + ) + bucketer.reduce_scatter_async( + more_small_tensors, callback_fn=lambda result: print("small2") + ) + bucketer.flush() # callbacks only guaranteed to be called after flush() + # Example output (note that it is out of order, due to bucketing): + # big + # small + # small2 + + Args: + bucket_size_mb (int, Optional): bucket size for communicating. Buckets + are sub-divided based on world_size. Values <= 0 disable bucketing. + """ + + def __init__(self, bucket_size_mb: int = 25): + self.bucket_size_mb = bucket_size_mb + self.buckets: Dict[Tuple[torch.dtype, torch.device, ProcessGroup], Bucket] = {} + + @torch.no_grad() + def reduce_scatter_async( + self, + input_list: List[Tensor], + group: ProcessGroup, + callback_fn: Optional[Callable] = None, + ) -> None: + """ + Reduce-scatter a list of tensors asynchronously, so smaller reductions + can be bucketed together. The given callback (``callback_fn``) will be + called with the reduced result at some later time. Call ``flush()`` to + force all queued ops and callbacks to be executed. + + Note that large inputs will be reduced immediately, and this function + may also flush the relevant bucket to make room for ``input_list``. + + Args: + input_list (List[Tensor]): list of tensors to reduce-scatter. List + should contain ``group.size()`` tensors and each tensor should + have identical shape, dtype and device. + group (ProcessGroup): process group for reduction + callback_fn (Callable, Optional): callback function to call after + the reduction executes. Function will be called with a single + argument corresponding to the reduced result. + """ + world_size = group.size() + + assert ( + len(input_list) == world_size + ), f"reduce_scatter received {len(input_list)} inputs, expected group.size() ({world_size})" + + first_input = input_list[0] + first_input_size = first_input.numel() + + bucket_shard_size = self._get_shard_size(first_input.element_size(), world_size) + if first_input_size > bucket_shard_size: + # TODO: investigate how to avoid using torch.cat (because it seems to be slow for CPU tensors) + # input is too big to fit in the bucket, reduce-scatter directly + output = torch.zeros_like(input_list[0]) + if hasattr(dist, "_reduce_scatter_base") and enable_nccl_base_collectives: + input_flattened = torch.cat(input_list) + dist._reduce_scatter_base(output, input_flattened, group=group) + else: + # fallback + dist.reduce_scatter(output, input_list, group=group) + if callback_fn is not None: + callback_fn(output) + return + + bucket = self._get_bucket(first_input, group) + if first_input_size > bucket.buffer.size(1) - bucket.offset: + # not enough space remaining in bucket, flush it now + bucket.flush() + bucket.append(input_list, callback_fn) + + @torch.no_grad() + def flush(self) -> None: + """Reduce-scatter any partial buckets.""" + for bucket in self.buckets.values(): + bucket.flush() + + @torch.no_grad() + def free(self) -> None: + """Free buffers from all buckets.""" + for bucket in self.buckets.values(): + bucket.free() + + @functools.lru_cache() + def _get_shard_size(self, element_size: int, num_shards: int) -> int: + if self.bucket_size_mb <= 0: # Values <= 0 disable bucketing. 
+ return 0 + MB = 1024 * 1024 + bucket_size = self.bucket_size_mb * MB / element_size + return int(bucket_size // num_shards) + + def _get_bucket(self, tensor: Tensor, group: ProcessGroup) -> Bucket: + # TODO (Min): the `group` used here in the key is the object hash, not the content + # hash. That means if FSDP instances are initialized with different process groups, + # even when the group members are in fact the same, we end up creating different + # buckets here. + key = (tensor.dtype, tensor.device, group) + if key not in self.buckets: + # buckets are divided into world_size pieces, bucket.data shaped (world_size, shard_size) + world_size = group.size() + shard_size = self._get_shard_size(tensor.element_size(), world_size) + self.buckets[key] = Bucket(shard_size, tensor.dtype, tensor.device, group) + self.buckets[key].alloc() + return self.buckets[key] diff --git a/colossalai/zero/sharded_model/sharded_model.py b/colossalai/zero/sharded_model/sharded_model.py new file mode 100644 index 000000000..d4765391e --- /dev/null +++ b/colossalai/zero/sharded_model/sharded_model.py @@ -0,0 +1,1100 @@ +import contextlib +import copy +import functools +import os +import traceback +from collections import OrderedDict +from enum import Enum, auto +from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional, + Set, Union) + +import torch +import torch.distributed as dist +import torch.nn as nn +from colossalai.context.parallel_mode import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.logging import get_dist_logger +from colossalai.utils import get_current_device +from .param_manager import Zero3ParameterManager +from torch.autograd import Variable +from torch.distributed import ProcessGroup +from torch.nn.parameter import Parameter + +from ._zero3_utils import (apply_to_tensors, assert_in_engine, + cast_float_arguments, cast_trensor_to_fp16, + cast_trensor_to_fp32, chunk_and_pad, free_storage, + get_gradient_predivide_factor, get_shard, + replace_state_dict_prefix) +from .reduce_scatter import ReduceScatterBucketer + +# TODO: Remove the toggle-enable_nccl_base_collectives in the future +if os.getenv("ENABLE_NCCL_BASE_COLLECTIVES", "1") == "0": + enable_nccl_base_collectives = False +else: + enable_nccl_base_collectives = True + + +class TrainingState(Enum): + IDLE = auto() + FORWARD = auto() + PRE_BACKWARD = auto() + POST_BACKWARD = auto() + GATHER_FULL_PARAMS = auto() + +# TODO: Add clip_grad_norm_ +# TODO: Add gather_full_optim_state_dict and get_shard_from_optim_state_dict + + +class ShardedModel(nn.Module): + def __init__(self, + module: nn.Module, + process_group: Optional[ProcessGroup] = None, + reduce_scatter_process_group: Optional[ProcessGroup] = None, + reshard_after_forward: bool = True, + disable_reshard_on_root: bool = True, + mixed_precision: bool = False, + fp32_reduce_scatter: bool = False, + flatten_parameters: bool = True, + compute_dtype: Optional[torch.dtype] = None, + buffer_dtype: Optional[torch.dtype] = None, + reduce_scatter_bucket_size_mb: int = 25, + compute_device: Optional[torch.device] = None, + no_broadcast_optim_state: Optional[bool] = False, + state_dict_device: Optional[torch.device] = None, + clear_autocast_cache: bool = False, + force_input_to_fp32: bool = False, + verbose: bool = False, + offload_config: Optional[dict] = None, + state_dict_on_rank_0_only: bool = False, + gradient_predivide_factor: Optional[float] = 1.0) -> None: + super().__init__() + self.logger = get_dist_logger() + + self.process_group = 
process_group or gpc.get_group(ParallelMode.DATA) + self.reduce_scatter_process_group = reduce_scatter_process_group or self.process_group + self.world_size = dist.get_world_size(self.process_group) + self.rank = dist.get_rank(self.process_group) + + self.reshard_after_forward = self._orig_reshard_after_forward = reshard_after_forward + self.disable_reshard_on_root = disable_reshard_on_root + self.mixed_precision = mixed_precision + self.fp32_reduce_scatter = fp32_reduce_scatter + self.offload_config = offload_config + self.compute_dtype = compute_dtype or (torch.float16 if mixed_precision else torch.float32) + self.buffer_dtype = buffer_dtype or self.compute_dtype + self.reduce_scatter_bucket_size_mb = reduce_scatter_bucket_size_mb + self.compute_device = compute_device or torch.device(f'cuda:{get_current_device()}') + self.uncollected_opt_state: Dict[int, Dict] = {} + self.no_broadcast_optim_state = no_broadcast_optim_state + self.state_dict_device = state_dict_device or self.compute_device + self.clear_autocast_cache = clear_autocast_cache + self.force_input_to_fp32 = force_input_to_fp32 + self.verbose = verbose + self.state_dict_on_rank_0_only = state_dict_on_rank_0_only + + self._cpu_offload = offload_config.get('device', None) == 'cpu' if offload_config else False + + # We find if gradient_predivide_factor != 1.0, there may be wrong precision problem + # So we use 1.0 as the default gradient_predivide_factor + # However, if you set gradient_predivide_factor to None, we will set gradient_predivide_factor to a value >= 1.0 automatically + self.gradient_predivide_factor: float = gradient_predivide_factor if gradient_predivide_factor is not None else \ + get_gradient_predivide_factor(self.world_size) + self.gradient_postdivide_factor: float = self.world_size / self.gradient_predivide_factor + + self._check_sanity() + + self.params: List[Parameter] = [] + + for name, param in module.named_parameters(): + if not hasattr(param, 'zero_is_sharded'): + self.params.append(param) + + self.module = module + + self.param_manager = Zero3ParameterManager(module, process_group=self.process_group, mixed_precision=self.mixed_precision, + flatten_parameters=flatten_parameters, compute_dtype=self.compute_dtype, compute_device=self.compute_device, + offload_config=offload_config) + + self._reset_lazy_init_info() + + # Flag to indicate if we require gradient reduction in the backward + # pass. This will be False when inside the no_sync context manager. + self._require_backward_grad_sync: bool = True + + # Enum to indicate if we're in the forward/backward pass, idle, etc. + self.training_state = TrainingState.IDLE + + # Register hook after state_dict() to remove the "_zero3_module." + # prefix and before load_state_dict() to add it back. + self._register_state_dict_hook(functools.partial(_post_state_dict_hook, self.state_dict_on_rank_0_only)) + self._register_load_state_dict_pre_hook(_pre_load_state_dict_hook) + + # Flag to indicate whether state_dict() should automatically gather the full params. + self._return_full_state_dict = True + + # Flag to guard against preparing gradients multiple times per iteration. + # This is reset at the end of the backward pass. + self._pre_backward_hook_has_run = False + + def forward(self, *args: Any, **kwargs: Any) -> torch.Tensor: + self._lazy_init() + + # Start of a forward pass. + self.training_state = TrainingState.FORWARD + + # For root and mixed precision, we convert the input to FP16 (no_grad is needed for + # the conversion). 
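+        # Note (illustrative, based on how ``cast_float_arguments`` is used here): the cast is
+        # expected to apply only to floating-point tensors found in ``args``/``kwargs`` and to
+        # pass everything else through unchanged, e.g. (x_fp32, mask_bool) -> (x_fp16, mask_bool).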
+ if self._is_root and self.mixed_precision: + args, kwargs = cast_float_arguments(cast_trensor_to_fp16, *args, **kwargs) + + # If enabled, convert the input to FP32 if we are in full precision. + # no_grad is not used because the input might be for a non-root instance, + # which mean autograd needs to go through the conversion. + if self.force_input_to_fp32 and not self.mixed_precision: + args, kwargs = cast_float_arguments(cast_trensor_to_fp32, *args, **kwargs) + + # All-gather full parameters. This will also transfer FP32 parameters to + # ``self.compute_dtype`` (e.g., FP16 if *mixed_precision* is ``True``). + self.param_manager.rebuild_full_params() + + # Register backward hooks to reshard params and reduce-scatter grads. + # These need to be re-registered every forward pass. + self._register_post_backward_hooks() + + outputs = self.module(*args, **kwargs) + + if self.reshard_after_forward: + self.param_manager.free_full_params() + if self.mixed_precision or self._cpu_offload: + self.param_manager.free_fp16_shards() + + # Switch to main FP32 param shard. We maintain this invariant throughout + # the code, i.e., ``p.data == p.zero_fp32_shard`` after each function. This + # also ensures that after the first forward, the optimizer state will be + # initialized with the correct dtype and (sharded) size, since optimizer + # state is typically initialized lazily in ``optim.step()``. + self.param_manager.use_fp32_shards() + + # Register pre-backward hooks to all-gather the params for the backward + # pass (if output's grad was needed). This won't register anything if + # we are in eval mode. + # + # Some model does forward pass multiple times, we need to register the + # pre-backward hook on every output since the last output's hook has to + # fire first to setup for backward. However, we use ``self._pre_backward_hook_has_run`` + # to prevent repeated overhead from multiple hook callbacks. + outputs = self._register_pre_backward_hooks(outputs) + + # Done with a forward pass. + self.training_state = TrainingState.IDLE + + # Only need to clear cache during forward. During backward, the cache is not used. + if self.clear_autocast_cache: + torch.clear_autocast_cache() + + return outputs + + def _check_sanity(self) -> None: + if self.fp32_reduce_scatter and not self.mixed_precision: + raise ValueError("fp32_reduce_scatter requires mixed_precision=True") + if self.compute_device.type == 'cuda': + input_tensor = torch.ones(1).to(self.compute_device) + output = list(torch.zeros(self.world_size).to(self.compute_device).chunk(self.world_size)) + dist.all_gather(output, input_tensor, group=self.process_group) + assert torch.cat(output).sum() == float(self.world_size), ( + f"found {torch.cat(output).sum()} devices in process group but " + f"world_size={self.world_size}. Check torch.cuda.set_device is called properly" + ) + + def _reset_lazy_init_info(self) -> None: + self._is_root: Optional[bool] = None + self._streams: Dict[str, torch.cuda.Stream] = {} + self._reducer: Optional[ReduceScatterBucketer] = None + self.param_manager.delete_fp32_shards() + self._output_pre_backward_hook_registered: Optional[List] = None + self.reshard_after_forward = self._orig_reshard_after_forward + + def _lazy_init(self): + # Initialize param attributes lazily, in case the param's dtype or + # device changes after __init__. + for p in self.params: + self.param_manager.reset_param_attr(p, self.training) + + # Initialize _is_root and setup streams. 
These steps would ideally + # happen in __init__, but _is_root can only be determined after the + # entire model hierarchy is setup, thus we run it lazily. + if self._is_root is None: + self._set_is_root() + self._setup_streams() + self._setup_output_hook_list() + + if self._is_root: + # Buffers stay on GPU, and don't get sharded. Since _cast_buffers + # applies recursively, we only call this from the root instance. + self._cast_buffers() + + if self.disable_reshard_on_root: + # Don't free the full params for the outer-most (root) instance, + # since those params will be needed immediately after for the + # backward pass. + self.reshard_after_forward = False + + # Due to the use of streams, we need to make sure the previous + # ``optim.step()`` is done before we all-gather parameters. + self._wait_for_previous_optim_step() + + def _set_is_root(self) -> None: + """If ``True``, implies that no other :class:`ShardedModel` + instance wraps this one. Called once by :func:`_lazy_init`. + Also sets self.children_share_process_group = True if all child + instances share the same process group. If some child instances use a + different process group, self.clip_grad_norm_ will raise an error. + """ + if self._is_root is not None: + return + # No Zero3Model instance wraps this, else _is_root would be set to False. + self._is_root = True + # If final backward callback is never been queued, state should be IDLE. + # If final backward callback is queued, the callback should be finished + # and the state was reset to be IDLE. + # This should be asserted at the beginning of forward pass in the root instance only. + # For children instances, if they are checkpointed, state will not be reset to + # IDLE after each inner forward/backward. + self._assert_state(TrainingState.IDLE) + # As the root, we now set all children instances to False and + # give them a closure to try to queue a wait_for_post_backward. + self.children_share_process_group = True + for n, m in self.named_modules(): + # `n != ""` excludes self. + if n != '' and isinstance(m, ShardedModel): + # We relax the assert for non-root instance, when the nested inialized module is wrapped + # again in ShardedModel later, for example after training to run inference. + assert m._is_root is None or not m._is_root + if m._is_root is None: + m._is_root = False + if m.process_group != self.process_group: + self.children_share_process_group = False + + # if child instance in its own (smaller) world, that was probably an attempt to avoid OOM. + # Therefore gathering this child's optim state will probably cause OOM, so we won't do it. + m.no_broadcast_optim_state = m.no_broadcast_optim_state or ( + (m.world_size == 1) and (m.world_size < self.world_size) and (m.process_group != self.process_group) + ) + + def _setup_streams(self) -> None: + """Create streams to overlap data transfer and computation.""" + if len(self._streams) > 0 or not self._is_root: + return + + if torch.cuda.is_available(): + # Stream to move main FP32 params (may be on CPU) to FP16 for forward. + self._streams['fp32_to_fp16'] = torch.cuda.Stream() + # Stream for all-gathering parameters. + self._streams['all_gather'] = torch.cuda.Stream() + # Stream for overlapping grad reduction with the backward pass. + self._streams['post_backward'] = torch.cuda.Stream() + + self.param_manager.setup_streams(self._streams) + # Helper for bucketing reduce-scatter ops. This is also shared with + # children instances to improve bucket utilization. 
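+        # For a sense of scale (worked example from ReduceScatterBucketer._get_shard_size): with
+        # the default reduce_scatter_bucket_size_mb=25, fp16 gradients (element_size=2) and a
+        # data-parallel world size of 8, each rank's bucket shard holds
+        # 25 * 1024 * 1024 / 2 // 8 = 1_638_400 elements, i.e. the bucket buffer is shaped
+        # (8, 1_638_400) and occupies roughly 25 MiB in total.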
+ self._reducer = ReduceScatterBucketer(self.reduce_scatter_bucket_size_mb) + # We share streams with all children instances, which allows them to + # overlap transfers across the forward pass without synchronizing with + # the default stream. + for n, m in self.named_modules(): + if n != "" and isinstance(m, ShardedModel): + m._streams = self._streams + m._reducer = self._reducer + m.param_manager.setup_streams(self._streams) + + def _setup_output_hook_list(self) -> None: + """set up a list to avoid registering pre-backward hooks + incorrectly. + """ + assert self._is_root, "This should only be called on the root" + self._output_pre_backward_hook_registered = [] + for n, m in self.named_modules(): + if n != "" and isinstance(m, ShardedModel): + m._output_pre_backward_hook_registered = self._output_pre_backward_hook_registered + + def _wait_for_previous_optim_step(self) -> None: + """ + The outer-most :class:`ShardedModel` instance (i.e., the root + instance) needs to synchronize with the default stream to ensure the + previous optimizer step is done. + """ + if not torch.cuda.is_available(): + return + if self.mixed_precision or self._cpu_offload: + self._streams["fp32_to_fp16"].wait_stream(torch.cuda.current_stream()) + else: + self._streams["all_gather"].wait_stream(torch.cuda.current_stream()) + + def _cast_buffers( + self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, memo: Optional[Set] = None + ) -> None: + """Move all buffers to the given *device* and *dtype*. + + If *device* or *dtype* are not given, then they will default to + ``self.compute_device`` and ``self.buffer_dtype``, respectively. In the + case of nested ShardedModel instances, we will respect the child instance's + ``compute_device`` and ``buffer_dtype`` configuration. + + Args: + device (torch.device, Optional): + device to cast buffers to (defaults to compute_device) + dtype (torch.dtype, Optional): + dtype to cast buffers to (defaults to buffer_dtype) + memo (Set, Optional): + set of modules that have already been processed + """ + if memo is None: + memo = set() + for module in self.modules(): + if module is not self and isinstance(module, ShardedModel): + # Allow any child Zero3Model instances to handle their own buffers. + module._cast_buffers(device=device, dtype=dtype, memo=memo) + elif module not in memo: + memo.add(module) + for name, buf in module.named_buffers(recurse=False): + if buf is None: + continue + buf = buf.to(device=device or self.compute_device) + if torch.is_floating_point(buf): + buf = buf.to(dtype=dtype or self.buffer_dtype) + setattr(module, name, buf) + + @torch.no_grad() + def _prep_grads_for_backward(self) -> None: + """Make sure p.grad is correctly prepared for the backward with + right shape, device, accumulated values, etc. + """ + for p in self.params: + if p.grad is not None: + if p.grad.device != p.data.device: + p.grad = None + elif p.grad.size() == p.zero_orig_size: + if not p.zero_is_sharded: + p.zero_saved_grad = p.grad.data + p.grad = None + else: + # This is gradient accumulation with no_sync context. + pass + elif p.grad.size() == p.zero_fp32_shard.shape: + # This is gradient accumulation without no_sync context. + # We save the grad shard and set p.grad to None for this backward pass. + # We will accumulate after this pass's grad is generated and reduced and + # sharded. 
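+                    # Illustrative example (shard sizes are approximate, up to padding): with
+                    # world_size=2, a parameter whose zero_orig_size is 10 elements keeps a
+                    # 5-element fp32 shard per rank, so a grad that matches the shard shape is
+                    # taken to be a previously reduced-and-sharded gradient being accumulated.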
+ p.zero_saved_grad_shard = p.grad.data + p.grad = None + else: + raise AssertionError(f"unexpected grad shape: {p.grad.size()}") + + def _register_pre_backward_hooks(self, outputs: Any) -> Any: + """Register pre-backward hook to run before the wrapped module's + backward. Hooks should be attached to all outputs from the forward. + + Returns: + outputs: new outputs with hooks registered if they requires gradient. + """ + if not torch.is_grad_enabled(): + return outputs # don't register hooks if grad isn't enabled + + if self._is_root: + # This actually means that only root instance has + # _post_backward_callback_queued defined. Accidentally accessing this field + # will assert on all other instances, giving us a nice bug checker. + self._post_backward_callback_queued = False + + def _pre_backward_hook(*unused: Any) -> None: + # try to queue final backward callback only once for root, so + # that final backward callback is attached to the outer most + # backward graph task and called after all the backward + # calls are completed. + if self._is_root: + self._register_final_backward_hook() + + # All-gather full parameters or switching to the full params. + # + # This needs to be done on every pre_backward hook, even within the same + # iteration (i.e. for checkpointed, multiple forward pass modules). This is + # because after the forward pass (i.e. in checkpoint inner graph), we always + # switch to fp32_shard in the ``forward`` function. + # + # We used to do this only after the ``self._pre_backward_hook_has_run`` + # boolean guard below, which is incorrect. It worked in pytorch < 1.9 for + # some unknown reason, but pytorch 1.10 nightly exposed this bug. + # + # Note, both ``self.param_manager.rebuild_full_params`` and ``self.param_manager.use_full_params`` are + # idempotent. So in case they are called unnecessarily, they don't incur much + # overhead. + if self.reshard_after_forward: + self.param_manager.rebuild_full_params() + else: + self.param_manager.use_full_params() + + # Only run the ``self._prep_grads_for_backward`` once per iteration (i.e. in case + # it is multiple outputs or multiple forward passes). + if not self._pre_backward_hook_has_run: + self._pre_backward_hook_has_run = True + # Start of a backward pass for the first time in an iteration. + self._assert_state([TrainingState.IDLE, TrainingState.PRE_BACKWARD]) + # Prepare p.grad so that it is in the right shape, device, accumulated values, etc. + self._prep_grads_for_backward() + + # Transition to PRE_BACKWARD state if currently IDLE. We can transition from POST_BACKWARD + # to IDLE when ShardedModel is within activation checkpointing and called multiple times, due to the + # extra forward pass for re-computation. + if self.training_state == TrainingState.IDLE: + self.training_state = TrainingState.PRE_BACKWARD + self._assert_state([TrainingState.PRE_BACKWARD, TrainingState.POST_BACKWARD]) + + _registered = 0 + + def _register_hook(t: torch.Tensor) -> torch.Tensor: + # We don't register the pre_backward hook on the same tensor that has been + # returned from an inner ShardedModel, unless it is the first one. This does + # not cover all problematic cases though. 
A tensor not from an inner + # ShardedModel can cause problems too: + # ``` + # x = layer1(input) + # state = [x] # better change to x.detach(), not fixed by the following if-condition + # x = inner_zero3_module_layer2(x) + # state.append(x) # better change to x.detach(), but fixed by the following if-condition + # x = layer3(x) + # return x, state + # ``` + # The tensors in `state`, if not detached, can be registered with + # backward hooks (in addition to the `x` on the last line). In that case, + # pre-backward hook can fire multiple times in the order that causes + # the outer ShardedModel to crash. + # + # The best practice is for modules to be wrapped by ShardedModel to return 1 and only + # 1 tensor to be used for backward. All other tensors returned should be + # detached. + nonlocal _registered + assert self._output_pre_backward_hook_registered is not None + if t.requires_grad and (_registered == 0 or id(t) not in self._output_pre_backward_hook_registered): + t.register_hook(_pre_backward_hook) + self._output_pre_backward_hook_registered.append(id(t)) + _registered += 1 + return t + + # Attach hooks to Tensor outputs. + outputs = apply_to_tensors(outputs, _register_hook) + + return outputs + + def _register_post_backward_hooks(self) -> None: + """ + Register backward hooks to reshard params and reduce-scatter grads. + + This is called during forward pass. The goal is to attach a hook + on each of the parameter's gradient generating function (``grad_acc`` + below) so that the hook is called *after* all gradients for that + param are computed. + + Goals: + + 1. We want the hook to fire once and only once *after* all gradients + are accumulated for a param. + 2. If it fires more than once, we end up incorrectly shard the grad + multiple times. (could lead to dimension too small) + 3. If it fires once but too early or doesn't fire, we leave gradients + unsharded. (could lead to dimension too large) + + Due to multiple-pass forward, this function can be called on + the same parameter multiple times in a single forward pass. If we register + the hook multiple time, we end up getting called multiple times. We + could try to get a new hook every time and delete the previous one + registered. However, due to *unknown reason* (I have debugged it for + a long time!), in mixed precision mode, we get two different ``grad_acc`` + objects below during different calls of this function (in the same + forward pass). If we keep the last one, the hook end up firing too + early. In full precision mode, we luckily get the *same* ``grad_acc`` + object, so deleting and re-registering still ensured the hook fire + once after all gradients are generated. However, we find if we use activation + checkpoint in mixed precision mode, hook on ``grad_acc`` object won't be + fire for *unknown reason*. So we finally register hook on parameter directly. + + Empirically, keep the first hook register per forward pass seems to + work the best. We do need to remove the hook at the end of the + backward pass. Otherwise, the next forward pass will not register + a new hook, which is needed for a new forward pass. 
+ """ + if not torch.is_grad_enabled(): + return # don't register grad hooks if grad isn't enabled + for p in self.params: + if p.requires_grad: + if hasattr(p, "zero_shard_bwd_hook"): + continue + # For mixed precision with activation checkpoint, hooks on GradAccumulation won't be fired normally + # Instead we register hook on parameter + # In this way, we can't modify param.grad and param.data directly, which leads to more memory usage + # Register a hook on the first call, empirically, autograd + # fires it at the end for this param, which makes sense. + # p_tmp = p.expand_as(p) # Get a grad_fn on p_tmp. + # assert p_tmp.grad_fn is not None + # grad_acc = p_tmp.grad_fn.next_functions[0][0] # Gets its GradAccumulation object. + # handle = grad_acc.register_hook(functools.partial(self._post_backward_hook, p)) + # p.zero_shard_bwd_hook = (grad_acc, handle) + handle = p.register_hook(functools.partial(self._post_backward_hook, p)) + p.zero_shard_bwd_hook = handle + + @torch.no_grad() + def _post_backward_hook(self, param: Parameter, grad: torch.Tensor) -> Optional[torch.Tensor]: + """ + At the start of :func:`_post_backward_hook`, ``param.grad`` contains the + full gradient for the local batch. The reduce-scatter op will replace + ``param.grad`` with a single shard of the summed gradient across all + GPUs. This shard will align with the current GPU rank. For example:: + + before reduce_scatter: + param.grad (GPU #0): [1, 2, 3, 4] + param.grad (GPU #1): [5, 6, 7, 8] + + after reduce_scatter: + param.grad (GPU #0): [6, 8] # 1+5, 2+6 + param.grad (GPU #1): [10, 12] # 3+7, 4+8 + + The local GPU's ``optim.step`` is responsible for updating a single + shard of params, also corresponding to the current GPU's rank. This + alignment is created by `param_manager`, which ensures that + the local optimizer only sees the relevant parameter shard. + """ + # First hook callback will see PRE state. If we have multiple params, + # then subsequent hook callbacks will see POST state. + self._assert_state([TrainingState.PRE_BACKWARD, TrainingState.POST_BACKWARD]) + self.training_state = TrainingState.POST_BACKWARD + if grad is None: + return + + assert grad is not None, param.shape + if grad.requires_grad: + raise RuntimeError("ShardedModel only works with gradients that don't require gradients") + + if self._require_backward_grad_sync or self.reshard_after_forward: + # Free full params. As a special case, we don't free the full params + # when in a ``no_sync`` context (as inversely indicated by + # ``self._require_backward_grad_sync``), since the params will not + # get updated before the next forward. This saves networking + # bandwidth but uses more GPU memory. + self.param_manager.free_full_params([param]) + + if self.mixed_precision: + # This is a no-op if reshard_after_forward is True, since we already + # free the param shard when rebuilding the full params in the + # pre_backward_hook. + self.param_manager.free_fp16_shards([param]) + + # Switch to FP32 shard after backward. + # Cannot modify param.data, so we switch to FP32 in final backward hook + # self.param_manager.use_fp32_shards([param]) + + if not self._require_backward_grad_sync: + return + + # Wait for all work in the current stream to finish, then start the + # reductions in post_backward stream. + self._streams["post_backward"].wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(self._streams["post_backward"]): + new_grad = grad.clone() + + if self.mixed_precision and self.fp32_reduce_scatter: + # Cast grad to FP32. 
+ new_grad.data = new_grad.data.to(param.dtype) + + if self.gradient_predivide_factor > 1: + # Average grad by world_size for consistency with PyTorch DDP. + new_grad.data.div_(self.gradient_predivide_factor) + + orig_grad_data = new_grad.data + if param.zero_is_sharded: + assert self._reducer is not None + # Save the unsharded grad for reduction. We will asynchronously accumulate the reduced gradient into + # param.zero_saved_grad_shard. If this ShardedModel module was called multiple times it's possible that multiple + # gradient reductions will happen in an undefined order. But addition commutes, so this order doesn't + # matter, neglecting rounding. + # Clear grad on the tensor, so any repeated gradient computations do not interfere with this reduction. + # + # The effect on memory consumption is not usually significant. No extra memory is allocated if this + # module is called only once, reduction happens quickly, or the tensor is bucketed. If the module is + # called multiple times, and the backwards pass runs far enough ahead of the `post_backward` stream, + # then we can end up with multiple unsharded gradients allocated and queued for reduction. + # + # We could guard against this by using CUDA events (see record_event, wait_event in torch.cuda.Stream). + # This ensures the `default` stream will wait for the `post_backward` stream to complete the last + # reduction for this module, before scheduling additional reduction work. Then at most there are two + # unsharded gradients allocated; one for a pending reduction, and one for gradient computation. + callback_fn = functools.partial(self._reduce_scatter_callback, param) + grad_chunks = chunk_and_pad(orig_grad_data, self.reduce_scatter_process_group.size()) + self._reducer.reduce_scatter_async( + grad_chunks, group=self.reduce_scatter_process_group, callback_fn=callback_fn + ) + else: + # Currently the only way for _is_sharded to be False is if + # world_size == 1. This could be relaxed in the future, in which + # case grads should be all-reduced here. + assert self.world_size == 1 + self._reduce_scatter_callback(param, new_grad) + + # After _post_backward_hook returns, orig_grad_data will eventually + # go out of scope, at which point it could otherwise be freed for + # further reuse by the main stream while the div/reduce_scatter/copy + # are underway in the post_backward stream. See: + # github.com/NVIDIA/apex/blob/master/apex/parallel/distributed.py + orig_grad_data.record_stream(self._streams["post_backward"]) + + def _reduce_scatter_callback(self, param: Parameter, reduced_grad: torch.Tensor) -> None: + """Hook to call on each param after the reduce-scatter.""" + assert torch.cuda.current_stream() == self._streams["post_backward"] + self._assert_state(TrainingState.POST_BACKWARD) + if self.gradient_postdivide_factor > 1: + # Average grad by world_size for consistency with PyTorch DDP. + reduced_grad.data.div_(self.gradient_postdivide_factor) + # Cast grad to param's dtype (typically FP32). Note: we do this + # before the cpu offload step so that this entire hook remains + # non-blocking. The downside is a bit more D2H transfer in that case. + if self.mixed_precision: + orig_param_grad_data = reduced_grad.data + reduced_grad.data = reduced_grad.data.to(dtype=param.zero_fp32_shard.dtype) + # Don't let this memory get reused until after the transfer. + orig_param_grad_data.record_stream(torch.cuda.current_stream()) + + if param.zero_is_sharded: + # Accumulate into the gradient shard. 
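+            # At this point ``reduced_grad`` already carries the scaling applied earlier in this
+            # callback: with the default gradient_predivide_factor of 1.0 and, say, world_size=4,
+            # the reduce-scattered sum has been divided by 4, so the shard holds the mean gradient
+            # across ranks, matching DDP. A predivide factor of 2 would instead split the division
+            # into /2 before reduction and /2 here.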
+ if getattr(param, "zero_saved_grad_shard", None) is None: + param.zero_saved_grad_shard = reduced_grad.data + else: + assert ( + param.zero_saved_grad_shard.shape == reduced_grad.shape + ), f"{param.zero_saved_grad_shard.shape} vs {reduced_grad.shape}" + param.zero_saved_grad_shard.data += reduced_grad.data + reduced_grad = param.zero_saved_grad_shard.data + else: + # We can't modify the dtype of grad in this function + # So we use `param.zero_saved_grad` to store gradient + # This is useful when using mixed precision mode on single node + if getattr(param, 'zero_saved_grad', None) is None: + param.zero_saved_grad = reduced_grad.data + else: + param.zero_saved_grad.data += reduced_grad.data + + # Optionally move gradients to CPU, typically used if one is running the optimizer on the CPU. Once the full + # backwards pass completes, we will set `.grad` to the CPU copy. + if self._cpu_offload: + param.zero_cpu_grad.copy_(reduced_grad.data, non_blocking=True) + # Don't let this memory get reused until after the transfer. + reduced_grad.data.record_stream(torch.cuda.current_stream()) + + def _register_final_backward_hook(self) -> None: + """Try to queue a `_final_backward_hook` callback. + + Only called on root and only queue one callback at the beginning of + outer most backward. + """ + assert self._is_root + if not self._post_backward_callback_queued: + self._assert_state([TrainingState.IDLE]) + self._post_backward_callback_queued = True + Variable._execution_engine.queue_callback(self._final_backward_hook) + + @torch.no_grad() + def _final_backward_hook(self) -> None: + """Wait for post-backward to finish. Only called on root instance.""" + # None, backward runtime swallow the assert error, so we use assert_in_engine() here. + assert_in_engine(self._is_root, "FinalBackwardHook not called on root") + # Check if the root module has params and if any of them has + # the `requires_grad` field set. If `requires_grad=False` for + # all the params, the post_backward hook will not fire and the + # state will remain in `TrainingState.PRE_BACKWARD`. + if any([p.requires_grad for p in self.params]): + self._assert_state(TrainingState.POST_BACKWARD) + else: + self._assert_state(TrainingState.PRE_BACKWARD) + self.param_manager.use_fp32_shards() + if self._require_backward_grad_sync: + # Flush any unreduced buckets in the post_backward stream. + with torch.cuda.stream(self._streams["post_backward"]): + assert_in_engine(self._reducer is not None, "FinalBackwardHook: reducer is None") + assert self._reducer is not None # make mypy happy + self._reducer.flush() + torch.cuda.current_stream().wait_stream(self._streams["post_backward"]) + if self._cpu_offload: + # Wait for the non-blocking GPU -> CPU grad transfers to finish. + torch.cuda.current_stream().synchronize() + + # A backward pass is done, clean up below. + # Free reducer buffers. + if self._reducer is not None: + self._reducer.free() + + def _finalize_parameters(zero_module: ShardedModel) -> None: + """Helper used below on all zero3 modules.""" + for p in zero_module.params: + if not p.requires_grad: + continue + if hasattr(p, "zero_shard_bwd_hook"): + p.zero_shard_bwd_hook.remove() + delattr(p, "zero_shard_bwd_hook") + + # Leave the gradient accumulation state as-is if not synchronizing this pass. This ensures p.grad + # remains the unsharded gradient accumulated from prior no-sync passes, and p.zero_saved_grad_shard + # remains the sharded gradient from the last synchronized pass. 
This also allows interleaved no-sync and + # sync passes, if desired. + if not self._require_backward_grad_sync: + continue + + # Parameter and gradient devices must match. + if hasattr(p, "zero_cpu_grad"): + assert_in_engine(p.device == torch.device("cpu"), + f"FinalBackwardHook: incorrect cpu_grad device {p.device}") + p.grad = p.zero_cpu_grad + elif hasattr(p, "zero_saved_grad_shard"): + assert_in_engine( + p.device == p.zero_saved_grad_shard.device, + f"FinalBackwardHook: incorrect saved_grad_shard device {p.device} vs {p.zero_saved_grad_shard.device}", + ) + p.grad = p.zero_saved_grad_shard + elif hasattr(p, 'zero_saved_grad'): + p.grad = p.zero_saved_grad + + if hasattr(p, "zero_saved_grad_shard"): + delattr(p, "zero_saved_grad_shard") + if hasattr(p, 'zero_saved_grad'): + delattr(p, "zero_saved_grad") + + # Update root and nested ShardedModel's hooks and flags. + for m in self.modules(): # includes self + if isinstance(m, ShardedModel): + _finalize_parameters(m) + m._pre_backward_hook_has_run = False + if any(p.requires_grad for p in m.parameters()): + # Check if the module has params and if any of them has + # the `requires_grad` field set. If `requires_grad=False` for + # all the params, the post_backward hook will not fire and the + # state will remain in `TrainingState.PRE_BACKWARD`. + if any([p.requires_grad for p in m.params]): + m._assert_state(TrainingState.POST_BACKWARD) + else: + m._assert_state(TrainingState.PRE_BACKWARD) + else: + # When `m` and its children has no params or has params but + # none with `requires_grad==True`, there are two cases: + # 1. output tensors are `requires_grad==True`. In this case, + # pre-backward hook is still registered, so it is in PRE_BACKWARD state. + # 2. output tensors are `requires_grad==False`. In this case, + # pre-backward hook is not registered, so it is in IDLE state. + m._assert_state([TrainingState.PRE_BACKWARD, TrainingState.IDLE]) + m.training_state = TrainingState.IDLE + + if m._is_root: + # reset this flag for cases like "one forward pass + multiple backward passes" + self._post_backward_callback_queued = False + # clear this list for next iteration + assert_in_engine( + self._output_pre_backward_hook_registered is not None, + "FinalBackwardHook: self._output_pre_backward_hook_registered should not be None", + ) + assert self._output_pre_backward_hook_registered is not None # make mypy happy + self._output_pre_backward_hook_registered.clear() + + @contextlib.contextmanager + def gather_full_params(self, recurse: bool = True, volatile: bool = False) -> Generator: + """ + A context manager to expose full params for the current ShardedModel instance. + Can be useful *after* forward/backward for a model to get the params for + additional processing or checking. Parameters will be gathered in full + precision (e.g., FP32). + + .. note:: This can be used on inner ShardedModels. + + .. note:: This can *not* be used within a forward or backward pass. Nor + can forward and backward be started from within this context. + + .. note:: The full parameters will be freed after the context manager + exits; it is up to the caller to clone them if needed. + + .. note:: The full parameters can be modified, but only the portion + corresponding to the local param shard will persist after the + context manager exits (unless ``volatile=True``, in which case there + are no guarantees about persistence). 
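+
+        A rough usage sketch (illustrative only; ``sharded_model`` is assumed to be a
+        ``ShardedModel`` instance that is idle, i.e. not inside forward/backward)::
+
+            with sharded_model.gather_full_params():
+                # parameters are temporarily unsharded and in full precision here;
+                # in-place edits to the local shard's portion persist after exit
+                for p in sharded_model.module.parameters():
+                    p.data.clamp_(-1.0, 1.0)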
+ + Args: + recurse (bool, Optional): recursively summon all params for nested + ShardedModel instances (default: True) + volatile (bool, Optional): if ``True``, modifications to params are + not guaranteed to persist after the context manager exists; + enabling this can be slightly more efficient (default: False) + """ + if recurse: + with contextlib.ExitStack() as stack: + # Summon all params for any nested Zero3Model instances. + for module in self.modules(): + if isinstance(module, ShardedModel): + stack.enter_context(module.gather_full_params(recurse=False, volatile=volatile)) + # Yield to the caller, with full params in all nested instances. + yield + # Exiting from the ExitStack will re-shard params. + return + else: + torch.cuda.synchronize() + self._lazy_init() + self._assert_state(TrainingState.IDLE) + # Set the state so that we assert when trying to go into + # forward/backward. + self.training_state = TrainingState.GATHER_FULL_PARAMS + full_tensors = self.param_manager.rebuild_full_params(force_full_precision=True) + assert full_tensors is not None + with contextlib.ExitStack() as stack: + try: + yield + finally: + stack.close() + for p, (full_tensor, safe_to_free) in zip(self.params, full_tensors): + if not volatile: + # Copy any changes made to the full params back into + # the corresponding local shards. + local_shard, _ = get_shard(full_tensor) + p.zero_fp32_shard.copy_(local_shard.view_as(p.zero_fp32_shard)) + if safe_to_free: + free_storage(full_tensor) + self.has_full_params = False + self.param_manager.use_fp32_shards() + self.training_state = TrainingState.IDLE + + def apply(self, fn: Callable[[nn.Module], None]) -> "ShardedModel": + """ + Applies ``fn`` recursively to every submodule (as returned by + ``.children()``) as well as self. Typical use includes initializing the + parameters of a model. + + Compared to ``torch.nn.Module.apply``, this version additionally gathers + the full parameters before applying ``fn``. It should not be called from + within another ``summon_full_params`` context. + + Args: + fn (nn.Module): function to be applied to each submodule + + Returns: + Module: self + """ + is_uninitialized = self._is_root is None + self._assert_state(TrainingState.IDLE) + with self.gather_full_params(recurse=False): + return_value = super().apply(fn) + # summon_full_params will call _lazy_init, which sets _is_root. However, + # apply() may be called directly on children instances to do weight + # init, so we should reset the _is_root flag in this case. + if is_uninitialized and self._is_root: + for module in self.modules(): + if isinstance(module, ShardedModel): + module._reset_lazy_init_info() + return return_value + + def __getattr__(self, name: str) -> Any: + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.module, name) + + def __getstate__(self) -> Dict[str, str]: + """Serialize the state. + + Some properties are not serializable (e.g., process groups, streams), so + we remove them and try to reconstruct them in :func:`__setstate__`. 
+ """ + state = copy.copy(self.__dict__) + state["is_sharded"] = [p.zero_is_sharded for p in self.params] + state["orig_sizes"] = [p.zero_orig_size for p in self.params] + if state["process_group"] is not None: + state["process_group"] = "MISSING" # process_group isn't pickleable + if state["process_group_reduce_scatter"] is not None: + state["process_group_reduce_scatter"] = "MISSING" # process_group_reduce_scatter isn't pickleable + self._reset_lazy_init_info() + return state + + def __setstate__(self, state: Dict[str, Any]) -> None: + """Intercept state setting and perform needed changes on params.""" + super().__setstate__(state) + + def fixup(p: Parameter, is_sharded: bool, size: torch.Size) -> Parameter: + assert isinstance(p, Parameter) + p.data = p.data.clone() # move tensors out of shared memory + p.zero_is_sharded = is_sharded + p.zero_orig_size = size + return p + + self.params = [ + fixup(p, is_sharded, size) for p, is_sharded, size in zip(self.params, self.is_sharded, self.orig_sizes) + ] + del self.is_sharded + del self.orig_sizes + self._reset_lazy_init_info() + + def __getitem__(self, key: int) -> Any: + """Forward indexing calls in case the module is a nn.Sequential.""" + return self.module.__getitem__(key) + + @contextlib.contextmanager + def no_sync(self) -> Generator: + """ + A context manager to disable gradient synchronizations across ShardedModel + processes. Within this context, gradients will be accumulated on module + variables, which will later be synchronized in the first + forward-backward pass after exiting the context. + + .. note:: This likely results in higher memory usage because ShardedModel will + accumulate the full model gradients (instead of gradient shards) + until the eventual sync. + + .. note:: Gradient accumulation can be done without this context, + avoiding the extra GPU memory overhead, but with the extra + networking overhead. + """ + self._lazy_init() + assert self._is_root, "no_sync on inner ShardedModel is not supported" + self._assert_state(TrainingState.IDLE) + # This instance may wrap other ShardedModel instances and we + # need to set all of them to accumulate gradients. + old_flags = [] + for m in self.modules(): # includes self + if isinstance(m, ShardedModel): + old_flags.append((m, m._require_backward_grad_sync)) + m._require_backward_grad_sync = False + try: + yield + finally: + for m, old_flag in old_flags: + assert m._require_backward_grad_sync is False + m._require_backward_grad_sync = old_flag + + def _assert_state(self, state: Union[TrainingState, List[TrainingState]]) -> None: + """Assert we are in the given state.""" + # Since assert can be turned off and this error checking + # is really important, we use explicit error checking + # and raise a ValueError if needed. + if isinstance(state, TrainingState): + state = [state] + if self.training_state not in state: + msg = f"expected to be in states {state} but current state " f"is {self.training_state}" + # In case we are failing in the context of autograd hook, asserting + # may not generate useful msg. So, let's print it to be sure. 
+ self.logger.error(f'Zero3 instance {self} got error: {msg}', ranks=[0]) + if self.rank == 0: + traceback.print_stack() + raise ValueError(msg) + + def extra_repr(self) -> str: + repr = ( + f"world_size={self.world_size}, " + f"mixed_precision={self.mixed_precision}, " + ) + if self.verbose: + repr = ( + f"rank={self.rank}, " + repr + f"reshard_after_forward={self.reshard_after_forward}, " + f"compute_dtype={self.compute_dtype}, " + f"buffer_dtype={self.buffer_dtype}, " + f"fp32_reduce_scatter={self.fp32_reduce_scatter}, " + f"compute_device={self.compute_device}" + f"reduce_scatter_bucket_size_mb={self.reduce_scatter_bucket_size_mb}, " + f"clear_autocast_cache={self.clear_autocast_cache}" + f"force_input_to_fp32={self.force_input_to_fp32}" + f"offload_config={self.offload_config}" + ) + return repr + + def state_dict(self, destination=None, prefix='', keep_vars=False): + """ + Returns the whole (unsharded) state of the module. Parameters are not + sharded, so the resulting state_dict can be loaded directly by the + wrapped Module without any sharding-specific logic. Returned tensors + will be full precision (e.g., FP32). + + .. warning:: This needs to be called on all ranks, since synchronization + primitives will be used. + """ + if torch.cuda.is_available(): + torch.cuda.synchronize() + self._lazy_init() + + def maybe_cast_buffers(dtype: Optional[torch.dtype] = None) -> None: + if self.mixed_precision: + self._cast_buffers(dtype=dtype) + + assert self._return_full_state_dict is True, 'Only support return full state dict now' + if self.training_state != TrainingState.GATHER_FULL_PARAMS: + with self.gather_full_params(recurse=False, volatile=True): + maybe_cast_buffers(torch.float32) + state_dict = super().state_dict() + else: + maybe_cast_buffers(torch.float32) + state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) + + if self._cpu_offload: + for k, tensor in state_dict.items(): + state_dict[k] = tensor.cpu() + + # In case we are in mixed precision, restore buffers back to buffer_dtype. + maybe_cast_buffers() + return state_dict + + def load_state_dict( + self, state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], strict: bool = True + ) -> NamedTuple: + """ + Load a whole (unsharded) state_dict. + + .. warning:: This needs to be called on all ranks, since synchronization + primitives will be used. + """ + if self._return_full_state_dict: + with self.gather_full_params(): + return self.module.load_state_dict(state_dict, strict) + else: + torch.cuda.synchronize() + self._lazy_init() + return self.module.load_state_dict(state_dict, strict) + + +def _post_state_dict_hook( + state_dict_on_rank_0_only: bool, + module: Zero3ParameterManager, + state_dict: "OrderedDict[str, torch.Tensor]", + prefix: str, + *args: Any, +) -> "OrderedDict[str, torch.Tensor]": + # When state_dict_on_rank_0_only is ``True``, ``model.state_dict()`` will only + # returns full state dict on rank 0 and return empty dict non-rank 0, + # which allow ShardedModel to skip the GPU -> CPU copy on + # non-rank 0 altogether and prevent OOM. + if state_dict_on_rank_0_only and dist.get_rank() != 0: + state_dict.clear() + return state_dict + # Assuming we are in a ``gather_full_params()`` context, we need to clone + # each tensor so that it does not get freed (in-place) when the context + # exits. At the same time, this hook can be called multiple times + # recursively, so we need to make sure that we only clone each tensor at + # most once. 
Thus we add an attribute on the tensor called "_has_been_cloned"
+    # which keeps track of tensors that are no longer at risk of being freed.
+    for key in state_dict.keys():
+        if not key.startswith(prefix) or getattr(state_dict[key], "_has_been_cloned", False):
+            continue
+        if state_dict[key].device.type != module.state_dict_device.type:
+            state_dict[key] = state_dict[key].to(device=module.state_dict_device)
+            state_dict[key]._has_been_cloned = True
+        elif module.training_state == TrainingState.GATHER_FULL_PARAMS:
+            # We copy the state_dict since the full params will be freed after we
+            # exit the ``gather_full_params()`` context.
+            state_dict[key] = state_dict[key].clone()
+            state_dict[key]._has_been_cloned = True
+
+    # Remove "_zero3_module." prefix
+    replace_state_dict_prefix(state_dict, prefix + "_zero3_module.", prefix)
+    return state_dict
+
+
+def _pre_load_state_dict_hook(
+    state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], prefix: str, *args: Any
+) -> None:
+    replace_state_dict_prefix(state_dict, prefix, prefix + "_zero3_module.")
diff --git a/colossalai/zero/sharded_model/sharded_model_v2.py b/colossalai/zero/sharded_model/sharded_model_v2.py
new file mode 100644
index 000000000..0168d443e
--- /dev/null
+++ b/colossalai/zero/sharded_model/sharded_model_v2.py
@@ -0,0 +1,63 @@
+
+import contextlib
+import copy
+import functools
+import os
+import traceback
+from collections import OrderedDict
+from enum import Enum, auto
+from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
+                    Set, Union)
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.utils import get_current_device
+from torch.distributed import ProcessGroup
+from colossalai.engine.ophooks import register_ophooks_recursively, BaseOpHook, ShardParamHook
+from colossalai.zero.shard_param import ShardParam
+
+class ShardedModelV2(nn.Module):
+    def __init__(self,
+                 module: nn.Module,
+                 process_group: Optional[ProcessGroup] = None,
+                 reduce_scatter_process_group: Optional[ProcessGroup] = None
+                 ):
+        r"""
+        A demo that reconfigures the ZeRO-1 sharded model.
+        Optimizer states are not considered yet.
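+
+        Usage (rough sketch; ``net``, ``criterion`` and ``train_loader`` are placeholders)::
+
+            model = ShardedModelV2(net)
+            for data, label in train_loader:
+                out = model(data.cuda())
+                criterion(out, label.cuda()).backward()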
+ """ + super().__init__() + self.logger = get_dist_logger() + + self.process_group = process_group or gpc.get_group(ParallelMode.DATA) + self.reduce_scatter_process_group = reduce_scatter_process_group or self.process_group + self.world_size = dist.get_world_size(self.process_group) + self.rank = dist.get_rank(self.process_group) + + # The module has to be placed on GPU + self.module = module.cuda() + + # Shard the parameters at first + for _, param in self.module.named_parameters(): + param.ca_attr = ShardParam(param) + param.ca_attr.shard() + + # Register hooks + register_ophooks_recursively(self.module, [ShardParamHook()]) + + def forward(self, *args: Any, **kwargs: Any) -> torch.Tensor: + outputs = self.module(*args, **kwargs) + return outputs + + + def backward(self, loss): + if self.loss_scaler: + self.loss_scaler.backward(loss) + else: + loss.backward() + + \ No newline at end of file diff --git a/colossalai/zero/sharded_optim/__init__.py b/colossalai/zero/sharded_optim/__init__.py new file mode 100644 index 000000000..07681531c --- /dev/null +++ b/colossalai/zero/sharded_optim/__init__.py @@ -0,0 +1,3 @@ +from .sharded_optim import ShardedOptimizer + +__all__ = ['ShardedOptimizer'] \ No newline at end of file diff --git a/colossalai/zero/sharded_optim/_utils.py b/colossalai/zero/sharded_optim/_utils.py new file mode 100644 index 000000000..18dca5231 --- /dev/null +++ b/colossalai/zero/sharded_optim/_utils.py @@ -0,0 +1,288 @@ +import math +import torch +from torch._six import inf +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from colossalai.core import global_context as gpc +from colossalai.context import ParallelMode +from colossalai.utils import is_model_parallel_parameter +import torch.distributed as dist + + +def move_tensor(input_, device): + assert device in ['cpu', 'gpu'] + + if isinstance(input_, (list, tuple)): + for tensor in input_: + tensor.data = tensor.data.cpu( + ) if device == 'cpu' else tensor.data.cuda() + elif torch.is_tensor(input_): + input_.data = input_.data.cpu( + ) if device == 'cpu' else tensor.data.cuda() + else: + raise TypeError( + f"Expected argument 'input_' to be torch.Tensor, list or tuple, but got {type(input_)} " + ) + + +def flatten(input_): + return _flatten_dense_tensors(input_) + + +def unflatten(flat, tensors): + return _unflatten_dense_tensors(flat, tensors) + + +def count_numel(tensor_list): + res = 0 + for tensor in tensor_list: + res += tensor.numel() + return res + + +def calculate_padding(numel, unit_size): + remainder = numel % unit_size + return unit_size - remainder if remainder else remainder + + +def shuffle_by_round_robin(tensor_list, num_partitions): + partitions = dict() + + for tensor_idx, tensor in enumerate(tensor_list): + partition_to_go = tensor_idx % num_partitions + if partition_to_go not in partitions: + partitions[partition_to_go] = [] + partitions[partition_to_go].append(dict(tensor=tensor, + index=tensor_idx)) + + partitions_count = len(partitions) + new_tensor_list = [] + tensor_index_mapping = dict() + + for partition_id in range(partitions_count): + partition_tensors = partitions[partition_id] + for item in partition_tensors: + tensor_index_mapping[item['index']] = len(new_tensor_list) + new_tensor_list.append(item['tensor']) + + return new_tensor_list, tensor_index_mapping + + +# create a flat tensor aligned at the alignment boundary +def flatten_dense_tensors_with_padding(tensor_list, unit_size): + num_elements = count_numel(tensor_list) + padding = calculate_padding(num_elements, 
unit_size=unit_size) + + if padding > 0: + pad_tensor = torch.zeros(padding, + device=tensor_list[0].device, + dtype=tensor_list[0].dtype) + padded_tensor_list = tensor_list + [pad_tensor] + else: + padded_tensor_list = tensor_list + + return flatten(padded_tensor_list) + + +def is_nccl_aligned(tensor): + return tensor.data_ptr() % 4 == 0 + +def get_grad_accumulate_object(tensor): + """ + Return the AccumulateGrad of the input tensor + """ + + # grad_fn reference: + # https://discuss.pytorch.org/t/in-the-grad-fn-i-find-a-next-functions-but-i-dont-understand-the-meaning-of-the-attribute/24463 + # expand_as reference: https://pytorch.org/docs/stable/generated/torch.Tensor.expand.html#torch.Tensor.expand + # + # `next_functions` will return the backward graph where + # the first element is the AccumulateGrad of the leaf nodes. + # we want to get the AccumulateGrad of the input tensor instead of the leaf + # node in the whole computation graph. + # Therefore, we call expand_as to create a dummy graph + # where tensor_tmp and tensor indeed point to the same object. + # You can check this by print(tensor.data_ptr() == tensor_tmp.data_ptr()) + tensor_tmp = tensor.expand_as(tensor) + grad_acc_obj = tensor_tmp.grad_fn.next_functions[0][0] + return grad_acc_obj + + +def split_half_float_double(tensor_list): + dtypes = [ + "torch.cuda.HalfTensor", "torch.cuda.FloatTensor", + "torch.cuda.DoubleTensor", "torch.cuda.BFloat16Tensor" + ] + buckets = [] + for i, dtype in enumerate(dtypes): + bucket = [t for t in tensor_list if t.type() == dtype] + if bucket: + buckets.append(bucket) + return buckets + + +def reduce_tensor(tensor, + dtype, + dst_rank=None, + parallel_mode=ParallelMode.DATA): + """ + Reduce the tensor in the data parallel process group + + :param tensor: A tensor object to reduce/all-reduce + :param dtype: The data type used in communication + :param dst_rank: The source rank for reduce. If dst_rank is None, + all-reduce will be used instead of reduce. Default is None. + + :type tensor: torch.Tensor + :type dtype: torch.dtype + :type dst_rank: int, optional + """ + + # cast the data to specified dtype for reduce/all-reduce + if tensor.dtype != dtype: + tensor_to_reduce = tensor.to(dtype) + else: + tensor_to_reduce = tensor + + world_size = gpc.get_world_size(parallel_mode) + group = gpc.get_group(parallel_mode) + tensor_to_reduce.div_(world_size) + + # if rank is None, all reduce will be used + # else, reduce is used + use_all_reduce = dst_rank is None + + if use_all_reduce: + dist.all_reduce(tensor_to_reduce, group=group) + else: + ranks_in_group = gpc.get_ranks_in_group(parallel_mode) + global_rank = ranks_in_group[dst_rank] + dist.reduce(tensor=tensor_to_reduce, dst=global_rank, group=group) + + # recover the original dtype + if tensor.dtype != dtype and tensor is not tensor_to_reduce: + local_rank = gpc.get_local_rank(parallel_mode) + if use_all_reduce or dst_rank == local_rank: + tensor.copy_(tensor_to_reduce) + return tensor + +def has_inf_or_nan(tensor): + try: + # if tensor is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as tensor + # (which is true for some recent version of pytorch). + tensor_sum = float(tensor.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # tensor_sum = float(tensor.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. 
+ # If so, we still want the exception to propagate. + if "value cannot be converted" not in instance.args[0]: + raise + return True + else: + if tensor_sum == float('inf') or tensor_sum == -float( + 'inf') or tensor_sum != tensor_sum: + return True + return False + + +def release_param_grad(tensor_list): + for tensor in tensor_list: + tensor.grad = None + + +def calculate_global_norm_from_list(norm_list): + """ Compute total from a list of norms + """ + total_norm = 0.0 + for norm in norm_list: + total_norm += norm**2.0 + return math.sqrt(total_norm) + + +def compute_norm(gradients, + params, + dp_group, + mp_group, + norm_type=2): + """Clips gradient norm of an iterable of parameters. + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + Returns: + Total norm of the parameters (viewed as a single vector). + """ + + if mp_group is None: + mp_rank = 0 + else: + mp_rank = dist.get_rank(mp_group) + + norm_type = float(norm_type) + if norm_type == inf: + total_norm = max(g.data.abs().max() for g in gradients) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + dist.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.MAX, + group=dp_group) + + # Take max across all GPUs. + if mp_group is not None: + dist.all_reduce(tensor=total_norm_cuda, + op=torch.distributed.ReduceOp.MAX) + total_norm = total_norm_cuda[0].item() + else: + total_norm = 0.0 + # if dist.get_rank() == 0: + # logger.info(f"Total Norm beginning {total_norm}") + + for g, p in zip(gradients, params): + # Pipeline parallelism may replicate parameters. Avoid multi-counting. + if is_model_parallel_parameter(p) or mp_rank == 0: + param_norm = g.data.double().norm(2) + total_norm += param_norm.item()**2 + + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=dp_group) + + if mp_group is not None: + dist.all_reduce(tensor=total_norm_cuda, + op=torch.distributed.ReduceOp.SUM) + + total_norm = total_norm_cuda[0].item()**(1. / norm_type) + + if total_norm == float( + 'inf') or total_norm == -float('inf') or total_norm != total_norm: + total_norm = -1 + + return total_norm + + +def sync_param(flat_tensor, tensor_list): + """ + Synchronize the flattened tensor and unflattened tensor list. When + a list of tensor are flattened with `torch._utils._unflatten_dense_tensors`, + a new tensor is created. Thus, the flat tensor and original tensor list do not + share the same memory space. This function will update the tensor list so that + they point to the same value. 
+ + :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor lsit + :param tensor_list: A list of tensors corresponding to the flattened tensor + :type flat_tensor: torch.Tensor + :type tensor_list: List[torch.Tensor] + """ + updated_params = unflatten(flat_tensor, tensor_list) + + # update the tensor data + for p, q in zip(tensor_list, updated_params): + p.data = q.data diff --git a/colossalai/zero/sharded_optim/bookkeeping/__init__.py b/colossalai/zero/sharded_optim/bookkeeping/__init__.py new file mode 100644 index 000000000..a96c6b147 --- /dev/null +++ b/colossalai/zero/sharded_optim/bookkeeping/__init__.py @@ -0,0 +1,6 @@ +from .gradient_store import GradientStore +from .parameter_store import ParameterStore +from .bucket_store import BucketStore +from .tensor_bucket import TensorBucket + +__all__ = ['GradientStore', 'ParameterStore', 'BucketStore', 'TensorBucket'] \ No newline at end of file diff --git a/colossalai/zero/sharded_optim/bookkeeping/base_store.py b/colossalai/zero/sharded_optim/bookkeeping/base_store.py new file mode 100644 index 000000000..78cc0479b --- /dev/null +++ b/colossalai/zero/sharded_optim/bookkeeping/base_store.py @@ -0,0 +1,17 @@ +from colossalai.core import global_context as gpc +from colossalai.context import ParallelMode + + +class BaseStore: + + def __init__(self, dp_parallel_mode=ParallelMode.DATA): + self._world_size = gpc.get_world_size(dp_parallel_mode) + self._local_rank = gpc.get_local_rank(dp_parallel_mode) + + @property + def world_size(self): + return self._world_size + + @property + def local_rank(self): + return self._local_rank diff --git a/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py b/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py new file mode 100644 index 000000000..37f5a3b99 --- /dev/null +++ b/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py @@ -0,0 +1,43 @@ +from colossalai.core import global_context as gpc +from colossalai.context import ParallelMode +from .base_store import BaseStore + +class BucketStore(BaseStore): + + def __init__(self, dp_parallel_mode): + super().__init__(dp_parallel_mode) + self._grads = dict() + self._params = dict() + self._num_elements_in_bucket = dict() + + self.reset() + + def num_elements_in_bucket(self, reduce_rank: int = None): + return self._num_elements_in_bucket[reduce_rank] + + def add_num_elements_in_bucket(self, num_elements, reduce_rank: int = None): + self._num_elements_in_bucket[reduce_rank] += num_elements + + def add_grad(self, tensor, reduce_rank: int = None): + self._grads[reduce_rank].append(tensor) + + def add_param(self, tensor, reduce_rank: int = None): + self._params[reduce_rank].append(tensor) + + def reset(self): + keys = [None] + list(range(self._world_size)) + self._grads = {rank: [] for rank in keys} + self._params = {rank: [] for rank in keys} + self._num_elements_in_bucket = {rank: 0 for rank in keys} + + def reset_by_rank(self, reduce_rank=None): + self._grads[reduce_rank] = [] + self._params[reduce_rank] = [] + self._num_elements_in_bucket[reduce_rank] = 0 + + + def get_grad(self, reduce_rank: int = None): + return self._grads[reduce_rank] + + def get_param(self, reduce_rank: int = None): + return self._params[reduce_rank] diff --git a/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py b/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py new file mode 100644 index 000000000..0abcbc8c1 --- /dev/null +++ b/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py @@ -0,0 +1,66 @@ 
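BucketStore above is pure bookkeeping; the flushing policy lives further down in ShardedOptimizer (_reduce_and_remove_grads_by_bucket). A rough sketch of that flow, assuming a distributed context already initialized via colossalai.launch and a made-up REDUCE_BUCKET_SIZE threshold (ShardedOptimizer's own default is reduce_bucket_size=500000000), with the actual reduce call elided:

import torch
from colossalai.context import ParallelMode
from colossalai.zero.sharded_optim.bookkeeping import BucketStore

# Illustrative threshold only, chosen small so the flush path is easy to hit.
REDUCE_BUCKET_SIZE = 1024

bucket_store = BucketStore(ParallelMode.DATA)

def add_grad_for_reduction(param: torch.nn.Parameter, reduce_rank=None):
    # Flush the bucket first if this gradient would push it past the threshold.
    if bucket_store.num_elements_in_bucket(reduce_rank) + param.numel() > REDUCE_BUCKET_SIZE:
        grads = bucket_store.get_grad(reduce_rank)
        # ... reduce / all-reduce `grads` across the data parallel group here ...
        bucket_store.reset_by_rank(reduce_rank)

    # Bookkeep the gradient so the next flush picks it up.
    bucket_store.add_num_elements_in_bucket(param.numel(), reduce_rank)
    bucket_store.add_grad(param.grad, reduce_rank)
    bucket_store.add_param(param, reduce_rank)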
+from typing import List +from torch import Tensor +from .base_store import BaseStore + + +class GradientStore(BaseStore): + + def __init__(self, *args): + super().__init__(*args) + # bookkeeping data structures + self._averaged_gradients = dict() + + # for backward reduction hooks + self._grad_acc_objs = [] + + def add_accumulate_grad_object(self, obj): + """ + Keep :class:`AccumulateGrad` objects. If these objects are not kept, reduction hooks may not + be attached successfully. + + :param obj: An object of :class:`AccumulateGrad` class + :type obj: :class:`AccumulateGrad` + """ + + self._grad_acc_objs.append(obj) + + def get_averaged_gradients_by_group(self, group_id: int) -> List[Tensor]: + """ + Return average gradients of a parameter group + + :param group_id: The index of parameter group + :type group_id: int + + :return: Return the list of averaged gradients of a parameter group. Each element is a gradient, not a parameter. + :rtype: List[torch.Tensor] + """ + + return self._averaged_gradients[group_id] + + def add_average_gradient_by_group(self, group_id: int, tensor: Tensor) -> None: + """ + Append an average gradient to the list of averaged gradients of a parameter group + + :param group_id: The index of a parameter group + :param tensor: A :class:`torch.Tensor` object + :type group_id: int + :type tensor: torch.Tensor + + """ + + if group_id in self._averaged_gradients: + self._averaged_gradients[group_id].append(tensor) + else: + self._averaged_gradients[group_id] = [tensor] + + def reset_average_gradients_by_group(self, group_id: int) -> None: + """ + Reset the bookkeeping data structure for averaged gradients to an empty list + + :param group_id: The index of a parameter group + :type group_id: int + """ + + self._averaged_gradients[group_id] = [] + + diff --git a/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py b/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py new file mode 100644 index 000000000..6a7cf7513 --- /dev/null +++ b/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py @@ -0,0 +1,96 @@ +from .base_store import BaseStore +from torch import Tensor +from typing import List + + +class ParameterStore(BaseStore): + + def __init__(self, dp_paralle_mode): + super().__init__(dp_paralle_mode) + # param partitioning data structures + self._fp16_param_to_rank = dict() + self._rank_groupid_to_fp16_param_list = dict() + self._rank_group_id_to_flat_fp16_param = dict() + + # param reduction data structures + self._is_param_reduced = dict() + self._reduced_param = [] + + def set_param_to_rank(self, tensor: Tensor, rank: int) -> None: + """ + Set the mapping between parameter to rank, each parameter should be owned by a rank. + + :param tensor: A :class:`torch.Tensor` object + :type tensor: torch.Tensor + :param rank: The rank of which the process is responsible for updating the parameter + :type rank: int + """ + + self._fp16_param_to_rank[tensor] = rank + + def get_param_rank(self, tensor: Tensor) -> int: + """ + Gives the rank which the parameter belongs to + + :param tensor: A :class:`torch.Tensor` object + :type tensor: torch.Tensor + """ + return self._fp16_param_to_rank[tensor] + + def belongs_to_current_rank(self, tensor) -> bool: + """ + Check whether a parameter is supposed to be updated by the process of the current rank + + :param tensor: A :class:`torch.Tensor` object + :type tensor: torch.Tensor + + :return: True if the parameter should be updated by the current rank. Otherwise false. 
+        :rtype: bool
+        """
+
+        tensor_rank = self._fp16_param_to_rank[tensor]
+        return tensor_rank == self._local_rank
+
+    def add_fp16_param_list_by_rank_group(self, rank, group_id,
+                                          tensor_list) -> None:
+        if rank not in self._rank_groupid_to_fp16_param_list:
+            self._rank_groupid_to_fp16_param_list[rank] = dict()
+
+        if group_id not in self._rank_groupid_to_fp16_param_list[rank]:
+            self._rank_groupid_to_fp16_param_list[rank][group_id] = []
+
+        self._rank_groupid_to_fp16_param_list[rank][group_id].extend(
+            tensor_list)
+
+    def get_fp16_params_by_rank_group(self, rank, group_id) -> List[Tensor]:
+        return self._rank_groupid_to_fp16_param_list[rank][group_id]
+
+    def add_flat_fp16_param_by_rank_group(self, rank, group_id, tensor) -> None:
+        if rank not in self._rank_group_id_to_flat_fp16_param:
+            self._rank_group_id_to_flat_fp16_param[rank] = dict()
+
+        self._rank_group_id_to_flat_fp16_param[rank][group_id] = tensor
+
+    def get_flat_fp16_param_by_rank_group(self, rank, group_id) -> Tensor:
+        return self._rank_group_id_to_flat_fp16_param[rank][group_id]
+
+    def is_param_reduced(self, tensor):
+        return self._is_param_reduced[tensor]
+
+    def set_param_reduction_state(self, tensor, state):
+        self._is_param_reduced[tensor] = state
+
+    def get_param_reduction_states(self):
+        return self._is_param_reduced
+
+    def reset_previous_reduced_params(self):
+        self._reduced_param = []
+
+    def add_previous_reduced_param(self, tensor):
+        self._reduced_param.append(tensor)
+
+    def clear_grads_of_previous_reduced_params(self):
+        if len(self._reduced_param) > 0:
+            for param in self._reduced_param:
+                param.grad = None
+            self.reset_previous_reduced_params()
diff --git a/colossalai/zero/sharded_optim/bookkeeping/tensor_bucket.py b/colossalai/zero/sharded_optim/bookkeeping/tensor_bucket.py
new file mode 100644
index 000000000..c07f03263
--- /dev/null
+++ b/colossalai/zero/sharded_optim/bookkeeping/tensor_bucket.py
@@ -0,0 +1,54 @@
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+
+class TensorBucket:
+
+    def __init__(self, size):
+        self._max_size = size
+        self._current_size = 0
+        self._bucket = []
+
+    @property
+    def max_size(self):
+        return self._max_size
+
+    @property
+    def current_size(self):
+        return self._current_size
+
+    def is_full_or_oversized(self):
+        return self._current_size >= self._max_size
+
+    def is_empty(self):
+        return len(self._bucket) == 0
+
+    def add_to_bucket(self, tensor, allow_oversize=False):
+        tensor_size = tensor.numel()
+
+        if not allow_oversize and self.will_exceed_max_size(tensor_size):
+            msg = f"The param bucket max size {self._max_size} is exceeded " \
+                + f"by tensor (size {tensor_size})"
+            raise RuntimeError(msg)
+
+        self._bucket.append(tensor)
+        self._current_size += tensor_size
+
+    def will_exceed_max_size(self, tensor_size):
+        expected_size = self._current_size + tensor_size
+        return expected_size > self._max_size
+
+    def get_bucket(self):
+        return self._bucket
+
+    def empty(self):
+        self._bucket = []
+        self._current_size = 0
+
+    def flatten(self):
+        return _flatten_dense_tensors(self._bucket)
+
+    def unflatten_and_copy(self, flat_tensor):
+        unflattened_tensor_list = _unflatten_dense_tensors(
+            flat_tensor, self._bucket)
+        for old, new in zip(self._bucket, unflattened_tensor_list):
+            old.copy_(new)
diff --git a/colossalai/zero/sharded_optim/sharded_optim.py b/colossalai/zero/sharded_optim/sharded_optim.py
new file mode 100644
index 000000000..2be7a2808
--- /dev/null
+++ b/colossalai/zero/sharded_optim/sharded_optim.py
@@ -0,0 +1,568 @@
+from itertools import 
groupby +from colossalai.utils.cuda import get_current_device +import torch +import torch.distributed as dist +from colossalai.logging import get_dist_logger +from torch.optim import Optimizer +from .bookkeeping import ParameterStore, GradientStore, BucketStore, TensorBucket +from colossalai.context import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.amp.naive_amp._fp16_optimizer import DynamicGradScaler +from colossalai.nn.optimizer import ColossalaiOptimizer +from ._utils import (move_tensor, flatten, get_grad_accumulate_object, split_half_float_double, reduce_tensor, + release_param_grad, calculate_global_norm_from_list, compute_norm, sync_param, has_inf_or_nan) +from functools import partial + + +class ShardedOptimizer(ColossalaiOptimizer): + + def __init__( + self, + optimizer: Optimizer, + + # grad scaler config + initial_scale=2**32, + min_scale=1, + growth_factor=2, + backoff_factor=0.5, + growth_interval=1000, + hysteresis=2, + max_scale: int = 2**32, + + # grad clipping + clip_grad_norm=2.0, + verbose=False, + + # communication + reduce_bucket_size=500000000, + communication_dtype=torch.float16, + overlap_communication=False, + + # stage 2 + partition_grad=False, + + dp_parallel_mode=ParallelMode.DATA, + mp_parallel_mode=ParallelMode.MODEL, + + # cpu offload + cpu_offload=False): + + # TODO: add support for + # 1. fp16 master weights + # 2. contiguous gradients + # 3. cpu offload + # 4. support when some parameters requires_grad = False + + self._optimizer = optimizer + self._dtype = self._optimizer.param_groups[0]['params'][0].dtype + self._logger = get_dist_logger() + self._verbose = verbose + + # stage 2 + self._partition_grads = partition_grad + + # cpu_offload + self._cpu_offload = cpu_offload + + # get process groups + self._dp_parallel_mode = dp_parallel_mode + self._mp_parallel_mode = mp_parallel_mode + self._local_rank = gpc.get_local_rank(dp_parallel_mode) + self._world_size = gpc.get_world_size(dp_parallel_mode) + + self._dp_group = gpc.get_group(dp_parallel_mode) + if gpc.is_initialized(mp_parallel_mode) and gpc.get_world_size(mp_parallel_mode) > 1: + self._mp_group = gpc.get_group(mp_parallel_mode) + else: + self._mp_group = None + + # fp16 and fp32 params for mixed precision training + self._fp16_param_groups = dict() + self._fp32_flat_param_groups_of_current_rank = dict() + + # communication params + self._overlap_communication = overlap_communication + self._reduce_bucket_size = reduce_bucket_size + self._communication_dtype = communication_dtype + + # gradient scaler + self.grad_scaler = DynamicGradScaler(initial_scale=initial_scale, + min_scale=min_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + max_scale=max_scale, + verbose=verbose) + self._found_overflow = torch.FloatTensor([0]).to(get_current_device()) + + # gradient clipping + self._clip_grad_norm = clip_grad_norm + + # check argument conflict + self._sanity_checks() + + # ParameterStore will manage the tensor buffers used for zero + # it will not manage the tensors used by mixed precision training + self._param_store = ParameterStore(self._dp_parallel_mode) + self._grad_store = GradientStore(self._dp_parallel_mode) + self._bucket_store = BucketStore(self._dp_parallel_mode) + + # iterate over the param group in the optimizer + # partition these param groups for data parallel training + # and add buffers to parameter store for future access + for group_id, param_group in 
enumerate(self._optimizer.param_groups): + params = param_group['params'] + + # add the fp16 params to fp16_param_groups for bookkeeping + self._fp16_param_groups[group_id] = params + + # assign parameters to ranks + # the params in the list are sorted + params_per_rank = self._partition_param_list(params) + + # store the mapping between param to rank + # each param should belong to only one rank + for rank, params in enumerate(params_per_rank): + self._param_store.add_fp16_param_list_by_rank_group(rank, group_id, params) + for param in params: + self._param_store.set_param_to_rank(param, rank) + + # move to cpu to make room to create the flat tensor + move_tensor(params, device='cpu') + + # flatten the reordered tensors + for rank in range(self._world_size): + tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) + flat_tensor = flatten(tensor_list) + flat_tensor = flat_tensor.cuda() + self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) + + # sync parameters + for rank in range(self._world_size): + flat_tensor = self._param_store.get_flat_fp16_param_by_rank_group(rank, group_id) + tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) + sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + + # create a copy of fp32 weights of the parameters for which this rank is responsible + fp16_flat_current_rank = self._param_store.get_flat_fp16_param_by_rank_group(self._local_rank, group_id) + fp32_flat_current_rank = fp16_flat_current_rank.clone().float().detach() + device = 'cpu' if self._cpu_offload else get_current_device() + fp32_flat_current_rank = fp32_flat_current_rank.to(device) + fp32_flat_current_rank.requires_grad = True + self._fp32_flat_param_groups_of_current_rank[group_id] = fp32_flat_current_rank + + # need to replace the params in the `params` field in the optimizer + # so that when the optimizer calls step(), it only updates the tensors + # managed by this data parallel rank + param_group['params'] = [fp32_flat_current_rank] + + # set reduction state + for param in self._fp16_param_groups[group_id]: + self._param_store.set_param_reduction_state(param, False) + + # intialize communication stream for + # communication-compuation overlapping + if self._overlap_communication: + self._comm_stream = torch.cuda.Stream() + + # reduction hook is only used if overlapping communication + # or stage 2 is used + # if it is stage 1 without overlapping, no hook will be attached + if self._overlap_communication or self._partition_grads: + self._attach_reduction_hook() + + self._initialize_optimizer_states() + + @property + def loss_scale(self): + return self.grad_scaler.scale + + @property + def num_param_groups(self): + return len(self._fp16_param_groups) + + def _partition_param_list(self, param_list): + params_per_rank = [[] for _ in range(self._world_size)] + numel_per_rank = [0 for _ in range(self._world_size)] + + # partititon the parameters in a greedy fashion + sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True) + for param in sorted_params: + # allocate this parameter to the rank with + # the smallest numel for load balancing purpose + rank_to_go = numel_per_rank.index(min(numel_per_rank)) + params_per_rank[rank_to_go].append(param) + numel_per_rank[rank_to_go] += param.numel() + + if self._verbose: + self._logger.info(f'Number of elements on ranks: {numel_per_rank}', + ranks=[0], + parallel_mode=self._dp_parallel_mode) + return params_per_rank + + def 
_initialize_optimizer_states(self):
+        # create a dummy zero tensor which has the same shape as that of the param
+        # set this dummy zero tensor as grad
+        for group_id in range(len(self._fp32_flat_param_groups_of_current_rank)):
+            fp32_partition_param = self._fp32_flat_param_groups_of_current_rank[group_id]
+            fp32_partition_grad = torch.zeros_like(fp32_partition_param)
+            fp32_partition_param.grad = fp32_partition_grad
+
+        # update the parameter with zero gradients for initialization of optimizer states
+        self._optimizer.step()
+
+        # remove the grad of the parameter to save memory
+        for group_id, fp32_flat_tensor in self._fp32_flat_param_groups_of_current_rank.items():
+            fp32_flat_tensor.grad = None
+
+    def _sanity_checks(self):
+        assert torch.cuda.is_available(), 'CUDA is required'
+        assert self._dtype == torch.float16, \
+            f'Parameters are expected to be of type torch.float16, but got {self._dtype}'
+
+    ###########################################################
+    # Backward Reduction Hook
+    ###########################################################
+
+    def _attach_reduction_hook(self):
+        # we iterate over the fp16 params
+        # on each param, we register a hook to its AccumulateGrad object
+        for group_id in range(self.num_param_groups):
+            param_group = self._fp16_param_groups[group_id]
+            for param in param_group:
+                if param.requires_grad:
+                    # determines the reduction destination rank
+                    # this is only valid for stage 2
+                    # dst_rank = None means using all-reduce
+                    # else using reduce
+                    if self._partition_grads:
+                        reduce_rank = self._param_store.get_param_rank(param)
+                    else:
+                        reduce_rank = None
+
+                    def _define_and_attach(param, reduce_rank):
+                        # get the AccumulateGrad object of the param itself
+                        accum_grad_obj = get_grad_accumulate_object(param)
+                        self._grad_store.add_accumulate_grad_object(accum_grad_obj)
+
+                        reduction_func = partial(self._reduce_and_remove_grads_by_bucket,
+                                                 param=param,
+                                                 reduce_rank=reduce_rank)
+
+                        # define hook
+                        # NOT IMPORTANT BUT GOOD TO KNOW:
+                        # args here is not grad, but allow_unreachable and accumulate_grad
+                        def reduce_grad_hook(*args):
+                            reduction_func()
+                        accum_grad_obj.register_hook(reduce_grad_hook)
+
+                    _define_and_attach(param, reduce_rank)
+
+    def _reduce_and_remove_grads_by_bucket(self, param, reduce_rank=None):
+        param_size = param.numel()
+
+        # check if the bucket is full
+        # if full, will reduce the grads already in the bucket
+        # after reduction, the bucket will be empty
+        if self._bucket_store.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size:
+            self._reduce_grads_in_bucket(reduce_rank)
+
+        # the param must not be reduced to ensure correctness
+        is_param_reduced = self._param_store.is_param_reduced(param)
+        if is_param_reduced:
+            msg = f'Parameter of size ({param.size()}) has already been reduced, ' \
+                + 'duplicate reduction will lead to arithmetic incorrectness'
+            raise RuntimeError(msg)
+
+        # the param must have grad for reduction
+        assert param.grad is not None, f'Parameter of size ({param.size()}) has None grad, cannot be reduced'
+
+        self._bucket_store.add_num_elements_in_bucket(param_size, reduce_rank)
+        self._bucket_store.add_grad(param.grad, reduce_rank)
+        self._bucket_store.add_param(param, reduce_rank)
+
+    def _reduce_grads_in_bucket(self, reduce_rank=None):
+        # reduce grads
+        self._reduce_grads_by_rank(reduce_rank=reduce_rank,
+                                   grads=self._bucket_store.get_grad(reduce_rank=reduce_rank),
+                                   bucket_size=self._bucket_store.num_elements_in_bucket(reduce_rank))
+
+        # use communication stream if overlapping
+        # communication with computation
+        if self._overlap_communication:
+            stream = self._comm_stream
+        else:
+            stream = torch.cuda.current_stream()
+
+        with torch.cuda.stream(stream):
+            params_in_bucket = self._bucket_store.get_param(reduce_rank=reduce_rank)
+
+            for param in params_in_bucket:
+                # the is_param_reduced flag should be False, showing that
+                # this param is not reduced before calling self._reduce_grads_by_rank
+                is_param_reduced = self._param_store.is_param_reduced(param)
+
+                if is_param_reduced:
+                    msg = f'Parameter of size ({param.size()}) has been reduced, ' + \
+                        'duplicate reduction will lead to arithmetic incorrectness'
+                    raise RuntimeError(msg)
+
+                # update the flag
+                self._param_store.set_param_reduction_state(param, True)
+
+                # if partition grads = True
+                # we do not keep the gradient after reduction
+                if self._partition_grads and not self._param_store.belongs_to_current_rank(param):
+                    if self._overlap_communication:
+                        # we need to keep this gradient for now as reduction may
+                        # not be completed yet since it is using a different cuda stream
+                        self._param_store.add_previous_reduced_param(param)
+                    else:
+                        param.grad = None
+
+        self._bucket_store.reset_by_rank(reduce_rank)
+
+    def _reduce_grads_by_rank(self, reduce_rank, grads, bucket_size):
+        grad_buckets_by_dtype = split_half_float_double(grads)
+
+        for tensor_list in grad_buckets_by_dtype:
+            self._reduce_no_retain(tensor_list=tensor_list, bucket_size=bucket_size, reduce_rank=reduce_rank)
+
+    ##############################
+    # Reduction Utility Function #
+    ##############################
+    def _reduce_no_retain(self, tensor_list, bucket_size, reduce_rank):
+        param_bucket = TensorBucket(size=bucket_size)
+
+        for tensor in tensor_list:
+            param_bucket.add_to_bucket(tensor, allow_oversize=True)
+
+            if param_bucket.is_full_or_oversized():
+                self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
+                param_bucket.empty()
+
+        if not param_bucket.is_empty():
+            self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
+
+    def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
+        if self._overlap_communication:
+            torch.cuda.synchronize()
+            self._param_store.clear_grads_of_previous_reduced_params()
+            stream = self._comm_stream
+        else:
+            stream = torch.cuda.current_stream()
+
+        with torch.cuda.stream(stream):
+            flat = bucket.flatten()
+            reduced_flat = reduce_tensor(tensor=flat,
+                                         dtype=self._communication_dtype,
+                                         dst_rank=reduce_rank,
+                                         parallel_mode=self._dp_parallel_mode)
+
+            # update the reduced tensor
+            if reduce_rank is None or reduce_rank == self._local_rank:
+                bucket.unflatten_and_copy(reduced_flat)
+
+    ################################
+    # torch.optim.Optimizer methods
+    ################################
+
+    def backward(self, loss, retain_graph=True):
+        loss = self.loss_scale * loss
+        loss.backward(retain_graph=retain_graph)
+
+    def zero_grad(self, set_to_none=True):
+        """
+        Set parameter gradients to zero. If set_to_none = True, the gradient
+        will be set to None to save memory.
+
+        :param set_to_none: Whether to set the gradient to None. Default value is True.
+        :type set_to_none: bool
+        """
+        for group_id, param_group in self._fp16_param_groups.items():
+            for param in param_group:
+                if set_to_none:
+                    param.grad = None
+                else:
+                    if param.grad is not None:
+                        param.grad.detach_()
+                        param.grad.zero_()
+
+    ####################
+    # Update Parameter #
+    ####################
+
+    def step(self, closure=None):
+        assert closure is None, 'closure is not supported by step()'
+
+        # check for overflow
+        found_inf = self._check_overflow()
+        self.grad_scaler.update(found_inf)
+
+        # if overflow occurs, drop the accumulated gradients and skip this step
+        if found_inf:
+            self._grad_store._averaged_gradients = dict()
+            self.zero_grad()
+            return
+
+        # copy the grad of fp16 param to fp32 param
+        single_grad_partition_groups = []
+        norm_groups = []
+
+        for group_id in range(self.num_param_groups):
+            # compute norm
+            norm_group = compute_norm(gradients=self._grad_store._averaged_gradients[group_id],
+                                      params=self._param_store.get_fp16_params_by_rank_group(group_id=group_id,
+                                                                                             rank=self._local_rank),
+                                      dp_group=self._dp_group,
+                                      mp_group=self._mp_group)
+            norm_groups.append(norm_group)
+
+            # create flat gradient for the flat fp32 params
+            fp16_avg_grads = self._grad_store.get_averaged_gradients_by_group(group_id)
+            flat_fp16_avg_grads = flatten(fp16_avg_grads)
+
+            dtype = self._fp32_flat_param_groups_of_current_rank[group_id].dtype
+            flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype)
+
+            param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape
+            assert param_shape == flat_fp32_avg_grads.shape, \
+                f'fp32 param and grad have different shape {param_shape} vs {flat_fp32_avg_grads.shape}'
+
+            single_grad_partition_groups.append(flat_fp32_avg_grads)
+            device = self._fp32_flat_param_groups_of_current_rank[group_id].device
+            self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device)
+            self._grad_store._averaged_gradients[group_id] = []
+
+        # unscale and clip grads
+        global_norm = calculate_global_norm_from_list(norm_list=norm_groups)
+        self._unscale_and_clip_grads(single_grad_partition_groups, global_norm)
+
+        # update the parameters
+        self._optimizer.step()
+        # release the fp32 grad
+        release_param_grad(self._fp32_flat_param_groups_of_current_rank.values())
+
+        # update fp16 partition updated by the current rank
+        for group_id in range(len(self._fp16_param_groups)):
+            fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=self._local_rank, group_id=group_id)
+            fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id].to(fp16_param.device)
+            fp16_param.data.copy_(fp32_param)
+
+        # broadcast the updated model weights
+        handles = []
+        for group_id in range(self.num_param_groups):
+            for rank in range(self._world_size):
+                fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
+                handle = dist.broadcast(fp16_param, src=rank, group=self._dp_group, async_op=True)
+                handles.append(handle)
+
+        for handle in handles:
+            handle.wait()
+
+    ##################
+    # FP16 Utilities #
+    ##################
+
+    def _check_overflow(self):
+        # clear previous overflow record
+        self._found_overflow.fill_(0.0)
+
+        # check for overflow
+        for group_id in range(len(self._fp16_param_groups)):
+            for avg_grad in self._grad_store.get_averaged_gradients_by_group(group_id):
+                if avg_grad is not None and has_inf_or_nan(avg_grad):
+                    self._found_overflow.fill_(1.0)
+                    break
+
+        # all-reduce across dp group
+        dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, 
group=self._dp_group) + + # all-reduce over model parallel group + if self._mp_group: + dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=self._mp_group) + + if self._found_overflow.item() > 0: + return True + else: + return False + + def _unscale_and_clip_grads(self, grad_groups_flat, total_norm): + # compute combined scale factor for this group + combined_scale = self.loss_scale + + if self._clip_grad_norm > 0.: + # norm is in fact norm*scale + clip = ((total_norm / self.loss_scale) + 1e-6) / self._clip_grad_norm + if clip > 1: + combined_scale = clip * self.loss_scale + + for grad in grad_groups_flat: + grad.data.mul_(1. / combined_scale) + + ############################ + # Gradient Synchronization # + ############################ + + def sync_grad(self): + if not self._partition_grads: + self._reduce_grad_stage1() + else: + # TODO: support async comm in reduce + self._reduce_grad_stage2() + + # update param already reduced flag + reduction_states = self._param_store.get_param_reduction_states() + for tensor, state in reduction_states.items(): + reduction_states[tensor] = False + + # clear reduced grads + if self._overlap_communication: + torch.cuda.synchronize() + self._param_store.clear_grads_of_previous_reduced_params() + + # accumulate gradient + avg_gradients = self._grad_store._averaged_gradients + for group_id in range(self.num_param_groups): + param_group = self._param_store.get_fp16_params_by_rank_group(self._local_rank, group_id) + + if group_id not in avg_gradients: + avg_gradients[group_id] = [] + + param_idx = 0 + for param in param_group: + if param.grad is not None: + if len(avg_gradients[group_id]) == param_idx: + avg_gradients[group_id].append(param.grad) + else: + avg_gradients[group_id][param_idx].add_(param.grad) + param_idx += 1 + + # the gradients needed are stored in the avg_gradients buffer + # thus, can clear this + self.zero_grad() + + def _reduce_grad_stage1(self): + # if not overlapping communication (no reduction hook is attached) + # we need to manually reduce these gradients + if not self._overlap_communication: + for group_id in range(len(self._fp16_param_groups)): + param_group = self._fp16_param_groups[group_id] + for param in param_group: + if param.grad is not None: + self._reduce_and_remove_grads_by_bucket(param) + + # we need to reduce the gradients + # left in the communication bucket + self._reduce_grads_in_bucket() + + def _reduce_grad_stage2(self): + # when partition_grads is True, reduction hooks + # are attached in the __init__ function, so we + # only need to reduce the gradients + # left in the communication bucket + for reduce_rank in range(self._world_size): + self._reduce_grads_in_bucket(reduce_rank) diff --git a/colossalai/zero/zero_redundancy_optimizer_level_2.py b/colossalai/zero/zero_redundancy_optimizer_level_2.py deleted file mode 100644 index f022aaa6f..000000000 --- a/colossalai/zero/zero_redundancy_optimizer_level_2.py +++ /dev/null @@ -1,2347 +0,0 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team -''' - -import math - -import torch -import torch.distributed as dist - -try: - from deepspeed.git_version_info import version - from deepspeed.moe.utils import is_moe_param - from deepspeed.ops.adam import DeepSpeedCPUAdam - from deepspeed.ops.op_builder import UtilsBuilder - from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS -except ImportError: - pass -from packaging import version as pkg_version -from torch._six import inf -from torch.distributed.distributed_c10d import _get_global_rank -from 
torch.optim import Optimizer - -from colossalai.core import global_context as gpc -from colossalai.utils import report_memory_usage -from colossalai.utils.common import is_model_parallel_parameter -from .loss_scaler import LossScaler, DynamicLossScaler -from colossalai.context import ParallelMode - -# Toggle this to true to enable correctness test -# with gradient partitioning and without -pg_correctness_test = False - - -def input(msg): - return - - -def split_half_float_double(tensors): - dtypes = [ - "torch.cuda.HalfTensor", - "torch.cuda.FloatTensor", - "torch.cuda.DoubleTensor" - ] - buckets = [] - for i, dtype in enumerate(dtypes): - bucket = [t for t in tensors if t.type() == dtype] - if bucket: - buckets.append(bucket) - return buckets - - -def isclose(a, b, rtol=1e-09, atol=0.0): - return abs(a - b) <= max(rtol * max(abs(a), abs(b)), atol) - - -def lcm(x, y): - from fractions import gcd # or can import gcd from `math` in Python 3 - return x * y // gcd(x, y) - - -def get_alignment_padding(tensor_list, alignment): - num_elements = sum([tensor.numel() for tensor in tensor_list]) - remainder = num_elements % alignment - return (alignment - remainder) if remainder else remainder - - -def move_to_cpu(tensor_list): - for tensor in tensor_list: - tensor.data = tensor.data.cpu() - - -def print_rank_msg(msg): - print(f"rank {dist.get_rank()} - {msg}") - - -class ZeroRedundancyOptimizer_Level_2(Optimizer): - """ - ZeroRedundancyOptimizer_Level_2 designed to reduce the memory footprint - required for training large deep learning models. - - For more details please see ZeRO: Memory Optimization Towards Training A Trillion Parameter Models - https://arxiv.org/abs/1910.02054 - - """ - - def __init__(self, - init_optimizer, - dp_parallel_mode=ParallelMode.DATA, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=False, - contiguous_gradients=True, - reduce_bucket_size=500000000, - allgather_bucket_size=5000000000, - reduce_scatter=True, - overlap_comm=False, - cpu_offload=False, - clip_grad=0.0, - allreduce_always_fp32=False, - postscale_gradients=True, - gradient_predivide_factor=1.0, - gradient_accumulation_steps=1, - ignore_unused_parameters=True, - round_robin_gradients=False, - fp16_master_weights_and_gradients=False): - # mpu = None is removed from the parameter list - # tensor parallel will be automatically detected later - - # LSG: default arguments for compatibility - has_moe_layers = False - partition_grads = True - expert_parallel_group = None - expert_data_parallel_group = None - self.timers = None - self.defaults = init_optimizer.defaults - - dp_process_group = gpc.get_group(dp_parallel_mode) - if gpc.get_world_size(dp_parallel_mode) == 1: - partition_grads = False # for compatibility with dp size = 1 - - self.verbose = verbose - - if dist.get_rank() == 0 and self.verbose: - print(f"Reduce bucket size {reduce_bucket_size}") - print(f"Allgather bucket size {allgather_bucket_size}") - print(f"CPU Offload: {cpu_offload}") - print( - f'Round robin gradient partitioning: {round_robin_gradients}') - # The fused optimizer does all the work. We need this layer for two reason: - # 1. maintain same user API from apex.fp16_utils - # 2. keep common stuff here in case we need to add ne552w fused optimizer later - - # differences from apex.fp16_utils: - # - assume all model params in fp16 - # - assume all params requires grad - # - flat by groups, not keeping state. TODO: remove state explicitly? - # - master gard and unflat master weight never exist. 
TODO: a way to save out unflat master? - if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") - self.optimizer = init_optimizer - - # Load pre-built or JIT compile (un)flatten ops - util_ops = UtilsBuilder().load() - self.flatten = util_ops.flatten - self.unflatten = util_ops.unflatten - - # ZeRO stage 1 (False) or 2 (True) - self.partition_gradients = partition_grads - - self.reduce_scatter = reduce_scatter - - self.overlap_comm = overlap_comm - - self.cpu_offload = cpu_offload - - self.deepspeed_adam_offload = cpu_offload - - self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' - - self.dp_process_group = dp_process_group - - # expert parallel group - self.ep_process_group = expert_parallel_group - - # data parallel group for experts - self.expert_dp_process_group = expert_data_parallel_group - - # data parallel size for non-experts - dp_size = dist.get_world_size(group=self.dp_process_group) - - # For MoE models this maybe different for different param group - # It will be modified during MoE setup later in the init - self.real_dp_process_group = [ - dp_process_group for i in range(len(self.optimizer.param_groups)) - ] - self.partition_count = [dp_size for i in range( - len(self.optimizer.param_groups))] - - self.is_gradient_accumulation_boundary = True - - # CPU-Offload requires contiguous gradients - self.contiguous_gradients = contiguous_gradients or cpu_offload - - self.has_moe_layers = has_moe_layers - - if self.has_moe_layers: - self._configure_moe_settings() - - if not gpc.is_initialized(ParallelMode.TENSOR) or gpc.get_world_size(ParallelMode.TENSOR) == 1: - self.model_parallel_group = None - self.model_parallel_rank = 0 - else: - self.model_parallel_group = gpc.get_group(ParallelMode.TENSOR) - self.model_parallel_rank = gpc.get_local_rank(ParallelMode.TENSOR) - - self.overflow = False - self.clip_grad = clip_grad - self.allreduce_always_fp32 = allreduce_always_fp32 - self.gradient_predivide_factor = gradient_predivide_factor - self.postscale_gradients = postscale_gradients - self.gradient_accumulation_steps = gradient_accumulation_steps - self.micro_step_id = 0 - self.ignore_unused_parameters = ignore_unused_parameters - self.round_robin_gradients = round_robin_gradients - - self.extra_large_param_to_reduce = None - self.fp16_master_weights_and_gradients = fp16_master_weights_and_gradients - - if self.fp16_master_weights_and_gradients: - assert self.cpu_offload and type(self.optimizer) in [ - DeepSpeedCPUAdam], f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currenty only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. 
Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam" - - if self.reduce_scatter: - assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled" - - # param flattened by groups - self.fp16_groups = [] - self.fp16_groups_flat = [] - - # param partitioned by data parallel degree - # this will contain a list of equal sized tensors - # each of which will be updated by a different process - self.parallel_partitioned_fp16_groups = [] - - # a single 32-bit partition of the parallel partitioned parameters - # that this process will update - self.single_partition_of_fp32_groups = [] - - # param partition info - - # These are the parameters in each group that will not be updated by this process directly - self.params_not_in_partition = [] - - # These are the parameters that will be updated by this process directly - self.params_in_partition = [] - - # Offset from the first paramter in the the self.params_in_partition - # the parameter boundaries may not align with partition boundaries - # so we need to keep track of the offset - self.first_offset = [] - - # number of elements per partition in each group - self.partition_size = [] - - # align nccl all-gather send buffers to 4-bye boundary - # 4-byte alignment/sizeof(fp16) = 2 - self.nccl_start_alignment_factor = 2 - - assert ( - allgather_bucket_size % self.nccl_start_alignment_factor == 0), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} " - - self.all_reduce_print = False - self.dtype = self.optimizer.param_groups[0]['params'][0].dtype - - self.round_robin_fp16_groups = [] - self.round_robin_fp6_indices = [] - - # padding on each partition for alignment purposes - self.groups_padding = [] - # loop to deal with groups - for i, param_group in enumerate(self.optimizer.param_groups): - partition_id = dist.get_rank(group=self.real_dp_process_group[i]) - - # push this group to list before modify - # TODO: Explore simplification that avoids the extra book-keeping by pushing the reordered group - self.fp16_groups.append(param_group['params']) - - # Record padding required to align group to world size - if partition_id == dist.get_world_size( - group=self.real_dp_process_group[i]) - 1: - padding = get_alignment_padding(self.fp16_groups[i], - self.partition_count[i]) - else: - padding = 0 - self.groups_padding.append(padding) - - # not sure why apex was cloning the weights before flattening - # removing cloning here - - if self.verbose: - report_memory_usage(f"Before moving param group {i} to CPU") - # move all the parameters to cpu to free up GPU space for creating flat buffer - move_to_cpu(self.fp16_groups[i]) - if self.verbose: - report_memory_usage(f"After moving param group {i} to CPU") - - # Reorder group parameters for load balancing of gradient partitioning during backward among ranks. - # This ensures that gradients are reduced in a fashion such that ownership round robins among the ranks. - # For example, rather than 3 gradients (g_n+2, g_n+1, g_n) that are reduced consecutively belonging - # to the same rank, instead they will belong to 3 ranks (r_m+2, r_m+1, r_m). 
- if self.round_robin_gradients: - round_robin_tensors, round_robin_indices = self._round_robin_reorder( - self.fp16_groups[i], - dist.get_world_size(group=self.real_dp_process_group[i]) - ) - else: - round_robin_tensors = self.fp16_groups[i] - round_robin_indices = list(range(len(self.fp16_groups[i]))) - - self.round_robin_fp16_groups.append(round_robin_tensors) - self.round_robin_fp6_indices.append(round_robin_indices) - - # create flat buffer in CPU and move to GPU - self.fp16_groups_flat.append( - self.flatten_dense_tensors_aligned( - self.round_robin_fp16_groups[i], - self.nccl_start_alignment_factor * - dist.get_world_size(group=self.real_dp_process_group[i])).cuda( - torch.cuda.current_device())) - - if self.verbose: - report_memory_usage( - f"After flattening and moving param group {i} to GPU") - - if dist.get_rank(group=self.real_dp_process_group[i]) == 0: - report_memory_usage( - f"After Flattening and after emptying param group {i} cache") - - # set model fp16 weight to slices of flattened buffer - self._update_model_fp16_weights(i) - - # divide the flat weights into near equal partition equal to the data parallel degree - # each process will compute on a different part of the partition - data_parallel_partitions = self.get_data_parallel_partitions( - self.fp16_groups_flat[i], - i) - self.parallel_partitioned_fp16_groups.append( - data_parallel_partitions) - - # verify that data partition start locations are 4-byte aligned - for partitioned_data in data_parallel_partitions: - assert (partitioned_data.data_ptr() % - (2 * self.nccl_start_alignment_factor) == 0) - - # a partition of the fp32 master weights that will be updated by this process - if not fp16_master_weights_and_gradients: - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_fp16_groups[i][partition_id].to( - self.device).clone().float().detach()) - else: - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_fp16_groups[i][partition_id].to( - self.device).clone().half().detach()) - - # modify optimizer of have flat master weight - self.single_partition_of_fp32_groups[ - i].requires_grad = True # keep this in case internal optimizer uses it - param_group['params'] = [self.single_partition_of_fp32_groups[i]] - - partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size( - group=self.real_dp_process_group[i]) - params_in_partition, params_not_in_partition, first_offset = self.get_partition_info( - self.round_robin_fp16_groups[i], - partition_size, - partition_id) - - self.partition_size.append(partition_size) - self.params_in_partition.append(params_in_partition) - self.params_not_in_partition.append(params_not_in_partition) - self.first_offset.append(first_offset) - - for rank in range(dist.get_world_size()): - if dist.get_rank() == rank and self.verbose: - print( - f"Rank: {rank} partition count {self.partition_count} and sizes{[(p.numel(), self.is_moe_param_group[i] if hasattr(self, 'is_moe_param_group') else False) for i, p in enumerate(self.single_partition_of_fp32_groups)]} " - ) - dist.barrier() - # exit(0) - self.reduce_bucket_size = int(reduce_bucket_size) - self.allgather_bucket_size = int(allgather_bucket_size) - - self.reduction_event = torch.cuda.Event( - enable_timing=False, blocking=False) - self.reduction_stream = torch.cuda.Stream() - self.cpu_computation_stream = torch.cuda.Stream() - self.copy_grad_stream = torch.cuda.Stream() - self.callback_queued = False - - self.param_dict = {} - - # map between param_id and bool to specify if a param is in this 
partition - self.is_param_in_current_partition = {} - - self.grads_in_ipg_bucket = [] - self.params_in_ipg_bucket = [] - self.elements_in_ipg_bucket = 0 - self.params_already_reduced = [] - self._release_ipg_buffers() - self.previous_reduced_grads = None - self.ipg_bucket_has_moe_params = False - - # simplified param id - self.param_id = {} - - largest_param_numel = 0 - count = 0 - for i, params_group in enumerate(self.fp16_groups): - for param in params_group: - unique_id = id(param) - self.param_id[unique_id] = count - self.param_dict[count] = param - self.params_already_reduced.append(False) - if param.numel() > largest_param_numel: - largest_param_numel = param.numel() - count = count + 1 - - for param_group in self.params_in_partition: - for param in param_group: - self.is_param_in_current_partition[self.get_param_id( - param)] = True - - for param_group in self.params_not_in_partition: - for param in param_group: - self.is_param_in_current_partition[self.get_param_id( - param)] = False - - if self.cpu_offload: - self.accumulated_grads_in_cpu = {} - self.norm_for_param_grads = {} - self.local_overflow = False - self.grad_position = {} - self.temp_grad_buffer_for_cpu_offload = torch.zeros( - largest_param_numel, - device=self.device, - dtype=self.dtype).pin_memory() - self.temp_grad_buffer_for_gpu_offload = torch.zeros( - largest_param_numel, - device=torch.cuda.current_device(), - dtype=self.dtype) - - for i, params_group in enumerate(self.fp16_groups): - self.get_grad_position(i, - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i]) - - # mapping from parameter to partition that it belongs to - self.param_to_partition_ids = {} - - # stores if a partition has been reduced in this step - self.is_partition_reduced = {} - - # number of grads in partition that still need to be computed - self.remaining_grads_in_partition = {} - - # total number of grads in partition - self.total_grads_in_partition = {} - - # stores if a grad in a partition has been computed or not - self.is_grad_computed = {} - - # stores the offset at which a parameter gradient needs to be inserted in a partition - self.grad_partition_insertion_offset = {} - - # the offset in the gradient at which it must be inserted at the beginning of the partition - self.grad_start_offset = {} - - # will store the averaged gradients required by this partition - self.averaged_gradients = {} - - # store index of first parameter in each partition - self.first_param_index_in_partition = {} - - # initializes all data structures for implementing gradient partitioning - self.initialize_gradient_partitioning_data_structures() - - # resets the data structure value for the next backward propagation - self.reset_partition_gradient_structures() - - # creates backward hooks for gradient partitioning - if self.partition_gradients or self.overlap_comm: - self.create_reduce_and_remove_grad_hooks() - - # we may have a way of fusing dynamic scale. 
Do not support for now - if self.dtype == torch.float or not dynamic_loss_scale: - loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale - - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=loss_scale_value) - cur_iter = 0 - else: - if dynamic_loss_args is None: - self.loss_scaler = DynamicLossScaler() - else: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - - self.dynamic_loss_scale = True - - if self.verbose: - report_memory_usage("Before initializing optimizer states") - self.initialize_optimizer_states() - if self.verbose: - report_memory_usage("After initializing optimizer states") - - if dist.get_rank() == 0: - print(f"optimizer state initialized") - - if dist.get_rank(group=self.dp_process_group) == 0: - report_memory_usage(f"After initializing ZeRO optimizer") - - def _configure_moe_settings(self): - assert self.contiguous_gradients, "Contiguous Gradients in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE" - assert self.reduce_scatter, "Reduce Scatter in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE" - - def is_moe_group(group): - return 'moe' in group and group['moe'] - - assert any([is_moe_group(group) for group in - self.optimizer.param_groups]), "The model has moe layers, but None of the param groups are marked as MoE. Create a param group with 'moe' key set to True before creating optimizer" - self.is_moe_param_group = [] - for i, group in enumerate(self.optimizer.param_groups): - if is_moe_group(group): - assert all( - [is_moe_param(param) for param in group['params']]), "All params in MoE group must be MoE params" - self.real_dp_process_group[i] = self.expert_dp_process_group - self.partition_count[i] = dist.get_world_size( - group=self.expert_dp_process_group) - self.is_moe_param_group.append(True) - else: - self.is_moe_param_group.append(False) - - assert self.expert_dp_process_group is not None, "Expert data parallel group should be configured with MoE" - assert self.ep_process_group is not None, "Expert parallel group should be configured with MoE" - - def _update_model_fp16_weights(self, group_index): - updated_params = self.unflatten(self.fp16_groups_flat[group_index], - self.round_robin_fp16_groups[group_index]) - for p, q in zip(self.round_robin_fp16_groups[group_index], updated_params): - p.data = q.data - - # set model fp16 weight to slices of reordered flattened buffer - for param_index, param in enumerate(self.fp16_groups[group_index]): - new_index = self.round_robin_fp6_indices[group_index][param_index] - param.data = self.round_robin_fp16_groups[group_index][new_index].data - - def _round_robin_reorder(self, tensor_list, num_partitions): - - # disable round robin if need to debug something - # return tensor_list, list(range(len(tensor_list))) - - partition_tensors = {} - - for i, tensor in enumerate(tensor_list): - j = i % num_partitions - if not j in partition_tensors: - partition_tensors[j] = [] - partition_tensors[j].append((i, tensor)) - - reordered_tensors = [] - reordered_indices = {} - - for partition_index in partition_tensors.keys(): - for i, (original_index, tensor) in enumerate(partition_tensors[partition_index]): - reordered_indices[original_index] = len(reordered_tensors) - reordered_tensors.append(tensor) - - return reordered_tensors, reordered_indices - - def _release_ipg_buffers(self): - if self.contiguous_gradients: - self.ipg_buffer = None - self.grads_in_partition = None - self.grads_in_partition_offset = 0 - - def 
initialize_optimizer_states(self): - - for i, group in enumerate(self.fp16_groups): - single_grad_partition = torch.zeros( - int(self.partition_size[i]), - dtype=self.single_partition_of_fp32_groups[i].dtype, - device=self.device) - self.single_partition_of_fp32_groups[ - i].grad = single_grad_partition.pin_memory( - ) if self.cpu_offload else single_grad_partition - - self.optimizer.step() - - if not self.cpu_offload: - for group in self.single_partition_of_fp32_groups: - group.grad = None # class init - - return - - ######################################################################### - #################### ZeRO Stage 1 - reduce gradients #################### - ######################################################################### - - def reduce_gradients(self, pipeline_parallel=False): - world_size = dist.get_world_size(self.dp_process_group) - my_rank = dist.get_rank(self.dp_process_group) - - # with PP we must create ipg buffer, since backward is handled outside zero - if pipeline_parallel and self.contiguous_gradients: - self.ipg_buffer = [] - buf_0 = torch.empty(int(self.reduce_bucket_size), - dtype=self.dtype, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_0) - self.ipg_index = 0 - - if not self.overlap_comm: - for i, group in enumerate(self.fp16_groups): - for param in group: - if param.grad is not None: - self.reduce_ready_partitions_and_remove_grads(param, i) - - # reduce any pending grads in either hook/non-hook case - self.overlapping_partition_gradients_reduce_epilogue() - - ######################################################################### - #########################ZeRO Partition Gradients######################## - ######################################################################### - - def get_first_param_index(self, group_id, param_group, partition_id): - for index, param in enumerate(param_group): - param_id = self.get_param_id(param) - if partition_id in self.param_to_partition_ids[group_id][param_id]: - return index - return None - - def initialize_gradient_partitioning_data_structures(self): - - for i, param_group in enumerate(self.round_robin_fp16_groups): - - total_partitions = dist.get_world_size( - group=self.real_dp_process_group[i]) - - self.param_to_partition_ids[i] = {} - self.is_partition_reduced[i] = {} - self.total_grads_in_partition[i] = {} - self.remaining_grads_in_partition[i] = {} - self.is_grad_computed[i] = {} - self.grad_partition_insertion_offset[i] = {} - self.grad_start_offset[i] = {} - self.first_param_index_in_partition[i] = {} - - for partition_id in range(total_partitions): - self.is_grad_computed[i][partition_id] = {} - self.grad_partition_insertion_offset[i][partition_id] = {} - self.grad_start_offset[i][partition_id] = {} - self.total_grads_in_partition[i][partition_id] = 0 - self.initialize_gradient_partition( - i, param_group, partition_id) - self.is_partition_reduced[i][partition_id] = False - self.first_param_index_in_partition[i][ - partition_id] = self.get_first_param_index( - i, - param_group, - partition_id) - - def independent_gradient_partition_epilogue(self): - if self.verbose: - self.report_ipg_memory_usage( - f"In ipg_epilogue before reduce_ipg_grads", 0) - self.reduce_ipg_grads() - if self.verbose: - self.report_ipg_memory_usage( - f"In ipg_epilogue after reduce_ipg_grads", 0) - - # if dist.get_rank() == 0: - # print()("Params already reduced %s", self.params_already_reduced) - for i in range(len(self.params_already_reduced)): - self.params_already_reduced[i] = False - - if 
self.overlap_comm: - torch.cuda.synchronize() - # It is safe to clear previously reduced grads of other partitions - self._clear_previous_reduced_grads() - - if self.cpu_offload is False: - for i, _ in enumerate(self.fp16_groups): - - if not i in self.averaged_gradients or self.averaged_gradients[i] is None: - self.averaged_gradients[i] = self.get_flat_partition( - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=self.dtype, - device=torch.cuda.current_device(), - return_tensor_list=True) - else: - avg_new = self.get_flat_partition(self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=self.dtype, - device=torch.cuda.current_device(), - return_tensor_list=True) - - for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i], avg_new): - accumulated_grad.add_(new_avg_grad) - - self._release_ipg_buffers() - - # No need to keep the gradients anymore. - # All gradients required by the step - # are in self.averaged_gradients - self.zero_grad() - - if self.verbose: - report_memory_usage(f"End ipg_epilogue") - - # resets all partition to no reduced - # sets remaining grads to the total number of grads in each partition - # set is grad computed to false for all grads in partition - def reset_partition_gradient_structures(self): - for i, _ in enumerate(self.fp16_groups): - total_partitions = dist.get_world_size( - group=self.real_dp_process_group[i]) - for partition_id in range(total_partitions): - self.is_partition_reduced[i][partition_id] = False - self.remaining_grads_in_partition[i][ - partition_id] = self.total_grads_in_partition[i][partition_id] - - for param_id in self.is_grad_computed[i][partition_id]: - self.is_grad_computed[i][partition_id][param_id] = False - - def initialize_gradient_partition(self, i, param_group, partition_id): - def set_key_value_list(dictionary, key, value): - if key in dictionary: - dictionary[key].append(value) - else: - dictionary[key] = [value] - - def increment_value(dictionary, key): - if key in dictionary: - dictionary[key] += 1 - else: - dictionary[key] = 1 - - partition_size = self.partition_size[i] - - start_index = partition_size * partition_id - end_index = partition_size * (partition_id + 1) - - current_index = 0 - first_offset = 0 - - for param in param_group: - - param_size = param.numel() - param_id = self.get_param_id(param) - - if (current_index >= start_index and current_index < end_index): - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) - increment_value(self.total_grads_in_partition[i], partition_id) - - self.is_grad_computed[i][partition_id][param_id] = False - - self.grad_partition_insertion_offset[i][partition_id][ - param_id] = current_index - start_index - self.grad_start_offset[i][partition_id][param_id] = 0 - - elif start_index > current_index and start_index < (current_index + - param_size): - assert ( - first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" - first_offset = start_index - current_index - - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) - increment_value(self.total_grads_in_partition[i], partition_id) - - self.is_grad_computed[i][partition_id][param_id] = False - - self.grad_partition_insertion_offset[i][partition_id][param_id] = 0 - self.grad_start_offset[i][partition_id][param_id] = first_offset - - current_index = current_index + param_size - - def overlapping_partition_gradients_reduce_epilogue(self): - 
self.independent_gradient_partition_epilogue() - - def create_reduce_and_remove_grad_hooks(self): - self.grad_accs = [] - for i, param_group in enumerate(self.fp16_groups): - for param in param_group: - if param.requires_grad: - def wrapper(param, i): - param_tmp = param.expand_as(param) - grad_acc = param_tmp.grad_fn.next_functions[0][0] - - def reduce_partition_and_remove_grads(*notneeded): - self.reduce_ready_partitions_and_remove_grads( - param, i) - - grad_acc.register_hook( - reduce_partition_and_remove_grads) - self.grad_accs.append(grad_acc) - - wrapper(param, i) - - def get_param_id(self, param): - unique_id = id(param) - return self.param_id[unique_id] - - def report_ipg_memory_usage(self, tag, param_elems): - elem_count = self.elements_in_ipg_bucket + param_elems - percent_of_bucket_size = ( - 100.0 * elem_count) // self.reduce_bucket_size - if self.verbose: - report_memory_usage( - f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}" - ) - - # create a flat tensor aligned at the alignment boundary - def flatten_dense_tensors_aligned(self, tensor_list, alignment): - num_elements = 0 - for tensor in tensor_list: - num_elements = num_elements + tensor.numel() - - remaining = num_elements % alignment - - if remaining: - elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - padded_tensor_list = tensor_list + [pad_tensor] - - num_elements = num_elements + elements_to_add - else: - padded_tensor_list = tensor_list - - return self.flatten(padded_tensor_list) - - ############### Independent Partition Gradient ######################## - def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): - if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: - self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", - param.numel()) - self.reduce_ipg_grads() - if self.contiguous_gradients and self.overlap_comm: - # Swap ipg_index between 0 and 1 - self.ipg_index = 1 - self.ipg_index - - self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", - param.numel()) - - param_id = self.get_param_id(param) - assert self.params_already_reduced[param_id] == False, \ - f"The parameter {param_id} has already been reduced. \ - Gradient computed twice for this partition. 
\ - Multiple gradient reduction is currently not supported" - - if param.numel() > self.reduce_bucket_size: - self.extra_large_param_to_reduce = param - - elif self.contiguous_gradients: - # keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening - new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow( - 0, - self.elements_in_ipg_bucket, - param.numel()) - new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) - - self.elements_in_ipg_bucket += param.numel() - - assert param.grad is not None, f"rank {dist.get_rank()} - Invalid to reduce Param {param_id} with None gradient" - - self.grads_in_ipg_bucket.append(param.grad) - self.params_in_ipg_bucket.append((i, param, param_id)) - - # make sure the average tensor function knows how to average the gradients - if is_moe_param(param): - self.ipg_bucket_has_moe_params = True - - self.report_ipg_memory_usage("End ipg_remove_grads", 0) - - def print_rank_0(self, message): - if dist.get_rank() == 0 and self.verbose: - print(message) - - def gradient_reduction_w_predivide(self, tensor): - - dp_world_size = dist.get_world_size(group=self.dp_process_group) - - tensor_to_allreduce = tensor - - if self.allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - if self.postscale_gradients: - if self.gradient_predivide_factor != 1.0: - tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor) - - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - - if self.gradient_predivide_factor != dp_world_size: - tensor_to_allreduce.mul_( - self.gradient_predivide_factor / dp_world_size) - else: - tensor_to_allreduce.div_(dp_world_size) - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - - if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: - tensor.copy_(tensor_to_allreduce) - - return tensor - - def average_tensor(self, tensor): - if self.overlap_comm: - torch.cuda.synchronize() - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - with torch.cuda.stream(stream): - if not self.reduce_scatter: - self.gradient_reduction_w_predivide(tensor) - return - - # Accumulate destination ranks and bucket offsets for each gradient slice. - # Note: potential future optimization, record access pattern of parameters - # in backward pass and partition gradients w.r.t. access pattern so that our - # bucket is guaranteed to be contiguous w.r.t. 
ranks - rank_and_offsets = [] - real_dp_process_group = [] - curr_size = 0 - prev_id = -1 - - process_group = self.dp_process_group - # count = 0 - for i, param, param_id in self.params_in_ipg_bucket: - - process_group = self.dp_process_group - # Averages gradients at parameter level if ipg has a moe param - # Otherwise averaging is done at the entire buffer level at the end of the loop - if self.ipg_bucket_has_moe_params: - process_group = self.expert_dp_process_group if is_moe_param( - param) else self.dp_process_group - param.grad.data.div_( - dist.get_world_size(group=process_group)) - - partition_ids = self.param_to_partition_ids[i][param_id] - partition_size = self.partition_size[i] - # Get all partition ids + their offsets - partition_ids_w_offsets = [] - for partition_id in partition_ids: - offset = self.grad_start_offset[i][partition_id][param_id] - partition_ids_w_offsets.append((partition_id, offset)) - partition_ids_w_offsets.sort(key=lambda t: t[1]) - - # Calculate rank and offsets for grad slices - for idx in range(len(partition_ids_w_offsets)): - partition_id, offset = partition_ids_w_offsets[idx] - - # if dist.get_rank() == 0 and count < 100: - # print(f"Rank {dist.get_rank()} rank offet id {idx} calculated dp size {dist.get_world_size(group=process_group)} real dp size {dist.get_world_size(self.real_dp_process_group[i])} and dst: {partition_id}") - # count += 1 - - # Calculate numel for grad slice depending on partition location - if idx == len(partition_ids_w_offsets) - 1: - # Last partition_id uses its own offset - numel = param.numel() - offset - else: - # Set numel to next partition's offset - numel = partition_ids_w_offsets[idx + 1][1] - offset - - # Merge bucket ranges if they belong to the same rank - if partition_id == prev_id: - prev_pid, prev_size, prev_numel = rank_and_offsets[-1] - rank_and_offsets[-1] = (prev_pid, - prev_size, prev_numel + numel) - else: - rank_and_offsets.append( - (partition_id, curr_size, numel)) - real_dp_process_group.append(process_group) - curr_size += numel - prev_id = partition_id - - if not self.ipg_bucket_has_moe_params: - tensor.div_(dist.get_world_size(group=self.dp_process_group)) - - async_handles = [] - for i, (dst, bucket_offset, numel) in enumerate(rank_and_offsets): - grad_slice = tensor.narrow(0, int(bucket_offset), int(numel)) - # if dist.get_rank() == 0: - # print(f"Rank {dist.get_rank()} rank offet id {i} real dp size {dist.get_world_size(group=real_dp_process_group[i])} and dst: {dst}") - # dist.barrier() - # dist.barrier() - dst_rank = _get_global_rank(real_dp_process_group[i], dst) - async_handle = dist.reduce(grad_slice, - dst=dst_rank, - group=real_dp_process_group[i], - async_op=True) - async_handles.append(async_handle) - - for handle in async_handles: - handle.wait() - - ############################################################################## - ############################# CPU Offload Methods############################# - ############################################################################## - def get_grad_position(self, group_id, tensor_list, first_offset, partition_size): - current_offset = 0 - - for i, tensor in enumerate(tensor_list): - param_id = self.get_param_id(tensor) - param_start_offset = 0 - - num_elements = tensor.numel() - tensor_offset = 0 - - # we need to offset to get to the right element - if i == 0 and first_offset > 0: - tensor_offset = first_offset - num_elements = num_elements - tensor_offset - param_start_offset = first_offset - - # we dont need all elements of the tensor - 
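average_tensor above emulates a reduce-scatter by hand: the flat bucket is pre-divided by the data-parallel world size, and each contiguous slice is then reduced (not all-reduced) to the rank that owns that slice, asynchronously. A hedged sketch of the pattern; rank_and_offsets entries are (destination rank, offset, numel) as computed above, and the destination is assumed to already be a global rank:

import torch.distributed as dist

def reduce_bucket_to_owners(flat_grads, rank_and_offsets, group):
    # divide once up front so the summed reduction yields an average
    flat_grads.div_(dist.get_world_size(group=group))
    handles = []
    for dst_rank, offset, numel in rank_and_offsets:
        grad_slice = flat_grads.narrow(0, int(offset), int(numel))
        handles.append(dist.reduce(grad_slice, dst=dst_rank, group=group, async_op=True))
    for handle in handles:
        handle.wait()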
if num_elements > (partition_size - current_offset): - num_elements = partition_size - current_offset - - self.grad_position[param_id] = [ - int(group_id), - int(param_start_offset), - int(current_offset), - int(num_elements) - ] - current_offset += num_elements - - def update_overflow_tracker_for_param_grad(self, param): - if param.grad is not None and self._has_inf_or_nan(param.grad.data): - self.local_overflow = True - - def async_accumulate_grad_in_cpu_via_gpu(self, param): - param_id = self.get_param_id(param) - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - # copy to a preexisiting buffer to avoid memory allocation penalty - dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( - 0, - 0, - param.numel()) - - # buffer for storing gradients for this parameter in CPU - def buffer_to_accumulate_to_in_cpu(): - if not self.fp16_master_weights_and_gradients: - return torch.zeros(param.numel(), - dtype=param.dtype, - device=self.device).pin_memory() - else: - return self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( - 0, - dest_offset, - num_elements) - - # accumulate gradients into param.grad or parts of it that belongs to this parittion - def accumulate_gradients(): - if not self.fp16_master_weights_and_gradients: - dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), - non_blocking=True) - param.grad.data.view(-1).add_(dest_buffer) - else: - dest_buffer.narrow(0, - source_offset, - num_elements).copy_( - self.accumulated_grads_in_cpu[param_id].view(-1), - non_blocking=True) - param.grad.data.view(-1).narrow( - 0, - source_offset, - num_elements).add_(dest_buffer.narrow(0, - source_offset, - num_elements)) - - # move accumulated gradients back to CPU - def copy_gradients_to_cpu(): - if not self.fp16_master_weights_and_gradients: - self.accumulated_grads_in_cpu[param_id].data.copy_( - param.grad.data.view(-1), - non_blocking=True) - else: - self.accumulated_grads_in_cpu[param_id].data.copy_( - param.grad.data.view(-1).narrow(0, - source_offset, - num_elements), - non_blocking=True) - - if param_id not in self.accumulated_grads_in_cpu: - self.accumulated_grads_in_cpu[param_id] = buffer_to_accumulate_to_in_cpu( - ) - - if self.micro_step_id > 0: - accumulate_gradients() - - # at the boundary we will send 32bit directly - if not self.is_gradient_accumulation_boundary: - copy_gradients_to_cpu() - - def set_norm_for_param_grad(self, param): - param_id = self.get_param_id(param) - accumulated_grad = self.accumulated_grads_in_cpu[ - param_id] if self.gradient_accumulation_steps > 1 else param.grad - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - start = source_offset - accumulated_grad = accumulated_grad.view( - -1).narrow(0, start, num_elements) - - self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm( - 2) - - def set_norm_for_param_grad_in_gpu(self, param): - param_id = self.get_param_id(param) - accumulated_grad = param.grad - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - start = source_offset - accumulated_grad = accumulated_grad.view( - -1).narrow(0, start, num_elements) - - self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm( - 2) - - def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param): - param_id = self.get_param_id(param) - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( - 0, - 
dest_offset, - num_elements) - - src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements) - if not self.fp16_master_weights_and_gradients: - src_tensor = src_tensor.float() - - dest_tensor.copy_(src_tensor, non_blocking=True) - param.grad = None # offload only - - def complete_grad_norm_calculation_for_cpu_offload(self, params): - total_norm = 0.0 - norm_type = 2.0 - for p in params: - if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_id = self.get_param_id(p) - # as some model have trainable parameters but skipped in training, - # their backward hooks in self.create_reduce_and_remove_grad_hooks() will not run, - # so they have no norm_for_param_grads - if param_id in self.norm_for_param_grads: - param_norm = self.norm_for_param_grads[param_id] - total_norm += param_norm.item() ** 2 - else: - # As unused parameters in modules may not be expected sometimes, - # add an explicit error msg when it occurred and an option to - # avoid the error - assert self.ignore_unused_parameters, """ - This assert indicates that your module has parameters that - were not used in producing loss. - You can avoid this assert by - (1) enable ignore_unused_parameters option in zero_optimization config; - (2) making sure all trainable parameters and `forward` function - outputs participate in calculating loss. - """ - - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) - - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) - - total_norm = total_norm_cuda[0].item() ** (1. / norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - ############################################################################################ - - def copy_grads_in_partition(self, param): - if self.cpu_offload: - - if self.gradient_accumulation_steps > 1: - self.async_accumulate_grad_in_cpu_via_gpu(param) - - if self.is_gradient_accumulation_boundary: - self.set_norm_for_param_grad_in_gpu(param) - - self.update_overflow_tracker_for_param_grad(param) - - self.async_inplace_copy_grad_to_fp32_buffer_from_gpu(param) - - return - # print(f"ID {self.get_param_id(param)} grad norm {param.grad.norm()}") - if self.grads_in_partition is None: - self.grads_in_partition_offset = 0 - total_size = 0 - for group in self.params_in_partition: - for param_in_partition in group: - total_size += param_in_partition.numel() - - if self.verbose: - report_memory_usage( - f"before copying {total_size} gradients into partition") - self.grads_in_partition = torch.empty(int(total_size), - dtype=self.dtype, - device=torch.cuda.current_device()) - - if self.verbose: - report_memory_usage( - f"after copying {total_size} gradients into partition") - - # The allreduce buffer will be rewritted. 
Copy the gradients in partition to a new buffer - new_grad_tensor = self.grads_in_partition.view(-1).narrow( - 0, - self.grads_in_partition_offset, - param.numel()) - new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) - # print(f"Grad norm after copy to contiguous_buffer {param.grad.data.norm()}") - self.grads_in_partition_offset += param.numel() - - def reduce_ipg_grads(self): - if self.contiguous_gradients: - if self.extra_large_param_to_reduce is not None: - assert len( - self.params_in_ipg_bucket) == 1, "more than 1 param in ipg bucket, this shouldn't happen" - _, _, param_id = self.params_in_ipg_bucket[0] - assert self.get_param_id( - self.extra_large_param_to_reduce) == param_id, "param in ipg bucket does not match extra-large param" - self.average_tensor( - self.extra_large_param_to_reduce.grad.view(-1)) - self.extra_large_param_to_reduce = None - else: - self.average_tensor(self.ipg_buffer[self.ipg_index]) - else: - self.buffered_reduce_fallback( - None, - self.grads_in_ipg_bucket, - elements_per_buffer=self.elements_in_ipg_bucket) - - if self.overlap_comm: - stream = self.reduction_stream - elif self.cpu_offload: - # TODO: copy_grad_stream is disabled because of race with reduce. This hurts perf and should be fixed. - # torch.cuda.synchronize() - # stream = self.copy_grad_stream - stream = torch.cuda.current_stream() - else: - stream = torch.cuda.current_stream() - - with torch.cuda.stream(stream): - for _, param, param_id in self.params_in_ipg_bucket: - - assert self.params_already_reduced[param_id] == False, \ - f"The parameter {param_id} has already been reduced. \ - Gradient computed twice for this partition. \ - Multiple gradient reduction is currently not supported" - - self.params_already_reduced[param_id] = True - - if self.partition_gradients: - if not self.is_param_in_current_partition[param_id]: - if self.overlap_comm and self.contiguous_gradients is False: - # Clear grads of other partitions during the next reduction - # to avoid clearing them before the reduction is complete. 
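copy_grads_in_partition above packs every reduced gradient owned by this rank into a single pre-allocated flat buffer and re-points param.grad at a view of that buffer, which avoids the fragmentation caused by many small gradient allocations. A minimal sketch of that copy, with hypothetical names:

def pack_grad_into_partition(flat_buffer, offset, param):
    # carve a window out of the flat buffer and alias param.grad to it
    window = flat_buffer.narrow(0, offset, param.numel())
    window.copy_(param.grad.view(-1))
    param.grad.data = window.view_as(param.grad)
    return offset + param.numel()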
- if self.previous_reduced_grads is None: - self.previous_reduced_grads = [] - self.previous_reduced_grads.append(param) - else: - param.grad = None # only if self.partition_gradients - elif self.contiguous_gradients: - self.copy_grads_in_partition(param) - - self.grads_in_ipg_bucket = [] - self.params_in_ipg_bucket = [] - self.ipg_bucket_has_moe_params = False - self.elements_in_ipg_bucket = 0 - ##################################################################### - - def reduce_ready_partitions_and_remove_grads(self, param, i): - if self.partition_gradients or self.is_gradient_accumulation_boundary: - self.reduce_independent_p_g_buckets_and_remove_grads(param, i) - - def zero_reduced_gradients(self, partition_id, i): - def are_all_related_partitions_reduced(params_id): - for partition_id in self.param_to_partition_ids[i][params_id]: - if not self.is_partition_reduced[i][partition_id]: - return False - return True - - for params_id in self.is_grad_computed[i][partition_id]: - if are_all_related_partitions_reduced(params_id): - self.param_dict[params_id].grad = None # dead code - - def flatten_and_print(self, message, tensors, start=0, n=5): - flatten_tensor = self.flatten(tensors) - - def print_func(): - print(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) - - self.sequential_execution(print_func, message) - - def get_grads_to_reduce(self, i, partition_id): - def get_reducable_portion(key): - grad = self.param_dict[key].grad - total_elements = grad.numel() - start = self.grad_start_offset[i][partition_id][key] - num_elements = min( - total_elements - start, - self.partition_size[i] - - self.grad_partition_insertion_offset[i][partition_id][key]) - if not pg_correctness_test: - if num_elements == total_elements: - return grad - else: - return grad.contiguous().view(-1).narrow(0, - int(start), - int(num_elements)) - else: - if num_elements == total_elements: - return grad.clone() - else: - return grad.clone().contiguous().view(-1).narrow( - 0, - int(start), - int(num_elements)) - - grads_to_reduce = [] - for key in self.is_grad_computed[i][partition_id]: - grad = get_reducable_portion(key) - grads_to_reduce.append(grad) - return grads_to_reduce - - def sequential_execution(self, function, message, group=None): - if group is None: - group = self.dp_process_group - if dist.get_rank(group=group) == 0: - print(message) - for id in range(dist.get_world_size(group=group)): - if id == dist.get_rank(group=group): - function() - dist.barrier(group=group) - - def set_none_gradients_to_zero(self, i, partition_id): - for param_id in self.is_grad_computed[i][partition_id]: - param = self.param_dict[param_id] - if param.grad is None: - param.grad = torch.zeros_like(param) - - ######################Reduction Related Methods############################## - - def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None): - rank = None - tensor = self.flatten(bucket) - - tensor_to_allreduce = tensor - - if pg_correctness_test: - allreduce_always_fp32 = True - - if allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - tensor_to_allreduce.div_( - dist.get_world_size(group=self.dp_process_group)) - - if rank is None: - # "All Reducing" - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - else: - global_rank = _get_global_rank(self.dp_process_group, rank) - dist.reduce(tensor_to_allreduce, global_rank, - group=self.dp_process_group) - - if allreduce_always_fp32 and tensor is not tensor_to_allreduce: - if rank is None or rank ==
dist.get_rank(group=self.dp_process_group): - tensor.copy_(tensor_to_allreduce) - - return tensor - - def _clear_previous_reduced_grads(self): - if self.previous_reduced_grads is not None: - for param in self.previous_reduced_grads: - param.grad = None # overlap enabled - self.previous_reduced_grads = None - - # if rank is specified do a reduction instead of an allreduce - def allreduce_and_copy(self, small_bucket, rank=None, log=None): - if self.overlap_comm: - torch.cuda.synchronize() - # It is safe to clear the previously reduced grads of other partitions - self._clear_previous_reduced_grads() - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - with torch.cuda.stream(stream): - allreduced = self.allreduce_bucket( - small_bucket, rank=rank, log=log) - if rank is None or rank == dist.get_rank(group=self.dp_process_group): - for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): - buf.copy_(synced) - - def allreduce_no_retain(self, - bucket, - numel_per_bucket=500000000, - rank=None, - log=None): - small_bucket = [] - numel = 0 - for tensor in bucket: - small_bucket.append(tensor) - numel = numel + tensor.numel() - if numel > numel_per_bucket: - self.allreduce_and_copy(small_bucket, rank=rank, log=None) - small_bucket = [] - - if len(small_bucket) > 0: - self.allreduce_and_copy(small_bucket, rank=rank, log=log) - - # allows using reduction of gradients instead of using all_reduce - - def buffered_reduce_fallback(self, - rank, - grads, - elements_per_buffer=500000000, - log=None): - split_buckets = split_half_float_double(grads) - - for i, bucket in enumerate(split_buckets): - self.allreduce_no_retain(bucket, - numel_per_bucket=elements_per_buffer, - rank=rank, - log=log) - - ############################################################################# - ############################################################################# - ############################################################################# - - # views the tensor as multiple partitions and returns - # those partitions - def get_data_parallel_partitions(self, tensor, group_id): - partitions = [] - - dp = dist.get_world_size(group=self.real_dp_process_group[group_id]) - dp_id = dist.get_rank(group=self.real_dp_process_group[group_id]) - - total_num_elements = tensor.numel() - - base_size = total_num_elements // dp - remaining = total_num_elements % dp - - start = 0 - for id in range(dp): - partition_size = base_size - if id < remaining: - partition_size = partition_size + 1 - partitions.append(tensor.narrow(0, start, partition_size)) - start = start + partition_size - return partitions - - def get_partition_info(self, tensor_list, partition_size, partition_id): - params_in_partition = [] - params_not_in_partition = [] - - start_index = partition_size * partition_id - end_index = partition_size * (partition_id + 1) - - current_index = 0 - first_offset = 0 - - for tensor in tensor_list: - - tensor_size = tensor.numel() - - if (current_index >= start_index and current_index < end_index): - params_in_partition.append(tensor) - - elif start_index > current_index and start_index < (current_index + - tensor_size): - params_in_partition.append(tensor) - - assert ( - first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" - first_offset = start_index - current_index - - else: - params_not_in_partition.append(tensor) - - current_index = current_index + tensor_size - - return params_in_partition, params_not_in_partition, 
first_offset - - def zero_grad(self, set_grads_to_None=True): - """ - Zero FP16 parameter grads. - """ - # FP32 grad should never exist. - # For speed, set model fp16 grad to None by default - for group in self.fp16_groups: - for p in group: - if set_grads_to_None: - p.grad = None # epilogue and in step - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - def _model_parallel_all_reduce(self, tensor, op): - """ Perform all reduce within model parallel group, if any. - """ - if self.model_parallel_group is None: - pass - else: - torch.distributed.all_reduce(tensor=tensor, - op=op, - group=self.model_parallel_group) - - def clip_grad_norm(self, *args, **kwargs): - # dummy function to retain the same function interface - # as ColossalaiOptimizer for compatibility - pass - - def get_grad_norm_direct(self, gradients, params, norm_type=2): - """Clips gradient norm of an iterable of parameters. - - This is adapted from ``torch.nn.utils.clip_grad.clip_grad_norm_`` and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - norm_type = float(norm_type) - if norm_type == inf: - total_norm = max(g.data.abs().max() for g in gradients) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) - - # Take max across all GPUs. - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.MAX) - total_norm = total_norm_cuda[0].item() - else: - total_norm = 0.0 - # if dist.get_rank() == 0: - # print()(f"Total Norm begining {total_norm}") - for g, p in zip(gradients, params): - if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_norm = g.data.double().norm(2) - total_norm += param_norm.item() ** 2 - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) - - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) - - total_norm = total_norm_cuda[0].item() ** (1. / norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - # creates a flat fused tensor from the tensor list starting at the first_offset - # in the first tensor of the list. 
If there are not enough elements in the tensor - # list then the flat tensor will be padded with zeros - def get_flat_partition(self, - tensor_list, - first_offset, - partition_size, - dtype, - device, - return_tensor_list=False): - flat_tensor_list = [] - current_size = 0 - for i, tensor in enumerate(tensor_list): - if tensor.grad is None: - tensor.grad = torch.zeros_like(tensor) - - tensor = tensor.grad - num_elements = tensor.numel() - tensor_offset = 0 - - # we need to offset to get to the right element - if i == 0 and first_offset > 0: - tensor_offset = first_offset - num_elements = num_elements - tensor_offset - - # we dont need all elements of the tensor - if num_elements > (partition_size - current_size): - num_elements = partition_size - current_size - - # we need a narrow view of the tensor based on the tensor offset and number of elements that - # we need from this tensor - if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements))) - else: - flat_tensor_list.append(tensor) - - current_size = current_size + num_elements - - # this means its the last partition and does not align with the dp boundary. We need to pad before flattening - if current_size < partition_size: - flat_tensor_list.append( - torch.zeros(int(partition_size - current_size), - dtype=dtype, - device=device)) - - if return_tensor_list: - return flat_tensor_list - - return self.flatten(flat_tensor_list) - - def free_grad_in_param_list(self, param_list): - for p in param_list: - p.grad = None # in step - - def reset_cpu_buffers(self): - self.norm_for_param_grads = {} - self.local_overflow = False - - def log_timers(self, timer_names): - if self.timers is None: - return - - self.timers.log(names=list(timer_names)) - - def start_timers(self, timer_names): - if self.timers is None: - return - - for name in timer_names: - self.timers(name).start() - - def stop_timers(self, timer_names): - if self.timers is None: - return - - for name in timer_names: - self.timers(name).stop() - - def step(self, closure=None): - """ - Not supporting closure. - """ - self.micro_step_id = -1 - - if self.verbose: - report_memory_usage(f"In step before checking overflow") - - # First compute norm for all group so we know if there is overflow - self.check_overflow(self.partition_gradients) - - OPTIMIZER_ALLGATHER = 'optimizer_allgather' - OPTIMIZER_GRADIENTS = 'optimizer_gradients' - OPTIMIZER_STEP = 'optimizer_step' - timer_names = [OPTIMIZER_ALLGATHER, - OPTIMIZER_GRADIENTS, OPTIMIZER_STEP] - - prev_scale = self.loss_scale - self._update_scale(self.overflow) - if self.overflow: - if self.verbose: - report_memory_usage('After overflow before clearing gradients') - self.zero_grad() - if self.cpu_offload: - self.reset_cpu_buffers() - else: - self.averaged_gradients = {} - - if self.verbose: - report_memory_usage('After overflow after clearing gradients') - - print( - "[deepspeed] fp16 dynamic loss scale overflow! Rank {} Skipping step. 
Attempted loss scale: {}, " - "reducing to {}".format(dist.get_rank(), - prev_scale, - self.loss_scale)) - self.start_timers(timer_names) - self.stop_timers(timer_names) - return - - self.start_timers([OPTIMIZER_GRADIENTS]) - norm_groups = [] - single_partition_grad_groups = [] - skip = False - for i, group in enumerate(self.fp16_groups): - partition_id = dist.get_rank(group=self.real_dp_process_group[i]) - if self.cpu_offload: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.params_in_partition[i])) - single_grad_partition = self.single_partition_of_fp32_groups[i].grad - else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.params_in_partition[i])) - - # free gradients for all the prameters that are not updated by this process - self.free_grad_in_param_list(self.params_not_in_partition[i]) - - # create a flat gradients for parameters updated by this process - # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors - if partition_id == dist.get_world_size( - group=self.real_dp_process_group[i]) - 1: - single_grad_partition = self.flatten_dense_tensors_aligned( - self.averaged_gradients[i], - int(self.partition_size[i])).to( - self.single_partition_of_fp32_groups[i].dtype) - else: - single_grad_partition = self.flatten(self.averaged_gradients[i]).to( - self.single_partition_of_fp32_groups[i].dtype) - assert single_grad_partition.numel() == self.partition_size[i], \ - "averaged gradients have different number of elements that partition size {} {} {} {}".format( - single_grad_partition.numel(), self.partition_size[i], i, partition_id) - - self.single_partition_of_fp32_groups[i].grad = single_grad_partition - # release all the gradient since we have already created a necessary copy in dp_grad_partition - self.free_grad_in_param_list(self.params_in_partition[i]) - - self.averaged_gradients[i] = None - - single_partition_grad_groups.append(single_grad_partition) - - if self.has_moe_layers: - self._average_expert_grad_norms(norm_groups) - - self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) - self.stop_timers([OPTIMIZER_GRADIENTS]) - - self.start_timers([OPTIMIZER_STEP]) - if self.deepspeed_adam_offload: - from deepspeed.ops.adam import DeepSpeedCPUAdam - if type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half: - fp16_param_groups = [ - fp16_partitions[partition_id] - for fp16_partitions in self.parallel_partitioned_fp16_groups - ] - self.optimizer.step(fp16_param_groups=fp16_param_groups) - else: - self.optimizer.step() - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, - self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_( - fp32_partition.data) - else: - self.optimizer.step() - - # get rid of the fp32 gradients. 
Not needed anymore - if not self.cpu_offload: - for group in self.single_partition_of_fp32_groups: - group.grad = None # in step - - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, - self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_(fp32_partition.data) - - self.stop_timers([OPTIMIZER_STEP]) - - if self.cpu_offload: - self.reset_cpu_buffers() - - self.start_timers([OPTIMIZER_ALLGATHER]) - # gather the updated weights from everyone - for group_id, partitioned_params in enumerate(self.parallel_partitioned_fp16_groups): - - # Sequential AllGather Best of both worlds - dp_world_size = dist.get_world_size( - group=self.real_dp_process_group[group_id]) - num_shards = max( - 1, - partitioned_params[partition_id].numel() * dp_world_size // - self.allgather_bucket_size) - - shard_size = partitioned_params[partition_id].numel() // num_shards - num_elements = shard_size - - assert shard_size * \ - num_shards <= partitioned_params[partition_id].numel() - - for shard_id in range(num_shards): - - if shard_id == (num_shards - 1): - num_elements = partitioned_params[partition_id].numel( - ) - shard_id * shard_size - - shard_list = [] - for dp_id in range(dp_world_size): - curr_shard = partitioned_params[dp_id].narrow( - 0, - shard_id * shard_size, - num_elements).detach() - shard_list.append(curr_shard) - - dist.all_gather(shard_list, - shard_list[partition_id], - group=self.real_dp_process_group[group_id]) - self.stop_timers([OPTIMIZER_ALLGATHER]) - - # TODO: we probably don't need this? just to be safe - for i in range(len(norm_groups)): - self._update_model_fp16_weights(i) - - self.log_timers(timer_names) - if self.verbose: - report_memory_usage('After zero_optimizer step') - - return - - def _average_expert_grad_norms(self, norm_groups): - for i, norm in enumerate(norm_groups): - if self.is_moe_param_group[i]: - scaled_norm = norm * 1.0 / float( - dist.get_world_size(group=self.ep_process_group)) - scaled_norm_tensor = torch.tensor(scaled_norm, - device='cuda', - dtype=torch.float) - dist.all_reduce(scaled_norm_tensor, - group=self.ep_process_group) - norm_groups[i] = scaled_norm_tensor.item() - - def unscale_and_clip_grads(self, grad_groups_flat, norm_groups): - total_norm = 0.0 - for norm in norm_groups: - total_norm += norm ** 2.0 - total_norm = math.sqrt(total_norm) - - # compute combined scale factor for this group - combined_scale = self.loss_scale - if self.clip_grad > 0.: - # norm is in fact norm*scale - clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad - if clip > 1: - combined_scale = clip * self.loss_scale - - for grad in grad_groups_flat: - if isinstance(grad, list): - sub_partitions = grad - for g in sub_partitions: - g.data.mul_(1. / combined_scale) - else: - grad.data.mul_(1. 
/ combined_scale) - - def _check_overflow(self, partition_gradients=True): - self.overflow = self.has_overflow(partition_gradients) - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params, is_grad_list=False): - for p in params: - if p.grad is not None and self._has_inf_or_nan(p.grad.data): - return True - - return False - - def has_overflow_partitioned_grads_serial(self): - for i in range(len(self.fp16_groups)): - for j, grad in enumerate(self.averaged_gradients[i]): - if grad is not None and self._has_inf_or_nan(grad.data, j): - return True - return False - - def has_overflow(self, partition_gradients=True): - if partition_gradients: - overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( - ) - overflow_gpu = torch.cuda.ByteTensor([overflow]) - '''This will capture overflow across all data parallel and expert parallel process - Since expert parallel process are a subset of data parallel process''' - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) - - else: - params = [] - for group in self.fp16_groups: - for param in group: - params.append(param) - - overflow = self.has_overflow_serial( - params, is_grad_list=partition_gradients) - overflow_gpu = torch.cuda.ByteTensor([overflow]) - - # Since each model parallel GPU carries only part of the model, - # make sure overflow flag is synced across all the model parallel GPUs - self._model_parallel_all_reduce(tensor=overflow_gpu, - op=torch.distributed.ReduceOp.MAX) - - overflow = overflow_gpu[0].item() - return bool(overflow) - - # `x` is a torch.Tensor - @staticmethod - def _has_inf_or_nan(x, j=None): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. - if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - def backward(self, loss, retain_graph=False): - """ - :attr:`backward` performs the following steps: - - 1. fp32_loss = loss.float() - 2. scaled_loss = fp32_loss*loss_scale - 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves - """ - self.micro_step_id += 1 - - if self.contiguous_gradients: - self.ipg_buffer = [] - buf_0 = torch.empty(int(self.reduce_bucket_size), - dtype=self.dtype, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_0) - - # Use double buffers to avoid data access conflict when overlap_comm is enabled. 
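backward above follows the usual dynamic loss-scaling recipe: the fp32 loss is multiplied by the current scale before autograd runs so that fp16 gradients stay representable, and _has_inf_or_nan is later used to detect overflow and skip the step. A hedged sketch of those two pieces, with hypothetical names:

import torch

def scaled_backward(loss, loss_scale, retain_graph=False):
    # steps 1-3 of the docstring above: fp32 loss, multiply by the scale, backward
    (loss.float() * loss_scale).backward(retain_graph=retain_graph)

def grads_overflowed(params):
    # True if any gradient contains inf or NaN
    return any(p.grad is not None and not torch.isfinite(p.grad).all()
               for p in params)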
- if self.overlap_comm: - buf_1 = torch.empty(int(self.reduce_bucket_size), - dtype=self.dtype, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_1) - self.ipg_index = 0 - - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - - def check_overflow(self, partition_gradients=True): - self._check_overflow(partition_gradients) - - def _update_scale(self, has_overflow=False): - self.loss_scaler.update_scale(has_overflow) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - def _get_loss_scale(self): - return self.loss_scaler.loss_scale - - def _set_loss_scale(self, value): - self.loss_scaler.cur_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - cur_scale = property(_get_loss_scale, _set_loss_scale) - - # Return group tensor after removing paddings that are added for alignment to DP world size. - # This method works on the assumption that each group contains a single flattened tensor. - def _get_groups_without_padding(self, groups_with_padding): - groups_without_padding = [] - for i, group in enumerate(groups_with_padding): - lean_length = group.numel() - self.groups_padding[i] - groups_without_padding.append(group[:lean_length]) - - return groups_without_padding - - # Return optimizer state after removing paddings that are added for alignment. - def _get_state_without_padding(self, state_with_padding, padding): - lean_state = {} - for key, value in state_with_padding.items(): - if torch.is_tensor(value): - lean_length = value.numel() - padding - lean_state[key] = value[:lean_length] - else: - lean_state[key] = value - - return lean_state - - # Return base optimizer states. - # This method assumes that each param group contains a single flattened tensor. - def _get_base_optimizer_state(self): - optimizer_groups_state = [] - for i, group in enumerate(self.optimizer.param_groups): - p = group['params'][0] - lean_optimizer_state = self._get_state_without_padding( - self.optimizer.state[p], - self.groups_padding[i]) - optimizer_groups_state.append(lean_optimizer_state) - - return optimizer_groups_state - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. 
- - Example:: - - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict['base_optimizer_state'] = self._get_base_optimizer_state() - - state_dict['zero_stage'] = ZERO_OPTIMIZATION_GRADIENTS - state_dict['partition_count'] = self.partition_count - - state_dict['ds_version'] = version - - # Remove paddings for DP alignment to enable loading for other alignment values - fp32_groups_without_padding = self._get_groups_without_padding( - self.single_partition_of_fp32_groups) - state_dict['single_partition_of_fp32_groups'] = fp32_groups_without_padding - - # if self.cpu_offload: - # state_dict_tmp = async_copy_to(state_dict, - # 'cpu', - # torch.cuda.current_stream()) - # state_dict = state_dict_tmp - - return state_dict - - # Restore base optimizer fp32 weights from checkpoint by: - # 1) Merging fp32 weights from checkpoints of all partitions - # 2) Extracting fp32 weights for current partition from merged weights - # 3) Using extracted weights to update base optimizer weights directly. - def _restore_from_fp32_weights(self, all_state_dict): - merged_single_partition_of_fp32_groups = [] - for i in range(len(self.single_partition_of_fp32_groups)): - partition_id = dist.get_rank(group=self.real_dp_process_group[i]) - merged_partitions = [ - sd['single_partition_of_fp32_groups'][i] for sd in all_state_dict - ] - flat_merged_partitions = self.flatten_dense_tensors_aligned( - merged_partitions, - self.nccl_start_alignment_factor * - dist.get_world_size(group=self.real_dp_process_group[i])) - dp_partitions = self.get_data_parallel_partitions( - flat_merged_partitions, i) - merged_single_partition_of_fp32_groups.append( - dp_partitions[partition_id]) - - for current, saved in zip(self.single_partition_of_fp32_groups, merged_single_partition_of_fp32_groups): - current.data.copy_(saved.data) - - # Restore base optimizer fp32 weights from ZeRO fp16 weights - def _restore_from_fp16_weights(self): - for group_id, (fp16_partitions, fp32_partition) in enumerate( - zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups)): - partition_id = dist.get_rank( - group=self.real_dp_process_group[group_id]) - fp32_partition.data.copy_(fp16_partitions[partition_id].data) - - # Refresh the fp32 master params from the fp16 copies. 
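_restore_from_fp32_weights above is what makes these checkpoints elastic: the fp32 partitions saved by every rank are merged into one flat tensor and then re-split for the current data-parallel world size, so the GPU count may change between save and load. A simplified sketch with alignment padding and uneven splits omitted; the names are illustrative:

import torch

def restore_my_fp32_partition(saved_partitions, my_rank, world_size):
    merged = torch.cat([p.view(-1) for p in saved_partitions])
    chunk = merged.numel() // world_size  # the real code pads/aligns before splitting
    return merged.narrow(0, my_rank * chunk, chunk).clone()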
- def refresh_fp32_params(self): - self._restore_from_fp16_weights() - - # Extract optimizer state for current partition from merged states of all partitions - def _partition_base_optimizer_state(self, state_key, all_partition_states, group_id): - partition_id = dist.get_rank( - group=self.real_dp_process_group[group_id]) - alignment = dist.get_world_size( - group=self.real_dp_process_group[group_id]) - if torch.is_tensor(all_partition_states[0]): - flat_merged_partitions = self.flatten_dense_tensors_aligned( - all_partition_states, - alignment) - dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions, - group_id) - return dp_partitions[partition_id] - else: - # Assume non-tensor states are not partitioned and equal across ranks, so return first one - return all_partition_states[0] - - # Restore base optimizer state from checkpoint by - # 1) Merging optimizer state from checkpoints of all partitions - # 2) Extracting optimizer state for current partition from the merged state - # 3) Using the extracted value to directly update the base optimizer. - def _restore_base_optimizer_state(self, all_state_dict): - base_optimizer_group_states = [] - for i in range(len(self.optimizer.param_groups)): - partition_states = {} - all_partition_group_states = [ - sd['base_optimizer_state'][i] for sd in all_state_dict - ] - for key in all_partition_group_states[0].keys(): - all_partition_states = [ - all_states[key] for all_states in all_partition_group_states - ] - partition_states[key] = self._partition_base_optimizer_state( - key, - all_partition_states, - i) - base_optimizer_group_states.append(partition_states) - - for i, group in enumerate(self.optimizer.param_groups): - p = group['params'][0] - for key, saved in base_optimizer_group_states[i].items(): - if torch.is_tensor(self.optimizer.state[p][key]): - self.optimizer.state[p][key].data.copy_(saved.data) - else: - self.optimizer.state[p][key] = saved - - def load_state_dict(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): - r"""Loading ZeRO checkpoint - - Arguments: - state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition. - Note that the number of saved partitions may differ from number of loading partitions to support - changing GPU count, specifically DP world size, between saving and loading checkpoints. - load_optimizer_states: Boolean indicating whether or not to load base optimizer states - load_from_fp32_weights: Boolean indicating whether to initialize fp32 master weights from fp32 - copies in checkpoints (no precision loss) or from model's fp16 copies (with precision loss). - """ - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - # I think it should actually be ok to reload the optimizer before the model. 
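A hedged usage sketch for the state_dict()/load_state_dict() pair above: every data-parallel rank saves its own shard, and on load the full list of shards is passed in so each rank can carve out its slice even if the data-parallel world size changed. The file naming and the saved_world_size argument are illustrative assumptions, not part of this module:

import torch
import torch.distributed as dist

def save_zero_checkpoint(optimizer, prefix):
    torch.save(optimizer.state_dict(), f"{prefix}_rank{dist.get_rank()}.pt")

def load_zero_checkpoint(optimizer, prefix, saved_world_size):
    shards = [torch.load(f"{prefix}_rank{r}.pt", map_location="cpu")
              for r in range(saved_world_size)]
    optimizer.load_state_dict(shards,
                              load_optimizer_states=True,
                              load_from_fp32_weights=True)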
- self.loss_scaler = state_dict_list[0]['loss_scaler'] - self.dynamic_loss_scale = state_dict_list[0]['dynamic_loss_scale'] - self.overflow = state_dict_list[0]['overflow'] - - # zero stage 1 mode - if not self.partition_gradients: - required_version = pkg_version.parse("0.3.17") - ckpt_version = state_dict_list[0].get("ds_version", False) - error_str = f"ZeRO stage 1 changed in {required_version} and is not backwards compatible " \ - "with older stage 1 checkpoints. If you'd like to load an old ZeRO-1 checkpoint " \ - "please set 'legacy_stage1': true in your zero config json. This old version of " \ - "stage 1 will be removed in v0.4.0." - - assert ckpt_version, f"Empty ds_version! {error_str}" - assert required_version <= pkg_version.parse( - ckpt_version), f"Old version: {ckpt_version} {error_str}" - - if load_optimizer_states: - self._restore_base_optimizer_state(state_dict_list) - - # At this point, the optimizer's references to the model's fp32 parameters are up to date. - # The optimizer's hyperparameters and internal buffers are also up to date. - # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still - # out of date. There are two options. - # 1: Refresh the master params from the model's fp16 params. - # This requires less storage but incurs precision loss. - # 2: Save and restore the fp32 master copies separately. - # We choose option 1 if changing DP degree and option 2 otherwise. - # - # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device - # of their associated parameters, because it's possible those buffers might not exist yet in - # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been - # constructed in the same way as the one whose state_dict we are loading, the same master params - # are guaranteed to exist, so we can just copy_() from the saved master params. - - if load_from_fp32_weights: - self._restore_from_fp32_weights(state_dict_list) - else: - self._restore_from_fp16_weights() - - def allreduce_gradients(self): - self.overlapping_partition_gradients_reduce_epilogue() - - -def _handle_overflow(cpu_sum, x, i): - import math - rank = torch.distributed.get_rank() - if rank == 0: - t_i = -1 - for v_i, v in enumerate(x.data.contiguous().view(-1)): - if not math.isfinite(float(v)): - t_i = v_i - break - print( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) - - -def estimate_zero2_model_states_mem_needs(total_params, - num_gpus_per_node=1, - num_nodes=1, - cpu_offload=True, - additional_buffer_factor=1.5): - total_gpus = num_nodes * num_gpus_per_node - - if cpu_offload: - gpu_mem = 2 * total_params - cpu_mem = total_params * \ - max(4 * total_gpus, 16) * additional_buffer_factor - else: - gpu_mem = 4 * total_params + int(16 * total_params / total_gpus) - cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor - - return int(cpu_mem), int(gpu_mem) - - -def model_to_params(model): - # shared params calculated only once - total_params = sum( - dict((p.data_ptr(), - p.numel()) for p in model.parameters()).values()) - return total_params - - -def estimate_zero2_model_states_mem_needs_all_live(model, - num_gpus_per_node=1, - num_nodes=1, - additional_buffer_factor=1.5): - """ - Print out estimates on memory usage requirements for ZeRO 2 params, optim states and gradients - for a given ``model`` and hardware setup. 
- - If you have an actual model object, use this function and everything will be derived - automatically. - - If it's a hypothetical model, use ``estimate_zero2_model_states_mem_needs_all_cold`` where you have to pass - the ``total_params`` explicitly. - - Args: - - ``model``: ``nn.Module`` object - - ``num_gpus_per_node``: how many gpus per node (defaults to 1) - - ``num_nodes``: how many nodes (defaults to 1), - - ``additional_buffer_factor``: estimation factor (defaults to 1.5): - - """ - - total_params = model_to_params(model) - - estimate_zero2_model_states_mem_needs_all_cold( - total_params=total_params, - num_gpus_per_node=num_gpus_per_node, - num_nodes=num_nodes, - additional_buffer_factor=additional_buffer_factor) - - -def estimate_zero2_model_states_mem_needs_all_cold(total_params, - num_gpus_per_node=1, - num_nodes=1, - additional_buffer_factor=1.5): - """ - Print out estimates on memory usage requirements for ZeRO 2 params, optim states and gradients - for a given ``model`` and hardware setup. - - If it's a hypothetical model, use this function where you have to pass - the ``total_params`` and ``largest_layer_params`` explicitly. - - If you have an actual model object, use ``estimate_zero2_model_states_mem_needs_all_live`` and everything - will be derived automatically. - - Args: - - ``total_params``: total model params - - ``num_gpus_per_node``: how many gpus per node (defaults to 1) - - ``num_nodes``: how many nodes (defaults to 1), - - ``additional_buffer_factor``: estimation factor (defaults to 1.5): - - """ - - def format_options(cpu_offload): - enabled = [] - enabled.append(f"cpu_offload={1 if cpu_offload else 0}") - return ", ".join(enabled) - - nodes_str = "nodes" if num_nodes > 1 else "node" - gpus_str = "GPUs" if num_gpus_per_node > 1 else "GPU" - print( - "Estimated memory needed for params, optim states and gradients for a:\n" - f"HW: Setup with {num_nodes} {nodes_str}, {num_gpus_per_node} {gpus_str} per node.\n" - f"SW: Model with {int(total_params / 1e6)}M total params.") - print(" per CPU | per GPU | Options") - for cpu_offload in [True, False]: - cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs( - total_params=total_params, - num_gpus_per_node=num_gpus_per_node, - num_nodes=num_nodes, - cpu_offload=cpu_offload, - additional_buffer_factor=additional_buffer_factor - ) - - options_str = format_options(cpu_offload=cpu_offload) - print( - f" {cpu_mem / 2 ** 30:7.2f}GB | {gpu_mem / 2 ** 30:6.2f}GB | {options_str}") diff --git a/colossalai/zero/zero_redundancy_optimizer_level_3.py b/colossalai/zero/zero_redundancy_optimizer_level_3.py deleted file mode 100644 index 34051e638..000000000 --- a/colossalai/zero/zero_redundancy_optimizer_level_3.py +++ /dev/null @@ -1,3624 +0,0 @@ -""" -"Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. 
-""" - -import math -from collections import OrderedDict - -import torch -import torch.distributed as dist - -try: - from deepspeed.utils.debug import debug_module2name_id, debug_param2name_id, debug_param2name_id_numel, \ - debug_param2name_id_shape_device, debug_module2name_class - from deepspeed.ops.adam import DeepSpeedCPUAdam - from deepspeed.ops.op_builder import UtilsBuilder - from deepspeed.runtime.swap_tensor.partitioned_optimizer_swapper import PartitionedOptimizerSwapper - from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import PipelinedOptimizerSwapper - from deepspeed.runtime.utils import is_model_parallel_parameter - from deepspeed.runtime.zero.constants import ZERO_OPTIMIZATION_WEIGHTS - from deepspeed.runtime.zero.partition_parameters import * - from deepspeed.runtime.zero.partition_parameters import _init_external_params -except ImportError: - pass - -from torch._six import inf -from torch.distributed.distributed_c10d import _get_global_rank -from torch.optim import Optimizer - -from colossalai.core import global_context as gpc -from colossalai.utils import report_memory_usage -from .loss_scaler import LossScaler, DynamicLossScaler -from colossalai.context import ParallelMode - -# Toggle this to true to enable correctness test -# with gradient partitioning and without -pg_correctness_test = False - -FWD_MODULE_STACK = list() - - -def print_rank_0(message, debug=False, force=False): - rank = torch.distributed.get_rank() - if rank == 0 and (debug or force): - print(message) - # other variations - # - print for all ranks w/o interleaving - # printflock(f"[{rank}] {message}") - # - print to log file per rank - # log_rank_file(rank, message) - - -def input(msg): - return - - -def split_half_float_double(tensors): - dtypes = [ - "torch.cuda.HalfTensor", - "torch.cuda.FloatTensor", - "torch.cuda.DoubleTensor" - ] - buckets = [] - for i, dtype in enumerate(dtypes): - bucket = [t for t in tensors if t.type() == dtype] - if bucket: - buckets.append(bucket) - return buckets - - -def isclose(a, b, rtol=1e-09, atol=0.0): - return abs(a - b) <= max(rtol * max(abs(a), abs(b)), atol) - - -def lcm(x, y): - from fractions import gcd # or can import gcd from `math` in Python 3 - return x * y // gcd(x, y) - - -def move_to_cpu(tensor_list): - for tensor in tensor_list: - tensor.data = tensor.data.cpu() - - -def get_all_parameters(sub_module, recurse=False): - return itertools.chain(sub_module.named_parameters(recurse=recurse), - sub_module.ds_external_parameters()) - - -# apply torch.autograd.Function that calls a backward_function to tensors in output -def _apply_to_tensors_only(module, functional, backward_function, outputs): - if type(outputs) is tuple: - touched_outputs = [] - for output in outputs: - touched_output = _apply_to_tensors_only(module, - functional, - backward_function, - output) - touched_outputs.append(touched_output) - return tuple(touched_outputs) - elif type(outputs) is torch.Tensor: - return functional.apply(module, backward_function, outputs) - else: - return outputs - - -# for each tensor in outputs run the forward_funciton and register backward_function as hook -def _apply_forward_and_backward_to_tensors_only(module, - forward_function, - backward_function, - outputs): - if type(outputs) is tuple: - touched_outputs = [] - for output in outputs: - touched_output = _apply_forward_and_backward_to_tensors_only( - module, - forward_function, - backward_function, - output) - touched_outputs.append(touched_output) - return tuple(touched_outputs) - elif 
type(outputs) is torch.Tensor: - forward_function(outputs) - if outputs.requires_grad: - outputs.register_hook(backward_function) - return outputs - else: - return outputs - - -class ZeROOrderedDict(OrderedDict): - def __init__(self, parent_module, *args, **kwargs): - """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. - - Args: - parent_module (``collections.OrderedDict``): the collection to replace - """ - - super().__init__(*args, **kwargs) - self._parent_module = parent_module - self._in_forward = False - - def __getitem__(self, key): - param = super().__getitem__(key) - - # Params can be registered as None (e.g., bias) - if param is None: - return param - - if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: - if self._parent_module._parameters._in_forward: - print_rank_0(f'Registering external parameter from getter {key}', - force=False) - register_external_parameter(FWD_MODULE_STACK[-1], param) - param.all_gather() - - return param - - -def _inject_parameters(module, cls): - for module in module.modules(): - if cls == ZeROOrderedDict: - new_param = cls(parent_module=module) - else: - new_param = cls() - - for key, param in module._parameters.items(): - new_param[key] = param - module._parameters = new_param - - -# TODO Needs to be implemented -class PrefetchCoordinator(object): - def __init__(self): - # step_id keeps track of the number of sub-modules invoked so far - # the step_id is tracking forward and backward sequence of sub-modules - self.step_id = 0 - - # stores the sequence of sub modules in forward+backward pass - self.sub_module_trace = [] - - # maps sub_module id to submodule objects - self.id_to_sub_module_map = {} - - # stores the total number of parmeters in each sub_module - self.id_to_sub_module_size_map = {} - - self.trace_completed = False - - self.most_recent_sub_module_step = {} - - # reuse distances - self.reuse_numel_for_step_id = {} - - def record_trace(self, sub_module): - if not self.trace_completed: - self.sub_module_trace.append(sub_module.id) - self.id_to_sub_module_map[sub_module.id] = sub_module - - def print_trace(self): - print_rank_0( - f"The module trace is : {[self.id_to_sub_module_map[module_id].id for module_id in self.sub_module_trace]}" - ) - - def increment_step(self, sub_module): - self.most_recent_sub_module_step[sub_module.id] = self.step_id - self.step_id += 1 - - def reset_step(self): - self.step_id = 0 - - # returns the next numel parameters that will be used next but are not available or inflight - def get_params_to_prefetch(self, sub_module, numel=2000000): - - # numel_in_sub_module = 0 - # for name, param in sub_module.named_parameters(recurse=False): - # numel_in_sub_module += param.ds_numel - - # #if numel_in_sub_module < (numel // 2): - # return [] - - # tracing failed. The sub_module passed at the step_id must match with the sub_module during tracing - if sub_module.id != self.sub_module_trace[self.step_id]: - print_rank_0( - f"Tracing failed. 
Prefetching is disabled at sub-module: {debug_module2name_id(sub_module)}" - ) - return [] - - params_to_prefetch = [] - total_numel_to_prefetch = 0 - - for i in range(self.step_id, len(self.sub_module_trace)): - module_id = self.sub_module_trace[i] - for _, param in get_all_parameters(self.id_to_sub_module_map[module_id]): - if param.ds_status is ZeroParamStatus.NOT_AVAILABLE and ( - param.ds_id not in [p.ds_id for p in params_to_prefetch]): - params_to_prefetch.append(param) - total_numel_to_prefetch += param.ds_numel - # print_rank_0(f"Total numel to prefetch: {total_numel_to_prefetch}. Param: {param.ds_shape} and numel {param.ds_numel}, numel limit {numel}") - # and total_numel_to_prefetch > (numel_in_sub_module // 2): - if total_numel_to_prefetch >= numel: - return params_to_prefetch - - return params_to_prefetch - - # checks if this sub_module will be used again and if so then returns the number of elements - # in the parameters used between this sub_module and the reuse of this sub_module - def get_reuse_distance_in_numel(self, sub_module, sub_module_step_id=None): - # assert is_forward is not None, "is_forward must be set to True for Forward Propagation and False for backward Propagation" - is_there_reuse = False - reuse_distance_in_numel = 1000000000000 - - # set the appropriate trace - trace = self.sub_module_trace - total_steps = len(trace) - if sub_module_step_id is None: - sub_module_step_id = self.most_recent_sub_module_step[sub_module.id] - - # tracing failed. The sub_module passed at the step_id must match with the sub_module during tracing - if sub_module.id != trace[sub_module_step_id]: - print_rank_0( - f"Tracing failed. Cannot tell if the sub_module: {sub_module.id} is reused" - ) - return reuse_distance_in_numel - - # return cached value - if sub_module_step_id in self.reuse_numel_for_step_id: - return self.reuse_numel_for_step_id[sub_module_step_id] - - start_step = self.step_id - print_rank_0(f"Step id is {self.step_id} ") - for step_id in range(start_step, total_steps): - print_rank_0( - f"Trace id {trace[step_id]} and sub_module id {sub_module.id}") - if sub_module.id == trace[step_id]: - end_step = step_id - - is_there_reuse = True - reuse_distance_in_numel = self._distance_in_numel( - start_step, - end_step, - trace) - break - - self.reuse_numel_for_step_id[sub_module_step_id] = reuse_distance_in_numel - - return reuse_distance_in_numel - - def _distance_in_numel(self, start_step, end_step, trace): - distance_in_numel = 0 - for step_id in range(start_step, end_step): - module_id = trace[step_id] - for _, param in self.id_to_sub_module_map[module_id].named_parameters(recurse=False): - distance_in_numel += param.ds_numel - for _, param in self.id_to_sub_module_map[module_id].ds_external_parameters(): - distance_in_numel += param.ds_numel - return distance_in_numel - - -class PartitionedParameterCoordinator(object): - def __init__(self, - comm_stream=None, - max_reuse_distance_in_numel=500000000, - max_available_parameters_in_numel=700000000): - - self.in_flight_handles = [] - self.params_in_flight = [] - self.comm_stream = comm_stream if comm_stream is not None else torch.cuda.current_stream( - ) - self.prefetch_coordinator = PrefetchCoordinator() - self.hierarchy = 0 - - self.total_available_parameter_numel = 0 - self.max_available_parameters_in_numel = max_available_parameters_in_numel - - # max distance between two use of the module beyond which module is released - self.max_reuse_distance_in_numel = max_reuse_distance_in_numel - - def 
_increment_available_parameter_numel(self, increment): - self.total_available_parameter_numel += increment - - def _decrement_available_parameter_numel(self, decrement): - self.total_available_parameter_numel -= decrement - - '''-----------------------Tracing and Prefetching ---------------''' - - def record_trace(self, sub_module): - self.prefetch_coordinator.record_trace(sub_module) - - def finish_tracing(self, print_trace=False): - self.prefetch_coordinator.trace_completed = True - - if print_trace: - self.prefetch_coordinator.print_trace() - - # swap in parameter partitions from nvme for those parameters that will be used - # after the ones that are already being prefetched into full parameters - def _prefetch_nvme_param_partitions(self, sub_module, params_in_flight): - numel_in_flight = sum( - [param.ds_tensor.ds_numel for param in params_in_flight]) - upcoming_param_list = self.prefetch_coordinator.get_params_to_prefetch( - sub_module, - numel=2 * numel_in_flight) - swap_in_params = [] - for param in upcoming_param_list: - if len(swap_in_params) >= param.nvme_swapper.available_swap_in_buffers(): - break - if param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE: - swap_in_params.append(param) - - if len(swap_in_params) > 0: - swap_in_params[0].nvme_swapper.swap_in( - swap_in_params, async_op=True) - - # Pre fetches the parameters for sub_modules that comes after - # the current sub_module. This call is asynchronous - def prefetch_next_sub_modules(self, sub_module, numel=5000000, nvme=False): - - params_to_prefetch = [] - if not self.prefetch_coordinator.trace_completed: - return params_to_prefetch - - # prefetch if there is no current prefetching in flight - if not self.in_flight_handles and self.total_available_parameter_numel < self.max_available_parameters_in_numel: - params_to_prefetch = self.prefetch_coordinator.get_params_to_prefetch( - sub_module, - numel=numel) - - self._all_gather(params_to_prefetch, async_op=True) - for param in params_to_prefetch: - param.ds_status = ZeroParamStatus.INFLIGHT - - # keeping track of number of elements consumed by available parmaeters - self._increment_available_parameter_numel(param.ds_numel) - - if nvme: - self._prefetch_nvme_param_partitions( - sub_module, params_to_prefetch) - - self._print_prefetch_elements_info(sub_module, params_to_prefetch) - print_rank_0( - f"{'--' * self.hierarchy}--PreFetching parameters {[param.ds_id for param in params_to_prefetch]} and available {self.total_available_parameter_numel}, max limit {self.max_available_parameters_in_numel}", - force=False) - - def _print_prefetch_elements_info(self, sub_module, params_to_prefetch): - sub_module_numel = 0.0 - for name, param in sub_module.named_parameters(recurse=False): - sub_module_numel += param.ds_numel - numel_being_prefetched = 0 - for param in params_to_prefetch: - numel_being_prefetched = param.ds_numel - print_rank_0( - f"{'--' * self.hierarchy}--PreFetching {numel_being_prefetched} numels and number of numel in the next sub module is {sub_module_numel}", - force=False) - - def increment_step(self, sub_module): - self.prefetch_coordinator.increment_step(sub_module) - - def reset_step(self): - self.prefetch_coordinator.reset_step() - - '''----------------------------------------------------------------------''' - - # Fetches the parameters in the sub_module - # This call is blocking - def fetch_sub_module(self, sub_module): - partitioned_params = [] - params_in_flight = False - print_rank_0( - f"{'--' * self.hierarchy}Fetching params in module 
{debug_module2name_class(sub_module)}" - ) - params_to_fetch = [ - param for _, - param in sub_module.named_parameters(recurse=False) - ] - # print([n for n,p in sub_module.named_parameters(recurse=False)]) - - if hasattr(sub_module, 'ds_external_parameters'): - print_rank_0( - f"{'--' * self.hierarchy}--Fetching external parameters {sub_module.ds_external_parameters()}" - ) - params_to_fetch += [ - param for _, - param in sub_module.ds_external_parameters() - ] - # for _, param in sub_module.named_parameters(recurse=False): - for param in params_to_fetch: - param.ds_active_sub_modules += 1 - print_rank_0( - f"{'--' * self.hierarchy}--Fetching parameters {debug_param2name_id_shape(param)} with active sub modules {param.ds_active_sub_modules}" - ) - - if param.ds_status == ZeroParamStatus.AVAILABLE: - print_rank_0( - f"{'--' * self.hierarchy}--Parameter {debug_param2name_id(param)} is already available" - ) - - if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: - print_rank_0( - f"{'--' * self.hierarchy}--Parameter {debug_param2name_id(param)} is being fetched" - ) - partitioned_params.append(param) - - # keeping track of number of elements consumed by available parmaeters - self._increment_available_parameter_numel(param.ds_numel) - print_rank_0(f"Incrementing with parameter id {param.ds_id}") - - if param.ds_status == ZeroParamStatus.INFLIGHT: - params_in_flight = True - print_rank_0( - f"{'--' * self.hierarchy}--Parameters {debug_param2name_id(param)} is already in flight (prefetched)" - ) - self.hierarchy += 1 - - # parameters are partitioned and need to be allgathered - self._all_gather(partitioned_params, async_op=True) - - # parameters are inflight and communication needs to be completed - if partitioned_params or params_in_flight: - self._synchronize_communication() - - for _, param in sub_module.named_parameters(recurse=False): - param.ds_status = ZeroParamStatus.AVAILABLE - print_rank_0( - f"Param {debug_param2name_id_shape_device(param)} norm={param.norm()}", - force=False) - # print_rank_0(f"After fetching (id, shape, device): {[(param.ds_id, param.shape, param.device) for param in sub_module.named_parameters(recurse=False)]}") - - def release_sub_module(self, sub_module): - self.hierarchy -= 1 - print_rank_0( - f"{'--' * self.hierarchy}Releasing params in module {debug_module2name_class(sub_module)}" - ) - params_to_release = [ - param for _, - param in sub_module.named_parameters(recurse=False) - ] - - if hasattr(sub_module, 'ds_external_parameters'): - # print_rank_0(f"Releasing external parameters {sub_module.ds_external_parameters()}") - params_to_release += [ - param for _, - param in sub_module.ds_external_parameters() - ] - - # for _, param in sub_module.named_parameters(recurse=False): - for param in params_to_release: - param.ds_active_sub_modules -= 1 - if not param.ds_active_sub_modules and not self._keep_for_later( - sub_module) and not param.ds_persist: - - print_rank_0( - f"{'--' * self.hierarchy}--Releasing parameter {debug_param2name_id_numel(param)} active sub modules {param.ds_active_sub_modules} and keep for later {self._keep_for_later(sub_module)}", - force=False) - - # Keeping track of number of elements that are consumed by available parameters - self._decrement_available_parameter_numel(param.ds_numel) - - # report_memory_usage( - # f"Before releasing param {debug_param2name_id_numel(param)}", - # ) - param.partition(hierarchy=self.hierarchy) - - # report_memory_usage( - # f"After releasing param {debug_param2name_id_numel(param)}", - # ) - - 
param.ds_status = ZeroParamStatus.NOT_AVAILABLE - else: - print_rank_0( - f"{'--' * self.hierarchy}--Did not release param {debug_param2name_id_numel(param)} with active sub modules {param.ds_active_sub_modules}, keep for later={self._keep_for_later(sub_module)} and persistence={param.ds_persist}", - force=False) - - def release_and_reset_parameter(self, param): - param.ds_active_sub_modules = 0 - if param.ds_status == ZeroParamStatus.AVAILABLE: - print_rank_0( - f"Releasing unpartitioned param {debug_param2name_id_numel(param)} active sub-modules {param.ds_active_sub_modules} and persisitence {param.ds_persist}" - ) - self._decrement_available_parameter_numel(param.ds_numel) - param.partition() - - def _keep_for_later(self, sub_module): - if not self.prefetch_coordinator.trace_completed: - return False - if self.max_reuse_distance_in_numel == 0: - return False - reuse_distance_in_numel = self.prefetch_coordinator.get_reuse_distance_in_numel( - sub_module) - # print_rank_0(f"Reuse distance and numel for sub_module id {sub_module.id} is {reuse_distance_in_numel}") - return reuse_distance_in_numel < self.max_reuse_distance_in_numel - - def _all_gather(self, partitioned_params, async_op=False): - with torch.cuda.stream(self.comm_stream): - handles = partitioned_params[0].all_gather( - param_list=partitioned_params, - async_op=async_op, - hierarchy=self.hierarchy) if partitioned_params else None - - if handles is not None: - self.in_flight_handles.extend(handles) - self.params_in_flight.extend(partitioned_params) - - def _synchronize_communication(self, synchronize_streams=True): - assert len(self.params_in_flight) == len(self.in_flight_handles) - for handle, param in zip(self.in_flight_handles, self.params_in_flight): - if handle is not None: - with torch.cuda.stream(self.comm_stream): - handle.wait() - param.ds_status = ZeroParamStatus.AVAILABLE - self.comm_stream.synchronize() - torch.cuda.synchronize() if synchronize_streams else None - self.in_flight_handles = [] - self.params_in_flight = [] - - -class PreBackwardFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, module, pre_backward_function, outputs): - ctx.module = module - ctx.pre_backward_function = pre_backward_function - if not hasattr(module, "applied_pre_backward_ref_cnt"): - module.applied_pre_backward_ref_cnt = 0 - module.applied_pre_backward_ref_cnt += 1 - # print(f"After Forward: {ctx.module.__class__.__name__}") - outputs = outputs.detach() - return outputs - - @staticmethod - def backward(ctx, *args): - # print(f"Before Backward: {ctx.module.__class__.__name__}") - ctx.pre_backward_function(ctx.module) - return (None, None) + args - - -class PostBackwardFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, module, pre_backward_function, output): - ctx.module = module - if output.requires_grad: - # TODO SOME TIMES post backward does not report_memory_usage()ered debug in detail - # Should only cause increase in memory not correctness issue - # if output.grad_fn.__class__.__name__ == 'ViewBackward': - # ctx.view=True - # print(f"Warning view tensor for input to module : {module.__class__.__name__}. Backward hooks may not trigger properly") - # assert len(module.parameters(recurse=False)), "The input tensor to the module is a view, and autograd Function or register_hook is not triggered with view tensors." 
- # if module.ds_grads_remaining == 0: - # print(f"Before Forward: {ctx.module.__class__.__name__}") - module.ds_grads_remaining += 1 - ctx.pre_backward_function = pre_backward_function - output = output.detach() - return output - - @staticmethod - def backward(ctx, *args): - ctx.module.ds_grads_remaining = ctx.module.ds_grads_remaining - 1 - if ctx.module.ds_grads_remaining == 0: - ctx.pre_backward_function(ctx.module) - # print(f"After Backward: {ctx.module.__class__.__name__}") - return (None, None) + args - - -INITIAL_MICRO_STEP_ID = -1 - - -class ZeroRedundancyOptimizer_Level_3(Optimizer): - """ - ZeroRedundancyOptimizer_Level_3 designed to reduce the memory footprint - required for training large deep learning models. - - For more details please report_memory_usage() Optimization Towards Training A Trillion Parameter Models - https://arxiv.org/abs/1910.02054 - - """ - - def __init__(self, - module, - init_optimizer, - dp_paralllel_mode=ParallelMode.DATA, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=False, - contiguous_gradients=True, - reduce_bucket_size=500000000, - prefetch_bucket_size=50000000, - max_reuse_distance=1000000000, - max_live_parameters=1000000000, - param_persistence_threshold=100000, - reduce_scatter=True, - overlap_comm=False, - offload_optimizer_config=None, - offload_param_config=None, - sub_group_size=1000000000000, - clip_grad=0.0, - allreduce_always_fp32=False, - postscale_gradients=True, - gradient_predivide_factor=1.0, - gradient_accumulation_steps=1, - aio_config=None, - dtype=torch.half): - # mpu = None - # mpu is removed from the parameter list - # tensor parallel will be automatically detected later - - # LSG: default parameter for compatibility - elastic_checkpoint = False - timers = None - dp_process_group = gpc.get_group(dp_paralllel_mode) - self.verbose = verbose - - # LSG: in deepspeed deepspeed/runtime/zero/partition_parameters.py, - # self.local_device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) - # the local device is obtained by env var LOCAL_RANK, thus, need to change this - # env var on the spot as LOCAL_RANK may not be present - if not 'LOCAL_RANK' in os.environ: - device_id = gpc.get_global_rank() % torch.cuda.device_count() - os.environ['LOCAL_RANK'] = str(device_id) - - # self.local_device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) - - if self.verbose: - report_memory_usage("Stage 3 initialize beginning") - - if dist.get_rank() == 0: - print(f"Reduce bucket size {reduce_bucket_size}") - print(f"Allgather bucket size {prefetch_bucket_size}") - # The fused optimizer does all the work. We need this layer for two reason: - # 1. maintain same user API from apex.fp16_utils - # 2. keep common stuff here in case we need to add ne552w fused optimizer later - - # differences from apex.fp16_utils: - # - assume all model params in fp16 - # - assume all params requires grad - # - flat by groups, not keeping state. TODO: remove state explicitly? - # - master gard and unflat master weight never exist. TODO: a way to save out unflat master? 
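# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of this diff. The deleted
# PreBackwardFunction / PostBackwardFunction classes above rely on a small
# torch.autograd.Function trick: wrap a module's forward output so that a
# callback fires when autograd reaches it on the way back, i.e. just before
# (or after) that module's own backward. A minimal standalone version of the
# "pre-backward" half, assuming nothing beyond stock PyTorch:
import torch


class _PreBackwardHook(torch.autograd.Function):
    @staticmethod
    def forward(ctx, callback, tensor):
        ctx.callback = callback
        # Detaching makes this Function the first node met during backward.
        return tensor.detach()

    @staticmethod
    def backward(ctx, grad_output):
        ctx.callback()              # e.g. re-gather partitioned parameters
        return None, grad_output    # one gradient slot per forward input


x = torch.randn(4, requires_grad=True)
y = _PreBackwardHook.apply(lambda: print("pre-backward callback"), x * 2)
y.sum().backward()                  # prints the message, then fills x.grad
# ---------------------------------------------------------------------------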
- if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") - self.optimizer = init_optimizer - self.defaults = init_optimizer.defaults - - # Load pre-built or JIT compile (un)flatten ops - util_ops = UtilsBuilder().load() - self.flatten = util_ops.flatten - self.unflatten = util_ops.unflatten - self.dtype = dtype - - if not all(is_zero_param(p) for p in module.parameters()): - ds_config = { - "train_micro_batch_size_per_gpu": 1, - "gradient_accumulation_steps": 1, - "zero_optimization": { - "offload_param": offload_param_config, - "offload_optimizer": offload_optimizer_config, - }, - "aio": aio_config - } - - if offload_param_config is not None: - remote_device = offload_param_config['device'] - else: - remote_device = None - - if offload_optimizer_config is not None: - pin_memory = offload_optimizer_config.get(OFFLOAD_OPTIMIZER_PIN_MEMORY, False) - else: - pin_memory = False - - group = None - if gpc.is_initialized(ParallelMode.DATA): - group = gpc.get_group(ParallelMode.DATA) - Init(module=module, data_parallel_group=group, dtype=self.dtype, - remote_device=remote_device, config_dict_or_path=ds_config, - pin_memory=pin_memory) - - for m in module.modules(): - _init_external_params(m) - - self.module = module - self.elastic_checkpoint = elastic_checkpoint - self.overlap_comm = overlap_comm - - # Replace ._parameters with a new class to enable auto-registration of - # external parameters - _inject_parameters(module, ZeROOrderedDict) - - if self.overlap_comm: - self.gpu_sum = torch.zeros(1, dtype=torch.float).cuda() - - ###################### offload optimizer setup ################################## - self.optimizer_swapper = None - self.swap_optimizer = False - - self.offload_optimizer = False - self.offload_optimizer_pin_memory = False - self.offload_optimizer_fast_init = False - if offload_optimizer_config is not None: - self.offload_optimizer = True - self.offload_optimizer_pin_memory = offload_optimizer_config[ - OFFLOAD_OPTIMIZER_PIN_MEMORY] - self.swap_optimizer = offload_optimizer_config[ - OFFLOAD_OPTIMIZER_DEVICE] == OFFLOAD_NVME_DEVICE - self.offload_optimizer_fast_init = offload_optimizer_config[ - OFFLOAD_OPTIMIZER_FAST_INIT] - - ###################### offload param setup ################################## - self.offload_param = False - self.offload_param_pin_memory = False - self.params_in_nvme_and_cpu = False - self.max_params_in_cpu = 0 - if offload_param_config is not None: - assert self.offload_optimizer, "parameter offload is only available with optimizer state offload" - self.offload_param = True - self.offload_param_pin_memory = offload_param_config[ - OFFLOAD_PARAM_PIN_MEMORY] - self.params_in_nvme_and_cpu = offload_param_config[ - OFFLOAD_PARAM_DEVICE] == OFFLOAD_NVME_DEVICE - self.max_params_in_cpu = offload_param_config[OFFLOAD_PARAM_MAX_IN_CPU] - if self.verbose: - print_rank_0( - f"FP16 params swapping is {self.params_in_nvme_and_cpu}, Max params in CPU is {self.max_params_in_cpu}", - force=False) - - self.deepspeed_adam_offload = (self.offload_optimizer - and type(init_optimizer) == DeepSpeedCPUAdam) - - self.device = torch.cuda.current_device( - ) if not self.offload_optimizer else OFFLOAD_CPU_DEVICE - ############################################################################ - - if self.verbose: - report_memory_usage("Before Partitioned Parameter Coordinator") - - fetch_stream = torch.cuda.Stream() if self.overlap_comm else None - self.param_coordinator = PartitionedParameterCoordinator( - comm_stream=fetch_stream, - 
max_reuse_distance_in_numel=int(max_reuse_distance), - max_available_parameters_in_numel=int(max_live_parameters)) - - if self.verbose: - report_memory_usage("After Partitioned Parameter Coordinator") - - # self.param_coordinator = PartitionedParameterCoordinator(comm_stream=torch.cuda.Stream()) - # -------------Stage 3 Setup-------------------# - # parameters smaller than the threshold will be collectively gathered at the - # end of the optimizer step and will be kept till the end of the backward pass - # TODO maybe worth just replicating these parameters and doing all reduce for them - self.persistence_threshold = int(param_persistence_threshold) - - self.persistent_parameters = self.persistent_parameters() - - self.setup_zero_stage3_hooks() - - # resetting ds_tensor just in case parameters have been changed after initialization - # example .half() or .to() - # self.reset_ds_tensor() - # ---------------------------------------------# - - self.timers = timers - - self.reduce_scatter = reduce_scatter - - self.dp_process_group = dp_process_group - - self.partition_count = dist.get_world_size(group=self.dp_process_group) - - if gpc.is_initialized(ParallelMode.TENSOR) is None: - self.model_parallel_group = None - self.model_parallel_rank = 0 - else: - self.model_parallel_group = gpc.get_group(ParallelMode.TENSOR) - self.model_parallel_rank = gpc.get_local_rank(ParallelMode.TENSOR) - - self.overflow = False - self.clip_grad = clip_grad - self.allreduce_always_fp32 = allreduce_always_fp32 - self.gradient_predivide_factor = gradient_predivide_factor - self.postscale_gradients = postscale_gradients - self.gradient_accumulation_steps = gradient_accumulation_steps - self.micro_step_id = INITIAL_MICRO_STEP_ID - - if self.reduce_scatter: - assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled" - - # Holds the mode parameter - # The param.data may not hold any meaningful data - # when param's status is NOT_AVAILABLE or IN_FLGHT - self.fp16_groups = [] - - # Hold partitioned parameters - self.fp16_partitioned_groups = [] - - # Holds a fused and flattened copy of the parameters - self.fp16_partitioned_groups_flat = [] - self.fp16_partitioned_groups_flat_numel = [] - - # defragmented pinned memory - self.param_groups_fp16_flat_cpu_memory = [] - - # a single 32-bit partition of the parallel partitioned parameters - # that this process will update - self.fp32_partitioned_groups_flat = [] - self.next_swappable_fp32_partitioned_groups = [] - - # number of elements per partition in each group - self.partition_size = [] - - self.all_reduce_print = False - - self.prefetch_elements = int(prefetch_bucket_size) - - # padding on each partition for alignment purposes - self.groups_padding = [] - - self.sub_group_size = sub_group_size - - self.sub_group_to_group_id = {} - - if self.verbose: - report_memory_usage("Before creating fp16 partitions") - self._create_fp16_partitions_with_defragmentation() - num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) - if self.verbose: - report_memory_usage( - f"After creating fp16 partitions: {num_fp16_subgroups}") - - # Optimizer ensor swapping - if self.swap_optimizer: - self._configure_tensor_swapping( - offload_optimizer_config, aio_config) - - 
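# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of this diff. The
# param_persistence_threshold handled above encodes "parameters this small are
# cheaper to keep resident than to re-partition and re-gather every time". The
# selection rule used later by persistent_parameters() amounts to a simple
# numel filter; a toy standalone illustration with an invented threshold:
import torch.nn as nn

model = nn.Sequential(nn.Linear(64, 64), nn.LayerNorm(64))
threshold = 1000   # hypothetical persistence threshold, in elements
persistent = [name for name, p in model.named_parameters() if p.numel() < threshold]
print(persistent)  # biases and LayerNorm affine params (64 elements each) stay
                   # resident; the 64x64 Linear weight keeps being partitioned.
# ---------------------------------------------------------------------------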
if self.verbose: - report_memory_usage("Before creating fp32 partitions") - self._create_fp32_partitions() - if self.verbose: - report_memory_usage("After creating fp32 partitions") - dist.barrier() - - # To support pipelined optimizer swapping - self._create_next_swappable_fp32_groups() - - if self.verbose: - report_memory_usage("Before initializing optimizer states") - self.initialize_optimizer_states() - if self.verbose: - report_memory_usage("After initializing optimizer states") - dist.barrier() - - if dist.get_rank() == 0 and self.verbose: - print(f"optimizer state initialized") - - self.reduce_bucket_size = int(reduce_bucket_size) - - self.reduction_event = torch.cuda.Event( - enable_timing=False, blocking=False) - - self.reduction_stream = torch.cuda.Stream( - ) if self.overlap_comm else torch.cuda.current_stream() - self.callback_queued = False - self.copy_grad_stream = torch.cuda.Stream() - - self.param_dict = {} - - # map between param_id and bool to specify if a param is in this partition - self.is_param_in_current_partition = {} - - self.contiguous_gradients = contiguous_gradients - self.extra_large_param_to_reduce = None - self.grads_in_ipg_bucket = [] - self.params_in_ipg_bucket = [] - self.elements_in_ipg_bucket = 0 - self.params_already_reduced = [] - self.is_gradient_accumulation_boundary = True - self._release_ipg_buffers() - self.previous_reduced_grads = None - - # simplified param id - self.param_id = {} - - count = 0 - for i, params_group in enumerate(self.fp16_groups): - for param in params_group: - unique_id = id(param) - self.param_id[unique_id] = count - self.param_dict[count] = param - self.params_already_reduced.append(False) - count = count + 1 - - # Largest partitioned param - largest_partitioned_param_numel = max([ - max([tensor.numel() for tensor in fp16_partitioned_group]) - for fp16_partitioned_group in self.fp16_partitioned_groups - ]) - if self.verbose: - print_rank_0( - f'Largest partitioned param numel = {largest_partitioned_param_numel}', - force=False) - - if self.verbose: - report_memory_usage(f"Before Set Grad positions") - - self.grad_position = {} - self.set_grad_positions() - if self.verbose: - report_memory_usage(f"Before CPU Offload initialization") - - self.grads_in_partition = None - - if self.offload_optimizer: - self.accumulated_grads_in_cpu = {} - self.norm_for_param_grads = {} - self.local_overflow = False - self.temp_grad_buffer_for_gpu_offload = torch.zeros( - largest_partitioned_param_numel, - device=torch.cuda.current_device(), - dtype=self.dtype) - self.temp_grad_gpu_buffer = torch.zeros(largest_partitioned_param_numel, - device=torch.cuda.current_device(), - dtype=self.dtype) - - if self.verbose: - report_memory_usage(f"After CPU Offload initialization") - - # stores if a partition has been reduced in this step - self.is_partition_reduced = {} - - # stores if a grad in a partition has been computed or not - self.is_grad_computed = {} - - # will store the averaged gradients required by this parititon - self.averaged_gradients = {} - - # creates backward hooks for gradient partitioning - self.create_reduce_and_remove_grad_hooks() - - # exit(0) - - # we may have a way of fusing dynamic scale. 
Do not support for now - if self.dtype == torch.float or not dynamic_loss_scale: - loss_scale_value = 1.0 if self.dtype == torch.float else static_loss_scale - - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=loss_scale_value) - cur_iter = 0 - else: - if dynamic_loss_args is None: - self.loss_scaler = DynamicLossScaler() - else: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - - self.dynamic_loss_scale = True - - self.debug_fp16_grads = [{} for _ in self.fp16_groups] - - if dist.get_rank(group=self.dp_process_group) == 0 and self.verbose: - report_memory_usage(f"After initializing ZeRO optimizer") - - def _configure_tensor_swapping(self, offload_optimizer_config, aio_config): - nvme_swap_folder = os.path.join( - offload_optimizer_config[OFFLOAD_OPTIMIZER_NVME_PATH], - 'zero_stage_3') - os.makedirs(nvme_swap_folder, exist_ok=True) - if torch.distributed.get_rank() == 0 and self.verbose: - print(f'Tensor Swapping: Adding optimizer tensors') - - swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config[ - OFFLOAD_OPTIMIZER_PIPELINE] else PartitionedOptimizerSwapper - - self.optimizer_swapper = swapper_type( - swap_config=offload_optimizer_config, - aio_config=aio_config, - base_folder=nvme_swap_folder, - optimizer=self.optimizer, - largest_numel=max(self.fp16_partitioned_groups_flat_numel), - device=self.device, - dtype=torch.float32, - timers=self.timers) - - def _create_fp16_partitions(self): - dist.barrier() - partition_id = dist.get_rank(group=self.dp_process_group) - - # loop to deal with groups - for j, param_group in enumerate(self.optimizer.param_groups): - - sub_groups = self._create_fp16_sub_groups(param_group['params']) - for sub_group in sub_groups: - i = len(self.fp16_groups) - - # push this group to list before modify - self.fp16_groups.append(sub_group) - self.sub_group_to_group_id[i] = j - - # These are the list of the partitioned parameters - self.fp16_partitioned_groups.append( - [param.ds_tensor for param in self.fp16_groups[i]]) - - if self.verbose: - print_rank_0( - f"fp16 group {i} partitioned_param norms : {[param.ds_tensor.norm().item() for param in self.fp16_groups[i]]}" - ) - - # Record padding required to align group to world size (only applies to last rank) - if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - padding = [p.padding_size() for p in self.fp16_groups[i]] - else: - padding = [0] * len(self.fp16_groups[i]) - self.groups_padding.append(padding) - - # not sure why apex was cloning the weights before flattening - # removing cloning here - if self.verbose: - report_memory_usage(f"Before Flattening param group {i}") - - if not self.offload_param: - if self.verbose: - report_memory_usage( - f"Before moving param group {i} to CPU") - # move all the parameters to cpu to free up GPU space for creating flat buffer - move_to_cpu(self.fp16_partitioned_groups[i]) - if self.verbose: - report_memory_usage( - f"After moving param group {i} to CPU") - - # create flat buffer in CPU and move to GPU - self.fp16_partitioned_groups_flat.append( - self.flatten_dense_tensors_aligned( - self.fp16_partitioned_groups[i], - dist.get_world_size(group=self.dp_process_group)).cuda( - torch.cuda.current_device())) - - if self.verbose: - report_memory_usage( - f"After flattening and moving param group {i} to GPU" - ) - else: - # Without the detach, report_memory_usage()lattening becomes part of the - # model graph causing errors downstream - self.fp16_partitioned_groups_flat.append( - 
self.flatten_dense_tensors_aligned( - self.fp16_partitioned_groups[i], - dist.get_world_size( - group=self.dp_process_group)).detach().pin_memory()) - - if self.verbose: - report_memory_usage(f"After Flattening param group {i}") - - # set model fp16 weight to slices of flattened buffer - updated_params = self.unflatten(self.fp16_partitioned_groups_flat[i], - self.fp16_partitioned_groups[i]) - - for partitioned_param, q in zip(self.fp16_partitioned_groups[i], updated_params): - partitioned_param.data = q.data - - def _move_to_flat_buffer(self, param_list, flat_buffer, avoid_copy=False): - '''If flat buffer is None then the parameters in the param_list are - not copied to the flat buffer. This is because they excede the number of max_params_in_cpu - Some of these parameters may aready be in CPU in unflattened buffers - or they maybe in GPU, or they maybe in NVME. If they are in NVME, then - they will be marked as NOT_AVAILABLE, and will be moved to CPU when they are - needed during training.''' - if flat_buffer is None: - # this dst buffer is on NVMe, so skip this - return - - start = 0 - for param in param_list: - src = param.ds_tensor - dest = flat_buffer.narrow(0, start, src.ds_numel) - start = start + src.ds_numel - '''if the parameter was initialized in nvme then bring it to the destination buffer directly''' - if src.status == PartitionedParamStatus.NOT_AVAILABLE: - if self.verbose: - print_rank_0( - f"Swapping in {param.ds_id} with partition size {param.ds_tensor.ds_numel} permanently to CPU" - ) - param.nvme_swapper.swap_into_buffer(param, dest) - src.data = dest.data - src.status = PartitionedParamStatus.AVAILABLE - else: - assert src.status == PartitionedParamStatus.AVAILABLE, "Partitioned Parm must be avialable here" - if not avoid_copy: - dest.data.copy_(src.data) - src.data = dest.data - - # Final location must be gpu/cpu in this case - param.ds_tensor.final_location = 'not-nvme' - - def _create_param_groups_fp16_flat_cpu_memory(self): - - aggregate_params_count = 0 - - for j, param_group in enumerate(self.optimizer.param_groups): - params_in_group = sum( - [p.ds_tensor.ds_numel for p in param_group['params']]) - - flat_buffer_size = params_in_group - - if self.params_in_nvme_and_cpu and \ - aggregate_params_count + params_in_group > self.max_params_in_cpu: - flat_buffer_size = max(0, - self.max_params_in_cpu - aggregate_params_count) - - aggregate_params_count += params_in_group - - if flat_buffer_size > 0: - if self.verbose: - print_rank_0(f"group {j} flat buffer size {flat_buffer_size}", - force=False) - self.param_groups_fp16_flat_cpu_memory.append( - torch.empty(int(flat_buffer_size), - dtype=self.dtype, - pin_memory=True)) - else: - if self.verbose: - print_rank_0( - f"No flat buffer size. 
Param group size was {params_in_group}", - force=False) - - self.param_groups_fp16_flat_cpu_memory.append( - torch.empty(1, - dtype=self.dtype)) - - def _create_fp16_partitions_with_defragmentation(self): - dist.barrier() - partition_id = dist.get_rank(group=self.dp_process_group) - create_fp16_flat_reuse_buffer = False - largest_partition_numel = [] - max_partition_numel = 0 - - # create a flat CPU memory allocation for each param group - if self.offload_param: - self._create_param_groups_fp16_flat_cpu_memory() - - # loop to deal with groups - for j, param_group in enumerate(self.optimizer.param_groups): - - sub_groups = self._create_fp16_sub_groups(param_group['params']) - - if self.verbose: - print_rank_0( - f'fp16 group {j} has {len(sub_groups)} subgroups', force=False) - - flat_offset = 0 - for sub_group in sub_groups: - i = len(self.fp16_groups) - - # push this group to list before modify - self.fp16_groups.append(sub_group) - self.sub_group_to_group_id[i] = j - - # comment out for zero_to_fp32 debug - # if torch.distributed.get_rank() == 0: - # for param in self.fp16_groups[i]: - # print(f"{debug_param2name_id_shape(param)} {param.ds_shape}") - - # These are the list of the partitioned parameters - self.fp16_partitioned_groups.append( - [param.ds_tensor for param in self.fp16_groups[i]]) - - total_elements = sum( - [t.ds_numel for t in self.fp16_partitioned_groups[i]]) - self.fp16_partitioned_groups_flat_numel.append(total_elements) - - if total_elements > max_partition_numel: - largest_partition_numel = [ - t.ds_numel for t in self.fp16_partitioned_groups[i] - ] - max_partition_numel = total_elements - - if self.verbose: - print_rank_0( - f"fp16 group {i} partitioned_param norms : {[param.ds_tensor.norm().item() for param in self.fp16_groups[i]]}" - ) - - # Record padding required to align group to world size (only applies to last rank) - if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - padding = [p.padding_size() for p in self.fp16_groups[i]] - else: - padding = [0] * len(self.fp16_groups[i]) - self.groups_padding.append(padding) - - # not sure why apex was cloning the weights before flattening - # removing cloning here - if self.verbose: - report_memory_usage( - f"Before Flattening param subgroup {i}") - - # all partitioned parameters remain in GPU during training - if not self.offload_param: - if self.verbose: - report_memory_usage( - f"Before moving param subgroup group {i} to CPU") - # move all the parameters to cpu to free up GPU space for creating flat buffer - move_to_cpu(self.fp16_partitioned_groups[i]) - if self.verbose: - report_memory_usage( - f"After moving param subgroup {i} to CPU") - - # create flat buffer in CPU and move to GPU - self.fp16_partitioned_groups_flat.append( - self.flatten_dense_tensors_aligned( - self.fp16_partitioned_groups[i], - 1).cuda(torch.cuda.current_device())) - if self.verbose: - report_memory_usage( - f"After flattening and moving param subgroup {i} to GPU") - - # all partitioned parameters are in CPU during training - else: - if self.verbose: - print_rank_0( - f"Params in nvme and cpu {self.params_in_nvme_and_cpu}") - # Flat buffer may not be available for parameters that reside in NVME - if not self.params_in_nvme_and_cpu or flat_offset + total_elements <= \ - self.param_groups_fp16_flat_cpu_memory[ - j].numel(): - fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[ - j].narrow(0, - flat_offset, - total_elements) - if self.verbose: - print_rank_0( - f"Creating a flat buffer for subgroup {i} 
requiring {total_elements} elements, and cumulative CPU elemets {flat_offset + total_elements}", - force=False) - # these parameters reside in NVME and - elif self.params_in_nvme_and_cpu: - fp16_partitioned_group_flat = None - if self.verbose: - print_rank_0( - f"No flat buffer for sub group {i} of {total_elements} elements", - force=False) - else: - assert False, "Either params are in nvme, or they are in CPU memory. This code path should not be triggered. Please report_memory_usage()ms_in_cpu and params_in_nvme configs" - - self.fp16_partitioned_groups_flat.append( - fp16_partitioned_group_flat) - flat_offset += total_elements - - # move param to flat buffer for both param offload on/off - self._move_to_flat_buffer(self.fp16_groups[i], - self.fp16_partitioned_groups_flat[i], - avoid_copy=not self.offload_param) - if self.verbose: - report_memory_usage(f"After Flattening param group {i}") - - # create a pinned memory to be used for swapping out params to NVME after optimizer step - if self.fp16_partitioned_groups_flat[-1] is None: - create_fp16_flat_reuse_buffer = True - - if self.verbose: - report_memory_usage(f"After Flattening param subgroup {i}") - - if create_fp16_flat_reuse_buffer: - assert len( - largest_partition_numel) > 0, f'Unexpected that largest partition is empty' - self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space( - largest_partition_numel) - - def _swap_in_sub_group_to_flat_buffer(self, flat_buffer, sub_group_id): - offset = 0 - elements_in_sub_group = sum( - [t.ds_numel for t in self.fp16_partitioned_groups[sub_group_id]]) - assert (flat_buffer.numel() == elements_in_sub_group) - for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): - dest = flat_buffer.narrow(0, offset, partitioned_param.ds_numel) - if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: - if self.verbose: - print_rank_0( - f"Swapping in {param.ds_id} with elements {param.ds_numel} and partition {param.ds_tensor.ds_numel}" - ) - param.nvme_swapper.swap_in([param], async_op=False) - dest.data.copy_(partitioned_param.data) - param.nvme_swapper.remove_partition_and_release_buffers([ - param]) - if self.verbose: - print_rank_0(f"Swapping in {param.ds_id} done") - else: - dest.data.copy_(partitioned_param.data) - offset += partitioned_param.ds_numel - - def _create_next_swappable_fp32_groups(self): - reverse_order_indices = [ - i for i in range(len(self.fp32_partitioned_groups_flat)) - ] - reverse_order_indices.reverse() - - next_group = None - for i in reverse_order_indices: - self.next_swappable_fp32_partitioned_groups.append(next_group) - if self._swappable_optimizer_subgroup(i): - next_group = self.fp32_partitioned_groups_flat[i] - - self.next_swappable_fp32_partitioned_groups.reverse() - - def _get_sub_group_partitions(self, sub_group_id): - sub_group_partitions = [] - for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): - if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: - swap_path = param.nvme_swapper.get_path(param, True) - sub_group_partitions.append((partitioned_param, - param.ds_tensor.ds_numel, - swap_path)) - else: - sub_group_partitions.append((partitioned_param, - partitioned_param.ds_numel, - None)) - - return sub_group_partitions - - def _create_fp32_partitions(self): - cpu_memory_usage = 0 - cpu_memory_sub_groups = 0 - nvme_memory_usage = 0 - num_swappable_partitions = 0 - num_swap_from_nvme_partitions = 0 - 
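# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of this diff. The flat
# buffer construction above boils down to: flatten a group of partitioned
# tensors into one contiguous buffer, then point each original tensor's .data
# at a slice of that buffer so no second copy is kept. In miniature:
import torch

parts = [torch.randn(3), torch.randn(5), torch.randn(2)]
flat = torch.cat([p.reshape(-1) for p in parts])   # one contiguous buffer
offset = 0
for p in parts:
    n = p.numel()
    p.data = flat.narrow(0, offset, n).view_as(p)  # alias into the flat buffer
    offset += n

assert parts[0].data_ptr() == flat.data_ptr()      # parts now share flat's storage
# ---------------------------------------------------------------------------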
num_swap_from_cpu_partitions = 0 - swap_from_nvme_memory_usage = 0 - swap_from_cpu_memory_usage = 0 - GIGA_BYTES = (1024 ** 3) - - swappable_fp32_tensors = [] - swappable_fp16_src_tensors = [] - nvme_fp16_partitions_info = [] - nvme_fp16_num_elems = [] - nvme_fp32_dest_tensors = [] - fp32_element_size = torch.tensor( - [], dtype=torch.float32).element_size() - - for i, tensor in enumerate(self.fp16_partitioned_groups_flat): - num_elements = self.fp16_partitioned_groups_flat_numel[i] - - # a partition of the fp32 master weights that will be updated by this process - if self._swappable_optimizer_subgroup(i): - self.fp32_partitioned_groups_flat.append(torch.Tensor()) - nvme_memory_usage += (fp32_element_size * num_elements) - num_swappable_partitions += 1 - - if self.params_in_nvme_and_cpu and tensor is None: - num_swap_from_nvme_partitions += 1 - swap_from_nvme_memory_usage += ( - fp32_element_size * num_elements) - if self.offload_optimizer_fast_init: - sub_group_partitions = self._get_sub_group_partitions( - i) - nvme_fp16_partitions_info.append(sub_group_partitions) - nvme_fp16_num_elems.append(num_elements) - nvme_fp32_dest_tensors.append( - self.fp32_partitioned_groups_flat[i]) - else: - unpinned_fp32_buffer = torch.empty(num_elements, - device=self.device, - dtype=torch.float) - self._swap_in_sub_group_to_flat_buffer( - unpinned_fp32_buffer, i) - self.optimizer_swapper.initialize_parameters( - parameters=[self.fp32_partitioned_groups_flat[i]], - src_tensors=[unpinned_fp32_buffer]) - else: - num_swap_from_cpu_partitions += 1 - swap_from_cpu_memory_usage += ( - fp32_element_size * num_elements) - swappable_fp32_tensors.append( - self.fp32_partitioned_groups_flat[i]) - swappable_fp16_src_tensors.append( - self.fp16_partitioned_groups_flat[i]) - else: - cpu_memory_usage += (fp32_element_size * num_elements) - cpu_memory_sub_groups += 1 - - if self.params_in_nvme_and_cpu and tensor is None: - unpinned_fp32_buffer = torch.empty(num_elements, - device=self.device, - dtype=torch.float) - self._swap_in_sub_group_to_flat_buffer( - unpinned_fp32_buffer, i) - self.fp32_partitioned_groups_flat.append( - unpinned_fp32_buffer) - else: - self.fp32_partitioned_groups_flat.append( - self.fp16_partitioned_groups_flat[i].to( - self.device).clone().float().detach()) - - self.fp32_partitioned_groups_flat[ - i].requires_grad = True # keep this in case internal optimizer uses it - - if len(swappable_fp32_tensors) > 0: - self.optimizer_swapper.initialize_parameters( - parameters=swappable_fp32_tensors, - src_tensors=swappable_fp16_src_tensors) - - if len(nvme_fp32_dest_tensors) > 0: - fp16_pinned_buffers = self.fp16_groups[0][ - 0].nvme_swapper.reserve_available_buffers() - assert len(fp16_pinned_buffers) > 0 - self.optimizer_swapper.initialize_from_swapped_fp16_params( - fp16_partitions_info=nvme_fp16_partitions_info, - fp16_num_elems=nvme_fp16_num_elems, - fp16_pinned_buffers=fp16_pinned_buffers, - fp32_parameters=nvme_fp32_dest_tensors) - self.fp16_groups[0][0].nvme_swapper.release_reserved_buffers() - - nvme_gigabytes = nvme_memory_usage / GIGA_BYTES - if self.verbose: - print_rank_0( - f'Swappable FP32 Partitions: count={num_swappable_partitions} size={nvme_gigabytes:5.2f} GB', - force=False) - if self.params_in_nvme_and_cpu: - if self.verbose: - print_rank_0( - f'Swap from NVMe Partitions: count = {num_swap_from_nvme_partitions}, size = {swap_from_nvme_memory_usage / GIGA_BYTES:5.2f}GB', - force=False) - print_rank_0( - f'Swap from CPU Partitions: count = {num_swap_from_cpu_partitions}, size = 
{swap_from_cpu_memory_usage / GIGA_BYTES:5.2f}GB', - force=False) - - cpu_memory_gigabytes = cpu_memory_usage / GIGA_BYTES - if self.verbose: - print_rank_0( - f'In-Memory FP32 Partitions: count={cpu_memory_sub_groups} size={cpu_memory_gigabytes:5.2f} GB', - force=False) - - # Clear for on-the-fly population before the optimizer step - for param_group in self.optimizer.param_groups: - param_group['params'] = [] - - def _create_fp16_sub_groups(self, params_group): - - params_group_numel = sum([param.partitioned_size() - for param in params_group]) - sub_group_size = self.sub_group_size - - if sub_group_size is None or sub_group_size >= params_group_numel: - return [params_group] - - sub_groups = [] - sub_group = [] - local_sub_group_size = 0 - for param in params_group: - - sub_group.append(param) - local_sub_group_size += param.partitioned_size() - - if local_sub_group_size >= sub_group_size or id(param) == id( - params_group[-1]): - sub_groups.append(sub_group) - - sub_group = [] - local_sub_group_size = 0 - - return sub_groups - - # def reset_ds_tensor(self): - # for name, param in self.module.named_parameters(recurse=True): - # assert hasattr(param,'ds_id'), "Parameters have not been converted to be Zero 3 compatible" - # assert (param.ds_status == ZeroParamStatus.NOT_AVAILABLE), "All the parameters must have been partitioned by now" - # param.ds_tensor.data = param.data - - def setup_zero_stage3_hooks(self): - self.hierarchy = 0 - self._register_hooks_recursively(self.module) - - # reset step at the beginning of forward - def _pre_forward_hook(module, *args): - self.param_coordinator.reset_step() - - # reset step if in inference mode - def _end_of_forward_hook(module, *args): - if not torch._C.is_grad_enabled(): - self.param_coordinator.reset_step() - - # likely one of them should be enough but just to be safe - self.module.register_forward_hook(_end_of_forward_hook) - self.module.register_forward_pre_hook(_pre_forward_hook) - - # Add top todule to stack trace - global FWD_MODULE_STACK - FWD_MODULE_STACK.append(self.module) - - def persistent_parameters(self): - persistent_params = [] - total_persistent_parameters = 0 - params_count = 0 - for _, param in self.module.named_parameters(recurse=True): - if param.ds_numel < self.persistence_threshold: - params_count += 1 - param.ds_persist = True - persistent_params.append(param) - total_persistent_parameters += param.ds_numel - - if self.verbose: - print_rank_0( - f"ZeRO 3: Total persistent parameters: {total_persistent_parameters} in {params_count} params", - force=False) - return persistent_params - - def _register_hooks_recursively(self, module, count=[0]): - my_count = count[0] - module.id = my_count - - # print(f"{module.__class__} : {module.id}") - - for child in module.children(): - count[0] = count[0] + 1 - self._register_hooks_recursively(child, count=count) - - def _pre_forward_module_hook(module, *args): - self.pre_sub_module_forward_function(module) - - def _post_forward_module_hook(module, input, output): - global FWD_MODULE_STACK - FWD_MODULE_STACK.pop() - if output is None: - output = [] - elif not isinstance(output, (list, tuple)): - if torch.is_tensor(output): - output = [output] - else: - # print(f'got UNKNOWN type {type(output)}') - outputs = [] - output = output if isinstance( - output, dict) else vars(output) - for name, val in output.items(): - if not name.startswith('__') and torch.is_tensor(val): - outputs.append(val) - output = outputs - # print(f'convert output to {output}') - - for item in filter(lambda item: 
is_zero_param(item), output): - if not any(id(item) in m._external_params for m in FWD_MODULE_STACK): - item.ds_active_sub_modules += 1 - module_to_register = FWD_MODULE_STACK[-1] - - if self.verbose: - print_rank_0( - f'Registering dangling parameter for module {module_to_register.__class__.__name__}.', - force=False) - register_external_parameter(module_to_register, item) - - # It's possible that the parameter was already external to the completed module. If so, remove it the - # registration as it will be covered by the outer module instead. - if id(item) in module._external_params: - if self.verbose: - print_rank_0( - f' Unregistering nested dangling parameter from module {module.__class__.__name__}', - force=False) - unregister_external_parameter(module, item) - - item.all_gather() - - self.post_sub_module_forward_function(module) - - def _pre_backward_module_hook(module, inputs, output): - def _run_before_backward_function(sub_module): - # some models (e.g. Albert) may run multiple forwards on the same layer in a loop - # before doing backwards, so each backward will need a pre-fetch - using reference - # counting to support this scenario - # print(f"COUNTER before: {sub_module.applied_pre_backward_ref_cnt}") - if sub_module.applied_pre_backward_ref_cnt > 0: - self.pre_sub_module_backward_function(sub_module) - sub_module.applied_pre_backward_ref_cnt -= 1 - # print(f"COUNTER after: {sub_module.applied_pre_backward_ref_cnt}") - - return _apply_to_tensors_only(module, - PreBackwardFunction, - _run_before_backward_function, - output) - - # This is an alternate to doing _post_backward_module_hook - # it uses tensor.register_hook instead of using torch.autograd.Function - def _alternate_post_backward_module_hook(module, inputs): - module.ds_grads_remaining = 0 - - # print(f"Before Forward {module.__class__.__name__}") - - def _run_after_backward_hook(*unused): - module.ds_grads_remaining = module.ds_grads_remaining - 1 - if module.ds_grads_remaining == 0: - # print(f"After backward {module.__class__.__name__}") - self.post_sub_module_backward_function(module) - - def _run_before_forward_function(input): - if input.requires_grad: - module.ds_grads_remaining += 1 - - return _apply_forward_and_backward_to_tensors_only( - module, - _run_before_forward_function, - _run_after_backward_hook, - inputs) - - def _post_backward_module_hook(module, inputs): - module.ds_grads_remaining = 0 - - def _run_after_backward_function(sub_module): - if sub_module.ds_grads_remaining == 0: - self.post_sub_module_backward_function(sub_module) - - return _apply_to_tensors_only(module, - PostBackwardFunction, - _run_after_backward_function, - inputs) - - # Pre forward hook - module.register_forward_pre_hook(_pre_forward_module_hook) - # Post forward hook - module.register_forward_hook(_post_forward_module_hook) - - # Pre backward hook - module.register_forward_hook(_pre_backward_module_hook) - - # post backward hook - module.register_forward_pre_hook(_post_backward_module_hook) - - def pre_sub_module_forward_function(self, sub_module): - if self.verbose: - report_memory_usage( - f"Before sub module function {sub_module.__class__.__name__}") - - global FWD_MODULE_STACK - FWD_MODULE_STACK.append(sub_module) - - self.param_coordinator.record_trace(sub_module) - - self.param_coordinator.fetch_sub_module(sub_module) - if self.verbose: - report_memory_usage( - f"Before sub module function {sub_module.__class__.__name__} after fetch") - - self.param_coordinator.prefetch_next_sub_modules( - sub_module, - 
numel=self.prefetch_elements, - nvme=self.params_in_nvme_and_cpu) - if self.verbose: - report_memory_usage( - f"Before sub module function {sub_module.__class__.__name__} after prefetch") - - self.param_coordinator.increment_step(sub_module) - - def post_sub_module_forward_function(self, sub_module): - if self.verbose: - report_memory_usage( - f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release") - - self.param_coordinator.release_sub_module(sub_module) - if self.verbose: - report_memory_usage( - f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release") - - def pre_sub_module_backward_function(self, sub_module): - self.param_coordinator.record_trace(sub_module) - - self.param_coordinator.fetch_sub_module(sub_module) - - self.param_coordinator.prefetch_next_sub_modules(sub_module, - numel=self.prefetch_elements) - - self.param_coordinator.increment_step(sub_module) - - def post_sub_module_backward_function(self, sub_module): - if self.verbose: - report_memory_usage( - f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release") - self.param_coordinator.release_sub_module(sub_module) - - if self.verbose: - report_memory_usage( - f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} after release") - - def _release_ipg_buffers(self): - if self.contiguous_gradients: - self.ipg_buffer = None - if not self.offload_optimizer and self.is_gradient_accumulation_boundary: - self.grads_in_partition = None - - self.grads_in_partition_offset = 0 - - def _optimizer_step(self, sub_group_id): - param_group_id = self.sub_group_to_group_id[sub_group_id] - fp32_param = self.fp32_partitioned_groups_flat[sub_group_id] - fp16_param = self.fp16_partitioned_groups_flat[sub_group_id] - self.optimizer.param_groups[param_group_id]['params'] = [fp32_param] - - self.optimizer.step() - self.optimizer.param_groups[param_group_id]['params'] = [] - - def _swappable_optimizer_subgroup(self, sub_group_id): - if not self.swap_optimizer: - return False - - return self.optimizer_swapper.swappable_tensor( - None, - numel=self.fp16_partitioned_groups_flat_numel[sub_group_id]) - - def _partitioned_params_swap_out(self, i): - offset = 0 - fp32_param = self.fp32_partitioned_groups_flat[i] - assert fp32_param is not None, \ - f'fp32 parameters of sub_group {i} is None' - - swap_fp16_params = [] - swap_fp32_params = [] - for param, partitioned_param in zip(self.fp16_groups[i], self.fp16_partitioned_groups[i]): - src = fp32_param.narrow(0, offset, partitioned_param.ds_numel) - if partitioned_param.status == PartitionedParamStatus.AVAILABLE: - partitioned_param.data.copy_(src.data) - else: - swap_fp32_params.append(src) - swap_fp16_params.append(param) - offset += partitioned_param.ds_numel - - if len(swap_fp16_params): - swap_fp16_params[0].nvme_swapper.swap_out_partitioned_params( - dst_fp16_params=swap_fp16_params, - src_fp32_params=swap_fp32_params) - - def initialize_optimizer_states(self): - num_subgroups = len(self.fp16_groups) - - largest_numel = max( - [sum([p.ds_numel for p in psg]) for psg in self.fp16_partitioned_groups]) - gradient_dtype = self.fp32_partitioned_groups_flat[0].dtype - gradient_buffer = torch.zeros(int(largest_numel), - dtype=gradient_dtype, - device=self.device) - - timers = self.timers - timer_names = set() - - if self.swap_optimizer: - self.optimizer_swapper.init_timers() - - INIT_OPTIMIZER_TIMER = 'init_optimizer_state' - timer_names.add(INIT_OPTIMIZER_TIMER) - 
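# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of this diff. The hook
# registration above reduces to "gather a submodule's parameters right before
# its forward and release them right after". A minimal standalone version with
# plain PyTorch module hooks, where the prints stand in for the real
# fetch_sub_module() / release_sub_module() calls:
import torch
import torch.nn as nn

def _pre_forward(module, inputs):
    print(f"gather params of {module.__class__.__name__}")

def _post_forward(module, inputs, output):
    print(f"release params of {module.__class__.__name__}")

layer = nn.Linear(8, 8)
layer.register_forward_pre_hook(_pre_forward)
layer.register_forward_hook(_post_forward)
layer(torch.randn(2, 8))   # prints "gather ..." then "release ..."
# ---------------------------------------------------------------------------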
self.start_timers([INIT_OPTIMIZER_TIMER]) - - for i, group in enumerate(self.fp16_groups): - swappable_optimizer_subgroup = self._swappable_optimizer_subgroup( - i) - swappable_param_subgroup = self.fp16_partitioned_groups_flat[i] is None - - num_elements = int(self.fp16_partitioned_groups_flat_numel[i]) - - if self.verbose: - report_memory_usage( - f'[Begin] Initialize optimizer states {i} / {num_subgroups} subgroups, num_elems: {num_elements}, swappable opt/param:{swappable_optimizer_subgroup}/{swappable_param_subgroup}') - - if swappable_optimizer_subgroup: - self._optimizer_states_and_gradient_swap_in(i, timer_names) - - if self.offload_optimizer and not swappable_optimizer_subgroup: - subgroup_gradient_buffer = torch.zeros(num_elements, - dtype=gradient_dtype, - device=self.device) - if self.offload_optimizer_pin_memory: - subgroup_gradient_buffer = subgroup_gradient_buffer.pin_memory() - - self.fp32_partitioned_groups_flat[i].grad = subgroup_gradient_buffer - else: - self.fp32_partitioned_groups_flat[i].grad = gradient_buffer.narrow( - 0, - 0, - num_elements) - - self._optimizer_step(i) - - if swappable_param_subgroup: - self._partitioned_params_swap_out(i) - - if swappable_optimizer_subgroup: - self._optimizer_states_and_gradient_swap_out(i, timer_names) - - if self.verbose: - report_memory_usage( - f'[End] Initialize optimizer states {i} / {num_subgroups} subgroups, num_elems: {num_elements}, swappable opt/param:{swappable_optimizer_subgroup}/{swappable_param_subgroup}') - - self.stop_timers([INIT_OPTIMIZER_TIMER]) - self.log_timers(timer_names) - - if self.swap_optimizer: - self.optimizer_swapper.log_timers() - - if not self.offload_optimizer: - for group in self.fp32_partitioned_groups_flat: - group.grad = None - - # Reset steps - return - - ######################################################################### - #########################ZeRO Partition Gradients######################## - ######################################################################### - - def get_first_param_index(self, group_id, param_group, partition_id): - for index, param in enumerate(param_group): - param_id = self.get_param_id(param) - if partition_id in self.param_to_partition_ids[group_id][param_id]: - return index - return None - - def initialize_gradient_partitioning_data_structures(self): - - total_partitions = dist.get_world_size(group=self.dp_process_group) - - for i, param_group in enumerate(self.fp16_groups): - - self.param_to_partition_ids[i] = {} - self.is_partition_reduced[i] = {} - self.total_grads_in_partition[i] = {} - self.remaining_grads_in_partition[i] = {} - self.is_grad_computed[i] = {} - self.grad_partition_insertion_offset[i] = {} - self.grad_start_offset[i] = {} - self.first_param_index_in_partition[i] = {} - - for partition_id in range(total_partitions): - self.is_grad_computed[i][partition_id] = {} - self.grad_partition_insertion_offset[i][partition_id] = {} - self.grad_start_offset[i][partition_id] = {} - self.initialize_gradient_partition( - i, param_group, partition_id) - self.is_partition_reduced[i][partition_id] = False - self.first_param_index_in_partition[i][ - partition_id] = self.get_first_param_index( - i, - param_group, - partition_id) - - def independent_gradient_partition_epilogue(self): - if self.verbose: - self.report_ipg_memory_usage( - f"In ipg_epilogue before reduce_ipg_grads", 0) - self.reduce_ipg_grads() - if self.verbose: - self.report_ipg_memory_usage( - f"In ipg_epilogue after reduce_ipg_grads", 0) - - if self.overlap_comm: - 
self.reduction_stream.synchronize() - - with torch.cuda.stream(self.reduction_stream): - self.partition_previous_reduced_grads() - - # if dist.get_rank() == 0: - # print()("Params already reduced %s", self.params_already_reduced) - for i in range(len(self.params_already_reduced)): - self.params_already_reduced[i] = False - - # in case of cpu offload, averaged gradients are already in fp32_partitioned_groups_flat.grad - # TODO: use a similar code path for both cpu_offload and non-cpu offload - if not self.offload_optimizer: - for i, sub_group in enumerate(self.fp16_groups): - self.averaged_gradients[i] = [ - torch.zeros_like(param.ds_tensor) if param.grad is None else - param.grad.data.narrow(0, - 0, - param.ds_tensor.numel()) - for param in sub_group - ] - # self.averaged_gradients[i] = self.get_flat_partition( - # self.fp16_groups[i], - # 0, - # self.fp32_partitioned_groups_flat[i].numel(), - # return_tensor_list=True) - - self._release_ipg_buffers() - - if self.verbose: - report_memory_usage(f"End ipg_epilogue") - - # resets all partition to no reduced - # sets remianing grads to the total number of grads in each partition - # set is grad computed to false for all grads in partition - def reset_partition_gradient_structures(self): - total_partitions = dist.get_world_size(group=self.dp_process_group) - for i, _ in enumerate(self.fp16_groups): - for partition_id in range(total_partitions): - self.is_partition_reduced[i][partition_id] = False - self.remaining_grads_in_partition[i][ - partition_id] = self.total_grads_in_partition[i][partition_id] - - for param_id in self.is_grad_computed[i][partition_id]: - self.is_grad_computed[i][partition_id][param_id] = False - - def initialize_gradient_partition(self, i, param_group, partition_id): - def set_key_value_list(dictionary, key, value): - if key in dictionary: - dictionary[key].append(value) - else: - dictionary[key] = [value] - - def increment_value(dictionary, key): - if key in dictionary: - dictionary[key] += 1 - else: - dictionary[key] = 1 - - partition_size = self.partition_size[i] - - start_index = partition_size * partition_id - end_index = partition_size * (partition_id + 1) - - current_index = 0 - first_offset = 0 - - for param in param_group: - - param_size = param.numel() - param_id = self.get_param_id(param) - - if (current_index >= start_index and current_index < end_index): - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) - increment_value(self.total_grads_in_partition[i], partition_id) - - self.is_grad_computed[i][partition_id][param_id] = False - - self.grad_partition_insertion_offset[i][partition_id][ - param_id] = current_index - start_index - self.grad_start_offset[i][partition_id][param_id] = 0 - - elif start_index > current_index and start_index < (current_index + - param_size): - assert ( - first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" - first_offset = start_index - current_index - - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) - increment_value(self.total_grads_in_partition[i], partition_id) - - self.is_grad_computed[i][partition_id][param_id] = False - - self.grad_partition_insertion_offset[i][partition_id][param_id] = 0 - self.grad_start_offset[i][partition_id][param_id] = first_offset - - current_index = current_index + param_size - - def overlapping_partition_gradients_reduce_epilogue(self): - self.independent_gradient_partition_epilogue() - self.zero_grad() - - def 
create_reduce_and_remove_grad_hooks(self): - if self.verbose: - print_rank_0(f'[Begin] Create gradient reduction hooks') - self.grad_accs = [] - for i, param_group in enumerate(self.fp16_groups): - for param in param_group: - if param.requires_grad: - # print_rank_0(f" Before all gather {param.device}, {param.shape}") - - # The hook must be created in un-partitioned parameter - param.all_gather() - - # print(f"After all gather {param.device}, {param.shape}") - def wrapper(param, i): - param_tmp = param.expand_as(param) - grad_acc = param_tmp.grad_fn.next_functions[0][0] - - def reduce_partition_and_remove_grads(*notneeded): - self.reduce_ready_partitions_and_remove_grads( - param, i) - - grad_acc.register_hook( - reduce_partition_and_remove_grads) - self.grad_accs.append(grad_acc) - - # print(f"param grad fn {param.expand_as(param).grad_fn}") - wrapper(param, i) - - # Partition the parameter after creating the hook - param.partition() - if self.verbose: - print_rank_0(f'[End] Create gradient reduction hooks') - - def get_param_id(self, param): - unique_id = id(param) - return self.param_id[unique_id] - - def report_ipg_memory_usage(self, tag, param_elems): - elem_count = self.elements_in_ipg_bucket + param_elems - percent_of_bucket_size = ( - 100.0 * elem_count) // self.reduce_bucket_size - report_memory_usage( - f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}") - - ###############Idependent Partition Gradient ######################## - def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): - # print_rank_0(f"Inside reduce ipg buckets. {debug_param2name_id_shape(param)}, ipg elements {self.elements_in_ipg_bucket}, reduce bucket size {self.reduce_bucket_size}", force=True) - - # Because the ipg bucket is initialized with a random place holder tensor, we must - # explicitly check that the bucket has any real data in it (self.elements_in_ipg_bucket > - # 0). Otherwise if the incoming param.ds_numel is large, this branch may get triggered on a - # garbage data and `self.average_tensor()` will crash because its params_to_reduce will be - # empty, while reduction_list will have that garbage data. - if self.elements_in_ipg_bucket > 0 and self.elements_in_ipg_bucket + param.ds_numel > self.reduce_bucket_size: - if self.verbose: - self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", - param.ds_numel) - - self.reduce_ipg_grads() - - if self.contiguous_gradients and self.overlap_comm: - # Swap ipg_index between 0 and 1 - self.ipg_index = 1 - self.ipg_index - if self.verbose: - self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", - param.ds_numel) - - param_id = self.get_param_id(param) - assert self.params_already_reduced[param_id] == False, \ - f"The parameter {param_id} has already been reduced. \ - Gradient computed twice for this partition. 
\ - Multiple gradient reduction is currently not supported" - - # keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening - if param.ds_numel > self.reduce_bucket_size: - self.extra_large_param_to_reduce = param - - elif self.contiguous_gradients: - # print_rank_0("before new grad tensor move") - new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow( - 0, - self.elements_in_ipg_bucket, - param.ds_numel) - # print_rank_0("after new grad tensor move") - new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) - - self.elements_in_ipg_bucket += param.ds_numel - self.grads_in_ipg_bucket.append(param.grad) - self.params_in_ipg_bucket.append((i, param, param_id)) - if self.verbose: - self.report_ipg_memory_usage("End ipg_remove_grads", 0) - - def gradient_reduction_w_predivide(self, tensor): - dp_world_size = dist.get_world_size(group=self.dp_process_group) - - tensor_to_allreduce = tensor - - if self.allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - if self.postscale_gradients: - if self.gradient_predivide_factor != 1.0: - tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor) - - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - - if self.gradient_predivide_factor != dp_world_size: - tensor_to_allreduce.mul_( - self.gradient_predivide_factor / dp_world_size) - else: - tensor_to_allreduce.div_(dp_world_size) - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - - if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: - tensor.copy_(tensor_to_allreduce) - - return tensor - - def average_tensor(self, tensors, params_to_reduce): - with torch.cuda.stream(self.reduction_stream): - if not self.reduce_scatter: - for tensor in tensors: - self.gradient_reduction_w_predivide(tensor) - return - - for tensor in tensors: - tensor.div_(dist.get_world_size(group=self.dp_process_group)) - - # reduction resulting with each rank only holding the gradient partition it owns - # This could either be a reduce scatter or a reduce op depending on how - # parameters are partitionied. 
The method is implemented by the - # DeepSpeed param extensions to the pytorch parameter, so its up to - # the extension to define what happens here - params_to_reduce[0].reduce_gradients_at_owner( - param_list=params_to_reduce, - hierarchy=self.param_coordinator.hierarchy) - - def set_grad_positions(self): - for i, group in enumerate(self.fp16_groups): - current_offset = 0 - for param in group: - param_id = self.get_param_id(param) - num_elements = param.ds_tensor.ds_numel - - self.grad_position[param_id] = [ - int(i), - int(current_offset), - int(num_elements) - ] - # print(f"param id {param_id} i:{i}, ds_tensor {num_elements} numel {param.numel()}") - current_offset += num_elements - - def async_accumulate_grad_in_cpu_via_gpu(self, param, acc_grad_cpu_partition): - - # copy to a preexisiting buffer to avoid memory allocation penalty - dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( - 0, - 0, - param.ds_tensor.ds_numel) - - if self.micro_step_id > 0: - dest_buffer.copy_( - acc_grad_cpu_partition.view(-1), non_blocking=True) - param.grad.data.view(-1).add_(dest_buffer) - - # at the boundary we will send 32bit directly - if not self.is_gradient_accumulation_boundary: - acc_grad_cpu_partition.data.copy_(param.grad.data.view(-1), - non_blocking=True) - - def _constant_buffered_norm2(self, input, buffer_size=250000000): - norm = None - for part in input.view(-1).split(buffer_size): - if norm is None: - norm = part.data.double().norm(2) ** 2.0 - else: - norm += part.data.double().norm(2) ** 2.0 - return norm ** 0.5 - - def set_norm_for_param_grad_in_gpu(self, param): - param_id = self.get_param_id(param) - # self.norm_for_param_grads[param_id] = param.grad.data.double().norm(2) - # Using a more memory efficient version - self.norm_for_param_grads[param_id] = self._constant_buffered_norm2( - param.grad) - - def update_overflow_tracker_for_param_grad(self, param): - # Credit to our user David Minn - if param.grad is not None: - if self.overlap_comm: - self.gpu_sum = self.gpu_sum + param.grad.data.float().sum() - elif self._has_inf_or_nan(param.grad.data): - self.local_overflow = True - - def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param, fp32_grad_tensor): - with torch.cuda.stream(self.copy_grad_stream): - param_id = self.get_param_id(param) - src_tensor = param.grad.view(-1).float() - # print(f"src_tensor {src_tensor.size()} and fp32 grad {fp32_grad_tensor.size()}") - fp32_grad_tensor.copy_(src_tensor, non_blocking=True) - param.grad = None - - def complete_grad_norm_calculation_for_cpu_offload(self, params): - total_norm = 0.0 - norm_type = 2.0 - for p in params: - if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_id = self.get_param_id(p) - if param_id in self.norm_for_param_grads.keys(): - param_norm = self.norm_for_param_grads[param_id] - total_norm += param_norm.item() ** 2 - - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) - - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) - - total_norm = total_norm_cuda[0].item() ** (1. 
/ norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - def partition_previous_reduced_grads(self): - if not self.previous_reduced_grads: - return - - if self.offload_optimizer: - allocate_grads_in_partition = self.grads_in_partition is None \ - and self.gradient_accumulation_steps > 1 - else: - allocate_grads_in_partition = self.grads_in_partition is None - - if allocate_grads_in_partition: - self.grads_in_partition = [] - - for i, group in enumerate(self.fp16_groups): - total_size = 0 - for param_in_partition in group: - total_size += param_in_partition.ds_tensor.ds_numel - - if self.verbose: - report_memory_usage( - f"group {i} before creating {total_size} reduced gradients into partition") - if self.offload_param_pin_memory: - self.grads_in_partition.append( - torch.zeros(int(total_size), - dtype=self.dtype, - device=self.device).pin_memory()) - else: - self.grads_in_partition.append( - torch.zeros(int(total_size), - dtype=self.dtype, - device=self.device)) - if self.verbose: - report_memory_usage( - f"group {i} after creating {total_size} reduced gradients into partition") - - if self.offload_optimizer: - offload_fp32_gradients = {} - offload_fp32_offsets = {} - - with torch.cuda.stream(self.copy_grad_stream): - self.reduction_stream.synchronize() - for param in self.previous_reduced_grads: - - [i, - dest_offset, - num_elements] = self.grad_position[self.get_param_id(param)] - - if self.offload_optimizer: - param.partition_gradients( - partition_buffers=self.temp_grad_gpu_buffer) - # with torch.cuda.stream(self.copy_grad_stream): - # self.reduction_stream.synchronize() - - if self.gradient_accumulation_steps > 1: - # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer - fp16_grad_tensor = self.grads_in_partition[i].narrow( - 0, - dest_offset, - num_elements) - self.async_accumulate_grad_in_cpu_via_gpu( - param, - fp16_grad_tensor) - - if self.is_gradient_accumulation_boundary: - - self.set_norm_for_param_grad_in_gpu(param) - - self.update_overflow_tracker_for_param_grad(param) - - if self._swappable_optimizer_subgroup(i): - if not i in offload_fp32_gradients.keys(): - offload_fp32_gradients[i] = [] - offload_fp32_offsets[i] = [] - - offload_fp32_gradients[i].append( - param.grad.view(-1).float()) - param.grad = None - offload_fp32_offsets[i].append(dest_offset) - else: - fp32_grad_tensor = self.fp32_partitioned_groups_flat[ - i].grad.narrow(0, - dest_offset, - num_elements) - - self.async_inplace_copy_grad_to_fp32_buffer_from_gpu( - param, - fp32_grad_tensor) - else: - # The allreduce buffer will be rewritted. 
Copy the gradients in partition to a new buffer - fp16_grad_tensor = self.grads_in_partition[i].narrow( - 0, - dest_offset, - num_elements) - param.partition_gradients( - partition_buffers=fp16_grad_tensor, - accumulate=True if self.micro_step_id > 0 else False) - - if self.offload_optimizer and self.swap_optimizer: - for i in offload_fp32_gradients.keys(): - self.optimizer_swapper.swap_out_gradients( - parameter=self.fp32_partitioned_groups_flat[i], - gradient_offsets=offload_fp32_offsets[i], - gradient_tensors=offload_fp32_gradients[i]) - - self.previous_reduced_grads = [] - - def reduce_ipg_grads(self, extra_param=None): - if self.overlap_comm: - self.reduction_stream.synchronize() - - with torch.cuda.stream(self.reduction_stream): - self.partition_previous_reduced_grads() - - params_to_reduce = [param for i, param, - param_id in self.params_in_ipg_bucket] - # print(f"Params in ipg bucket {self.params_in_ipg_bucket}") - # print(f"Reducing {[(debug_param2name_id_shape(param), param.grad) for param in params_to_reduce]}") - # exit(0) - if self.contiguous_gradients: - reduction_list = [self.ipg_buffer[self.ipg_index]] - if self.extra_large_param_to_reduce is not None: - reduction_list.append(self.extra_large_param_to_reduce.grad) - self.extra_large_param_to_reduce = None - self.average_tensor(reduction_list, params_to_reduce) - else: - self.buffered_reduce_fallback( - None, - self.grads_in_ipg_bucket, - elements_per_buffer=self.elements_in_ipg_bucket) - - for _, param, param_id in self.params_in_ipg_bucket: - self.params_already_reduced[param_id] = True - - self.previous_reduced_grads = params_to_reduce - - self.grads_in_ipg_bucket = [] - self.params_in_ipg_bucket = [] - self.elements_in_ipg_bucket = 0 - ##################################################################### - - def reduce_ready_partitions_and_remove_grads(self, param, i): - # print_rank_0(f"Backward {debug_param2name_id_shape(param)}", force=True) - self.reduce_independent_p_g_buckets_and_remove_grads(param, i) - - def zero_reduced_gradients(self, partition_id, i): - def are_all_related_partitions_reduced(params_id): - for partition_id in self.param_to_partition_ids[i][params_id]: - if not self.is_partition_reduced[i][partition_id]: - return False - return True - - for params_id in self.is_grad_computed[i][partition_id]: - if are_all_related_partitions_reduced(params_id): - self.param_dict[params_id].grad = None - - def flatten_and_print(self, message, tensors, start=0, n=5): - flatten_tensor = self.flatten(tensors) - - def print_func(): - print(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) - - self.sequential_execution(print_func, message) - - def get_grads_to_reduce(self, i, partition_id): - def get_reducable_portion(key): - grad = self.param_dict[key].grad - total_elements = grad.numel() - start = self.grad_start_offset[i][partition_id][key] - num_elements = min( - total_elements - start, - self.partition_size[i] - - self.grad_partition_insertion_offset[i][partition_id][key]) - if not pg_correctness_test: - if num_elements == total_elements: - return grad - else: - return grad.contiguous().view(-1).narrow(0, - int(start), - int(num_elements)) - else: - if num_elements == total_elements: - return grad.clone() - else: - return grad.clone().contiguous().view(-1).narrow( - 0, - int(start), - int(num_elements)) - - grads_to_reduce = [] - for key in self.is_grad_computed[i][partition_id]: - grad = get_reducable_portion(key) - grads_to_reduce.append(grad) - return grads_to_reduce - - def sequential_execution(self, 
function, message, group=None): - if group is None: - group = self.dp_process_group - if dist.get_rank(group=group) == 0: - print(message) - for id in range(dist.get_world_size(group=group)): - if id == dist.get_rank(group=group): - function() - dist.barrier(group=group) - - def set_none_gradients_to_zero(self, i, partition_id): - for param_id in self.is_grad_computed[i][partition_id]: - param = self.param_dict[param_id] - if param.grad is None: - param.grad = torch.zero_like(param) - - ######################Reduction Related Methods############################## - - def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None): - rank = None - tensor = self.flatten(bucket) - - tensor_to_allreduce = tensor - - if pg_correctness_test: - allreduce_always_fp32 = True - - if allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - tensor_to_allreduce.div_( - dist.get_world_size(group=self.dp_process_group)) - - if rank is None: - # "All Reducing" - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - else: - global_rank = _get_global_rank(self.dp_process_group, rank) - dist.reduce(tensor_to_allreduce, global_rank, - group=self.dp_process_group) - - if allreduce_always_fp32 and tensor is not tensor_to_allreduce: - if rank is None or rank == dist.get_rank(group=self.dp_process_group): - tensor.copy_(tensor_to_allreduce) - - return tensor - - # if rank is specified do a reduction instead of an allreduce - def allreduce_and_copy(self, small_bucket, rank=None, log=None): - with torch.cuda.stream(self.reduction_stream): - allreduced = self.allreduce_bucket( - small_bucket, rank=rank, log=log) - if rank is None or rank == dist.get_rank(group=self.dp_process_group): - for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): - buf.copy_(synced) - - def allreduce_no_retain(self, - bucket, - numel_per_bucket=500000000, - rank=None, - log=None): - small_bucket = [] - numel = 0 - for tensor in bucket: - small_bucket.append(tensor) - numel = numel + tensor.numel() - if numel > numel_per_bucket: - self.allreduce_and_copy(small_bucket, rank=rank, log=None) - small_bucket = [] - if len(small_bucket) > 0: - self.allreduce_and_copy(small_bucket, rank=rank, log=log) - - # allows using reduction of gradients instead of using all_reduce - def buffered_reduce_fallback(self, - rank, - grads, - elements_per_buffer=500000000, - log=None): - split_buckets = split_half_float_double(grads) - - for i, bucket in enumerate(split_buckets): - self.allreduce_no_retain(bucket, - numel_per_bucket=elements_per_buffer, - rank=rank, - log=log) - - ############################################################################# - ############################################################################# - ############################################################################# - - # views the tensor as multiple partitions and returns - # those partitions - def get_data_parallel_partitions(self, tensor): - partitions = [] - - dp = dist.get_world_size(group=self.dp_process_group) - dp_id = dist.get_rank(group=self.dp_process_group) - - total_num_elements = tensor.numel() - - base_size = total_num_elements // dp - remaining = total_num_elements % dp - - start = 0 - for id in range(dp): - partition_size = base_size - if id < remaining: - partition_size = partition_size + 1 - partitions.append(tensor.narrow(0, start, partition_size)) - start = start + partition_size - return partitions - - def get_partition_info(self, tensor_list, partition_size, partition_id): - 
params_in_partition = [] - params_not_in_partition = [] - - start_index = partition_size * partition_id - end_index = partition_size * (partition_id + 1) - - current_index = 0 - first_offset = 0 - - for tensor in tensor_list: - - tensor_size = tensor.numel() - - if (current_index >= start_index and current_index < end_index): - params_in_partition.append(tensor) - - elif start_index > current_index and start_index < (current_index + - tensor_size): - params_in_partition.append(tensor) - - assert ( - first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" - first_offset = start_index - current_index - - else: - params_not_in_partition.append(tensor) - - current_index = current_index + tensor_size - - return params_in_partition, params_not_in_partition, first_offset - - def zero_grad(self, set_grads_to_None=True): - """ - Zero FP16 parameter grads. - """ - # FP32 grad should never exist. - # For speed, set model fp16 grad to None by default - for group in self.fp16_groups: - for p in group: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - def _model_parallel_all_reduce(self, tensor, op): - """ Perform all reduce within model parallel group, if any. - """ - if self.model_parallel_group is None: - pass - else: - torch.distributed.all_reduce(tensor=tensor, - op=op, - group=self.model_parallel_group) - - def clip_grad_norm(self, *args, **kwargs): - # dummy function to retain the same function interface - # as ColossalaiOptimizer for compatibility - pass - - def get_grad_norm_direct(self, gradients, params, norm_type=2): - """Clips gradient norm of an iterable of parameters. - - This is adapted from ``torch.nn.utils.clip_grad.clip_grad_norm_`` and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - norm_type = float(norm_type) - if norm_type == inf: - total_norm = max(g.data.abs().max() for g in gradients) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) - - # Take max across all GPUs. - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.MAX) - total_norm = total_norm_cuda[0].item() - else: - total_norm = 0.0 - # if dist.get_rank() == 0: - # print()(f"Total Norm begining {total_norm}") - for g, p in zip(gradients, params): - if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_norm = g.data.double().norm(2) - total_norm += param_norm.item() ** 2 - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) - - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) - - total_norm = total_norm_cuda[0].item() ** (1. 
/ norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - # creates a flat fused tensor from the tensor list starting at the first_offset - # in the first tensor of the list. If there are not enough elements in the tensor - # list then the flat tensor will be padded with zeros - def get_flat_partition(self, - tensor_list, - first_offset, - partition_size, - return_tensor_list=False): - flat_tensor_list = [] - current_size = 0 - for i, tensor in enumerate(tensor_list): - if tensor.grad is None: - tensor.grad = torch.zeros_like(tensor) - - tensor = tensor.grad - num_elements = tensor.numel() - tensor_offset = 0 - - # we need to offset to get to the right element - if i == 0 and first_offset > 0: - tensor_offset = first_offset - num_elements = num_elements - tensor_offset - - # we dont need all elements of the tensor - if num_elements > (partition_size - current_size): - num_elements = partition_size - current_size - - # we need a narrow view of the tensor based on the tensor offset and number of elements that - # we need from this tensor - if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements))) - else: - flat_tensor_list.append(tensor) - - current_size = current_size + num_elements - - # this means its the last partition and does not align with the dp boundary. We need to pad before flattening - if current_size < partition_size: - flat_tensor_list.append( - torch.zeros(int(partition_size - current_size), - dtype=tensor_list[0].dtype, - device=tensor_list[0].device)) - - if return_tensor_list: - return flat_tensor_list - - return self.flatten(flat_tensor_list) - - def free_grad_in_param_list(self, param_list): - for p in param_list: - p.grad = None - - def reset_cpu_buffers(self): - self.norm_for_param_grads = {} - self.local_overflow = False - - def log_timers(self, timer_names): - if self.timers is None: - return - - self.timers.log(names=list(timer_names)) - - def start_timers(self, timer_names): - if self.timers is None: - return - - for name in timer_names: - self.timers(name).start() - - def stop_timers(self, timer_names): - if self.timers is None: - return - - for name in timer_names: - self.timers(name).stop() - - def _pre_step(self): - self.micro_step_id = INITIAL_MICRO_STEP_ID - - if self.verbose: - print_rank_0(f"Inside Step function") - report_memory_usage(f"In step before checking overflow") - print_rank_0("Finished Tracing at Beginning of Step") - self.param_coordinator.hierarchy = 0 - self.param_coordinator.finish_tracing(print_trace=True) - - self.param_coordinator.reset_step() - - if self.verbose: - print_rank_0("Finished Tracing at Beginning of Step") - - def _get_norm_groups(self): - norm_groups = [] - for i, group in enumerate(self.fp16_groups): - if self.offload_optimizer: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.fp16_groups[i])) - else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.fp16_groups[i])) - return norm_groups - - def _prepare_fp32_grad_for_sub_group(self, sub_group_id): - partition_id = dist.get_rank(group=self.dp_process_group) - - single_grad_partition = self.flatten(self.averaged_gradients[sub_group_id]).to( - self.fp32_partitioned_groups_flat[sub_group_id].dtype) - - assert single_grad_partition.numel() == self.fp32_partitioned_groups_flat[sub_group_id].numel(), \ - 
"averaged gradients have different number of elements that partition size {} {} {} {}".format( - single_grad_partition.numel( - ), self.fp32_partitioned_groups_flat[sub_group_id].numel(), sub_group_id, - partition_id) - - self.fp32_partitioned_groups_flat[sub_group_id].grad = single_grad_partition - - # release all the gradient since we have already created a necessary copy in dp_grad_partition - self.zero_grad() - - self.averaged_gradients[sub_group_id] = None - - def _prepare_sub_group(self, sub_group_id, timer_names=set()): - if self.verbose: - report_memory_usage( - f'Before prepare optimizer sub group {sub_group_id}') - if self._swappable_optimizer_subgroup(sub_group_id): - self._optimizer_states_and_gradient_swap_in( - sub_group_id, timer_names) - elif not self.offload_optimizer: - self._prepare_fp32_grad_for_sub_group(sub_group_id) - if self.verbose: - report_memory_usage( - f'After prepare optimizer sub group {sub_group_id}') - - def _optimizer_states_and_gradient_swap_in(self, sub_group_id, timer_names=set()): - param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] - fp32_param_id = id(self.fp32_partitioned_groups_flat[sub_group_id]) - assert self._swappable_optimizer_subgroup(sub_group_id), \ - f'Parameter {fp32_param_id} of numel={param_length} is not swappable' - - OPTIMIZER_SWAP_IN_STATE = 'optimizer_swap_in_state' - if self.verbose: - report_memory_usage( - f'pre-step Before swapping in optimizer tensors {sub_group_id}') - self.start_timers([OPTIMIZER_SWAP_IN_STATE]) - - self.optimizer_swapper.swap_in_optimizer_state( - parameter=self.fp32_partitioned_groups_flat[sub_group_id], - async_parameter=self.next_swappable_fp32_partitioned_groups[sub_group_id]) - - self.stop_timers([OPTIMIZER_SWAP_IN_STATE]) - timer_names.add(OPTIMIZER_SWAP_IN_STATE) - if self.verbose: - report_memory_usage( - f'pre-step After swapping in optimizer tensors {sub_group_id}') - - def _release_sub_group(self, sub_group_id, timer_names=set()): - if self.verbose: - report_memory_usage( - f'Before release optimizer sub group {sub_group_id}') - # get rid of the fp32 gradients. 
Not needed anymore - if not self.offload_optimizer: - self.fp32_partitioned_groups_flat[sub_group_id].grad = None - - if self._swappable_optimizer_subgroup(sub_group_id): - self._optimizer_states_and_gradient_swap_out( - sub_group_id, timer_names) - if self.verbose: - report_memory_usage( - f'After release optimizer sub group {sub_group_id}') - - # create a flat tensor aligned at the alignment boundary - def flatten_dense_tensors_aligned(self, tensor_list, alignment): - num_elements = 0 - for tens in tensor_list: - num_elements = num_elements + tens.numel() - - remaining = num_elements % alignment - - if remaining: - elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - padded_tensor_list = tensor_list + [pad_tensor] - - num_elements = num_elements + elements_to_add - else: - padded_tensor_list = tensor_list - - return self.flatten(padded_tensor_list) - - def _optimizer_states_and_gradient_swap_out(self, sub_group_id, timer_names=set()): - param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] - fp32_param_id = id(self.fp32_partitioned_groups_flat[sub_group_id]) - assert self._swappable_optimizer_subgroup(sub_group_id), \ - f'Parameter {fp32_param_id} of numel={param_length} is not swappable' - - OPTIMIZER_SWAP_OUT_STATE = 'optimizer_swap_out_state' - if self.verbose: - report_memory_usage( - f'post-step Before swapping out optimizer tensors {sub_group_id}') - self.start_timers([OPTIMIZER_SWAP_OUT_STATE]) - - self.optimizer_swapper.swap_out_optimizer_state( - parameter=self.fp32_partitioned_groups_flat[sub_group_id], - async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] is - not None) - - self.stop_timers([OPTIMIZER_SWAP_OUT_STATE]) - if self.verbose: - report_memory_usage( - f'post-step After swapping out optimizer tensors {sub_group_id}') - timer_names.add(OPTIMIZER_SWAP_OUT_STATE) - - # get rid of the fp32 gradients. Not needed anymore - self.fp32_partitioned_groups_flat[sub_group_id].grad = None - - def _unflatten_partitioned_parameters(self, sub_group_id): - updated_params = self.unflatten(self.fp16_partitioned_groups_flat[sub_group_id], - self.fp16_partitioned_groups[sub_group_id]) - - for partitioned_param, q in zip(self.fp16_partitioned_groups[sub_group_id], updated_params): - partitioned_param.data = q.data - - def _overflow_clean_up(self, prev_scale): - if self.verbose: - report_memory_usage('After overflow before clearing gradients') - self.zero_grad() - - if self.offload_optimizer: - self.reset_cpu_buffers() - else: - self.averaged_gradients = {} - - if self.verbose: - report_memory_usage('After overflow after clearing gradients') - - if torch.distributed.get_rank() == 0: - print( - "[deepscale] OVERFLOW! Rank {} Skipping step. 
Attempted loss scale: {}, " - "reducing to {}".format(dist.get_rank(), - prev_scale, - self.loss_scale)) - - def _overflow_check_and_loss_scale_update(self): - - # First compute norm for all group so we know if there is overflow - self.check_overflow() - - # loss scaling related computation - prev_scale = self.loss_scale - self._update_scale(self.overflow) - - if self.overflow: - self._overflow_clean_up(prev_scale) - - return self.overflow - - def _post_step(self, timer_names=set()): - if self.offload_optimizer: - self.reset_cpu_buffers() - - # Gathering persisting parameters - if len(self.persistent_parameters) > 0: - self.persistent_parameters[0].all_gather( - self.persistent_parameters) - - if self.swap_optimizer: - self.optimizer_swapper.log_timers() - - self.log_timers(timer_names) - - if self.verbose: - report_memory_usage('After zero_optimizer step') - print_rank_0( - f"------------------Finishing Step-----------------------") - - def _reassign_or_swap_out_partitioned_parameters(self, sub_group_id): - if self.fp16_partitioned_groups_flat[sub_group_id] is not None: - self.fp16_partitioned_groups_flat[sub_group_id].data.copy_( - self.fp32_partitioned_groups_flat[sub_group_id].data) - - # unflatten fp16 parameter subgroup - self._unflatten_partitioned_parameters(sub_group_id) - else: - self._partitioned_params_swap_out(sub_group_id) - - def allreduce_gradients(self): - self.overlapping_partition_gradients_reduce_epilogue() - - def step(self, closure=None): - """ - Not supporting closure. - """ - self._pre_step() - - # checks for overflow, adjust the loss scale accordingly - if self._overflow_check_and_loss_scale_update(): - if self.swap_optimizer: - self.optimizer_swapper.log_timers() - return - - norm_groups = self._get_norm_groups() - - timer_names = set() - - timer_names.add('optimizer_step') - self.start_timers(['optimizer_step']) - - # update parameters one sub group at a time - for sub_group_id, group in enumerate(self.fp16_groups): - # prepare optimizer states, gradients and fp32 parameters for update - self._prepare_sub_group(sub_group_id, timer_names) - - # scale the fp32 gradients - self.unscale_and_clip_grads(sub_group_id, norm_groups) - - # apply the optimizer step on the sub group and copy fp32 parameters to fp16 - self._optimizer_step(sub_group_id) - - # put fp16 parameters in appropriate location - self._reassign_or_swap_out_partitioned_parameters(sub_group_id) - - # release memory or swap out optimizer states of fp32 parameters - self._release_sub_group(sub_group_id, timer_names) - - self.stop_timers(['optimizer_step']) - - self._post_step(timer_names) - return - - def dump_pre_step_gradients(self, debug_fp32_grads): - # Dump gradient norms for debbuging - for i, _ in enumerate(self.fp16_groups): - if self.verbose: - print( - f'Pre-Step Dump Norms for Group {i} FP16P, FP16G, FP32G, FP32GUC') - for fp16_param, fp32_grad in zip(self.fp16_groups[i], debug_fp32_grads[i]): - param_id = self.get_param_id(fp16_param) - fp16_grad_norm = self.debug_fp16_grads[i][param_id] - - fp32_grad_norm = [float(t.data.float().norm(2)) - for t in fp32_grad] - norm_list = [fp16_grad_norm, fp32_grad_norm] - if self.verbose: - print(f'Pre-Step Norms {i} {param_id} = {norm_list}') - - def dump_post_step_gradients(self): - # Dump gradient norms for debbuging - for i, group in enumerate(self.fp16_groups): - if self.verbose: - print( - f'Post-Step Dump Norms for Group {i} FP16P, FP16DS, FP16FLAT, FP32FLAT') - unflat_fp16 = self.unflatten( - self.fp16_groups_flat[i], self.fp16_groups[i]) - 
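# Illustrative sketch (not part of the original file): the flatten/unflatten pair used
# throughout this optimizer round-trips a list of tensors through one contiguous buffer,
# optionally zero-padded to an alignment boundary (see flatten_dense_tensors_aligned above).
# This assumes self.flatten / self.unflatten are torch's private dense-tensor helpers or
# equivalents; the diff does not show how they are bound, so treat this as an illustration.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def flatten_aligned(tensors, alignment):
    # Zero-pad so the flat buffer's length is a multiple of `alignment`
    # (e.g. the data-parallel world size), mirroring flatten_dense_tensors_aligned.
    numel = sum(t.numel() for t in tensors)
    remainder = numel % alignment
    if remainder:
        pad = torch.zeros(alignment - remainder,
                          dtype=tensors[0].dtype, device=tensors[0].device)
        tensors = list(tensors) + [pad]
    return _flatten_dense_tensors(tensors)

if __name__ == '__main__':
    params = [torch.randn(3), torch.randn(5)]
    flat = flatten_aligned(params, alignment=5)        # 8 elements -> padded to 10
    views = _unflatten_dense_tensors(flat, params)     # views over the first 8 elements
    for p, v in zip(params, views):
        assert torch.equal(p, v)                       # the round trip preserves every tensor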
unflat_fp32 = self.unflatten(self.fp32_partitioned_groups_flat[i], - self.fp16_groups[i]) - for j, p in enumerate(self.fp16_groups[i]): - param_id = self.get_param_id(p) - param_norm = float(p.data.float().norm(2)) - ds_norm = float(p.ds_tensor.data.float().norm(2)) - - unflat_norm = [ - float(t.data.float().norm(2)) - for t in [unflat_fp16[j], - unflat_fp32[j]] - ] - norm_list = [param_norm, ds_norm] + unflat_norm - if self.verbose: - print(f'Post-Step Norms {i} {param_id} = {norm_list}') - - def unscale_and_clip_grads(self, sub_group_id, norm_groups): - - grad_groups_flat = [ - self.fp32_partitioned_groups_flat[sub_group_id].grad] - - total_norm = 0.0 - for norm in norm_groups: - total_norm += norm ** 2.0 - total_norm = math.sqrt(total_norm) - - # compute combined scale factor for this group - combined_scale = self.loss_scale - if self.clip_grad > 0.: - # norm is in fact norm*scale - clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad - if clip > 1: - combined_scale = clip * self.loss_scale - - for grad in grad_groups_flat: - if isinstance(grad, list): - sub_partitions = grad - for g in sub_partitions: - g.data.mul_(1. / combined_scale) - else: - grad.data.mul_(1. / combined_scale) - - def _check_overflow(self, partition_gradients=True): - self.overflow = self.has_overflow(partition_gradients) - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params, is_grad_list=False): - for p in params: - if p.grad is not None and self._has_inf_or_nan(p.grad.data): - return True - - return False - - def has_overflow_partitioned_grads_serial(self): - for i in range(len(self.fp16_groups)): - for j, grad in enumerate(self.averaged_gradients[i]): - if grad is not None and self._has_inf_or_nan(grad.data, j): - return True - return False - - def has_overflow(self, partition_gradients=True): - if partition_gradients: - if self.overlap_comm: - self.local_overflow = self._has_inf_or_nan(self.gpu_sum) - self.gpu_sum = torch.zeros(1, dtype=torch.float).cuda() - - overflow = self.local_overflow if self.offload_optimizer else self.has_overflow_partitioned_grads_serial( - ) - # overflow = self.has_overflow_partitioned_grads_serial() - overflow_gpu = torch.cuda.ByteTensor([overflow]) - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) - - else: - params = [] - for group in self.fp16_groups: - for param in group: - params.append(param) - - overflow = self.has_overflow_serial( - params, is_grad_list=partition_gradients) - overflow_gpu = torch.cuda.ByteTensor([overflow]) - - # Since each model parallel GPU carries only part of the model, - # make sure overflow flag is synced across all the model parallel GPUs - self._model_parallel_all_reduce(tensor=overflow_gpu, - op=torch.distributed.ReduceOp.MAX) - - overflow = overflow_gpu[0].item() - return bool(overflow) - - # `x` is a torch.Tensor - @staticmethod - def _has_inf_or_nan(x, j=None): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. 
- if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - def backward(self, loss, retain_graph=False): - """ - :attr:`backward` performs the following steps: - - 1. fp32_loss = loss.float() - 2. scaled_loss = fp32_loss*loss_scale - 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves - """ - self.micro_step_id += 1 - if self.verbose: - print_rank_0( - f"Total fully available parameters {self.param_coordinator.total_available_parameter_numel}" - ) - - if self.swap_optimizer: - self.optimizer_swapper.pre_backward() - - if self.verbose: - report_memory_usage(f"Before backward") - - if self.contiguous_gradients: - self.ipg_buffer = [] - buf_0 = torch.empty(self.reduce_bucket_size, - dtype=self.dtype, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_0) - - # Use double buffers to avoid data access conflict when overlap_comm is enabled. - if self.overlap_comm: - buf_1 = torch.empty(self.reduce_bucket_size, - dtype=self.dtype, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_1) - self.ipg_index = 0 - - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - '''Partitioning Parameters that were not partitioned - Usually if parameters of modules whose input parameters do not require - grad computation do not trigger post call and will therefore will remain unpartitioned ''' - self._partition_all_parameters() - - if self.swap_optimizer: - self.optimizer_swapper.post_backward() - - def _partition_all_parameters(self): - for name, param in self.module.named_parameters(recurse=True): - self.param_coordinator.release_and_reset_parameter(param) - - def check_overflow(self, partition_gradients=True): - self._check_overflow(partition_gradients) - - def _update_scale(self, has_overflow=False): - self.loss_scaler.update_scale(has_overflow) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - def _get_loss_scale(self): - return self.loss_scaler.loss_scale - - def _set_loss_scale(self, value): - self.loss_scaler.cur_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - cur_scale = property(_get_loss_scale, _set_loss_scale) - - def _get_lean_tensors(self, padded_flattened_tensor, group_tensors, paddings): - # Remove paddings from flattened tensor - individual_tensors = self.unflatten( - padded_flattened_tensor, group_tensors) - lean_lengths = [t.numel() - pad for t, - pad in zip(group_tensors, paddings)] - lean_tensors = [t[:len] - for t, len in zip(individual_tensors, lean_lengths)] - # print()(f'rank {dist.get_rank()}: lean_tensors = {[t.numel() for t in lean_tensors]}') - return lean_tensors - - # TODO REVISIT this for stage 3 - def get_lean_optimizer_state(self): - # Return optimizer states 
after removing paddings. - # This method assumes that each param group contains a single flattened tensor. - optimizer_groups_state = [] - - for i, group in enumerate(self.optimizer.param_groups): - p = group['params'][0] - lean_state = {} - for key, value in self.optimizer.state[p].items(): - if torch.is_tensor(value): - padded_lens = [t.numel() - for t in self.fp16_partitioned_groups[i]] - lean_state[key] = self._get_lean_tensors( - value, - self.fp16_partitioned_groups[i], - self.groups_padding[i]) - lean_flat_len = sum([t.numel() for t in lean_state[key]]) - else: - lean_state[key] = value - - optimizer_groups_state.append(lean_state) - - return optimizer_groups_state - - def get_groups_without_padding(self, groups_with_padding): - # Return group tensor after removing paddings added for alignment to DP world size. - groups_without_padding = [] - for i, group in enumerate(groups_with_padding): - lean_group = self._get_lean_tensors(group, - self.fp16_partitioned_groups[i], - self.groups_padding[i]) - groups_without_padding.append(lean_group) - - return groups_without_padding - - def _set_fp32_optimizer_param_groups(self): - for sub_group_id, _ in enumerate(self.fp16_groups): - param_group_id = self.sub_group_to_group_id[sub_group_id] - self.optimizer.param_groups[param_group_id]['params'].append( - self.fp32_partitioned_groups_flat[sub_group_id]) - - def _clear_fp32_optimizer_param_groups(self): - for param_group in self.optimizer.param_groups: - param_group['params'] = [] - - def _rigid_state_dict(self): - state_dict = {} - state_dict['zero_stage'] = ZERO_OPTIMIZATION_WEIGHTS - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict['partition_count'] = self.partition_count - - self._set_fp32_optimizer_param_groups() - state_dict['optimizer_state_dict'] = self.optimizer.state_dict() - state_dict['fp32_flat_groups'] = self.fp32_partitioned_groups_flat - self._clear_fp32_optimizer_param_groups() - - return state_dict - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. - - Example:: - - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - if self.elastic_checkpoint: - raise NotImplementedError( - "ZeRO-3 does not yet support elastic checkpointing, please disable for now." - ) - - if self.swap_optimizer or self.params_in_nvme_and_cpu: - raise NotImplementedError( - "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." - ) - - return self._rigid_state_dict() - - # Restore base optimizer fp32 weights from checkpoint by: - # 1) Merging fp32 weights from checkpoints of all partitions - # 2) Extracting fp32 weights for current partition from merged weights - # 3) Using extracted weights to update base optimizer weights directly. 
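# Illustrative sketch (not part of the original file): the three steps listed above amount
# to concatenating the per-rank fp32 slices saved in every checkpoint shard back into one
# flat tensor, cutting out the slice the current rank owns, and copying it into the live
# fp32 partition. The real code also handles alignment padding and uneven splits via
# _get_flattened_partition / get_data_parallel_partitions; this even-split version with
# hypothetical names is a simplification.
import torch

def restore_local_partition(saved_shards, rank, world_size):
    # 1) merge: saved_shards[i] is the flat fp32 slice that rank i wrote at save time
    merged = torch.cat(saved_shards)
    # 2) extract: split evenly for the current world size and take this rank's slice
    per_rank = merged.numel() // world_size
    return merged.narrow(0, rank * per_rank, per_rank)

if __name__ == '__main__':
    full = torch.arange(8, dtype=torch.float32)
    shards = list(full.chunk(2))               # as if a 2-rank job wrote the checkpoint
    local = restore_local_partition(shards, rank=1, world_size=2)
    live_fp32 = torch.zeros(4)
    live_fp32.copy_(local)                     # 3) update the live fp32 partition in place
    print(live_fp32)                           # tensor([4., 5., 6., 7.])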
- - def _restore_from_fp32_weights(self, all_state_dict): - - flat_local_partition = [] - for i in range(len(self.fp32_partitioned_groups_flat)): - merged_partitions = [sd['fp32_groups'][i] for sd in all_state_dict] - flat_local_partition.append( - self._get_flattened_partition(merged_partitions)) - - for current, saved in zip(self.fp32_partitioned_groups_flat, flat_local_partition): - current.data.copy_(saved.data) - - # Restore base optimizer fp32 weights from ZeRO fp16 weights - def _restore_from_fp16_weights(self): - for fp16_partitions, fp32_partition in zip(self.fp16_partitioned_groups_flat, - self.fp32_partitioned_groups_flat): - fp32_partition.data.copy_(fp16_partitions.data) - - # Refresh the fp32 master params from the fp16 copies. - def refresh_fp32_params(self): - self._restore_from_fp16_weights() - - # Extract flattened partion for current rank from all partitions - def _get_flattened_partition(self, all_partition_states): - partition_id = dist.get_rank(group=self.dp_process_group) - alignment = dist.get_world_size(group=self.dp_process_group) - - param_partitions = [[] for _ in range(len(all_partition_states[0]))] - for i, partition in enumerate(all_partition_states): - for j, param in enumerate(partition): - param_partitions[j].append(param) - - local_state_partitions = [] - for param_index, param_slices in enumerate(param_partitions): - flattened_merged_tensor = self.flatten_dense_tensors_aligned( - param_slices, - alignment) - new_partitions = self.get_data_parallel_partitions( - flattened_merged_tensor) - local_state_partitions.append(new_partitions[partition_id]) - - if torch.is_tensor(local_state_partitions[0]): - return self.flatten_dense_tensors_aligned(local_state_partitions, alignment) - - # Assume non-tensor states are not partitioned and equal across ranks, so return first one - return local_state_partitions[0] - - # Restore base optimizer state from checkpoint by - # 1) Merging optimizer state from checkpoints of all partitions - # 2) Extracting optimizer state for current partition from the merged state - # 3) Using the extracted value to directly update the base optimizer. - def _restore_base_optimizer_state(self, all_state_dict): - base_optimizer_group_states = [] - for i in range(len(self.optimizer.param_groups)): - partition_states = {} - all_partition_group_states = [ - sd['base_optimizer_state'][i] for sd in all_state_dict - ] - for key in all_partition_group_states[0].keys(): - all_partition_states = [ - all_states[key] for all_states in all_partition_group_states - ] - partition_states[key] = self._get_flattened_partition( - all_partition_states) - base_optimizer_group_states.append(partition_states) - - for i, group in enumerate(self.optimizer.param_groups): - p = group['params'][0] - for key, saved in base_optimizer_group_states[i].items(): - if torch.is_tensor(self.optimizer.state[p][key]): - self.optimizer.state[p][key].data.copy_(saved.data) - else: - self.optimizer.state[p][key] = saved - - def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True): - # I think it should actually be ok to reload the optimizer before the model. 
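# Illustrative sketch (not part of the original file): the post-step parameter reassignment
# earlier in this file and the rigid checkpoint load below follow the same master-weight
# pattern: copy the flat fp32 partition into its flat fp16 counterpart, then hand views of
# that buffer back to the individual parameters. A minimal stand-alone version, assuming
# torch's private dense-tensor helpers; the function name is illustrative.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def sync_fp16_from_fp32(fp32_flat, fp16_flat, fp16_params):
    fp16_flat.copy_(fp32_flat)          # down-cast copy: fp32 master -> flat fp16 buffer
    views = _unflatten_dense_tensors(fp16_flat, fp16_params)
    for p, view in zip(fp16_params, views):
        p.data = view.data              # params now read from the refreshed flat buffer

if __name__ == '__main__':
    params = [torch.zeros(2, dtype=torch.float16), torch.zeros(3, dtype=torch.float16)]
    fp16_flat = _flatten_dense_tensors(params)
    fp32_flat = torch.arange(5, dtype=torch.float32)   # stand-in for updated master weights
    sync_fp16_from_fp32(fp32_flat, fp16_flat, params)
    print([p.tolist() for p in params])                # [[0.0, 1.0], [2.0, 3.0, 4.0]]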
- self.loss_scaler = state_dict['loss_scaler'] - self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] - self.overflow = state_dict['overflow'] - - if load_optimizer_states: - self._set_fp32_optimizer_param_groups() - self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) - self._clear_fp32_optimizer_param_groups() - - # restore fp32 partitions - for curr_param, saved_param in zip(self.fp32_partitioned_groups_flat, state_dict['fp32_flat_groups']): - curr_param.data.copy_(saved_param.data) - - # restore fp16 partitions from fp32 - for sub_group_id in range(len(self.fp32_partitioned_groups_flat)): - fp32_param = self.fp32_partitioned_groups_flat[sub_group_id] - fp16_param = self.fp16_partitioned_groups_flat[sub_group_id] - fp16_param.data.copy_(fp32_param.data) - - # update fp16 unflattened params - for sub_group_id in range(len(self.fp16_partitioned_groups_flat)): - updated_params = self.unflatten( - self.fp16_partitioned_groups_flat[sub_group_id], - self.fp16_partitioned_groups[sub_group_id]) - - for partitioned_param, q in zip(self.fp16_partitioned_groups[sub_group_id], updated_params): - partitioned_param.data = q.data - - # TODO: Support different/changing load/save DP degree. - def load_state_dict(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): - r"""Loading a ZeRO checkpoint - - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - - Arguments: - state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition. - Note that the number of saved partitions may differ from number of loading partitions to support - changing GPU count, specifically DP world size, between saving and loading checkpoints. - load_optimizer_states: Boolean indicating whether or not to load base optimizer states - load_from_fp32_weights: Boolean indicating whether to initialize fp32 master weights from fp32 - copies in checkpoints (no precision loss) or from model's fp16 copies (with precision loss). - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - - if self.elastic_checkpoint: - raise NotImplementedError( - "ZeRO-3 does not yet support elastic checkpointing, please disable for now." - ) - - if self.swap_optimizer or self.params_in_nvme_and_cpu: - raise NotImplementedError( - "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." 
- ) - - self._rigid_load_state_dict( - state_dict_list[dist.get_rank(group=self.dp_process_group)], - load_optimizer_states=load_optimizer_states) - - if len(self.persistent_parameters) > 0: - self.persistent_parameters[0].partition(self.persistent_parameters) - self.persistent_parameters[0].all_gather( - self.persistent_parameters) - - def save_checkpoint_prologue(self): - self._partition_all_parameters() - - def save_checkpoint_epilogue(self): - if len(self.persistent_parameters) > 0: - self.persistent_parameters[0].all_gather( - self.persistent_parameters) - - -def _handle_overflow(cpu_sum, x, i): - import math - rank = torch.distributed.get_rank() - if rank == 0: - t_i = -1 - for v_i, v in enumerate(x.data.contiguous().view(-1)): - if not math.isfinite(float(v)): - t_i = v_i - break - print( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) - - -def estimate_zero3_model_states_mem_needs(total_params, - largest_layer_params, - num_gpus_per_node=1, - num_nodes=1, - cpu_offload=True, - cpu_offload_params=True, - zero_init=True, - additional_buffer_factor=1.5): - total_gpus = num_nodes * num_gpus_per_node - gpus_factor = 1 / num_nodes - largest_layer_memory = (4 * largest_layer_params) - - if cpu_offload: - if cpu_offload_params: - gpu_mem = largest_layer_memory - - if zero_init: - cpu_mem = total_params * 18 * gpus_factor * additional_buffer_factor - else: - cpu_mem = total_params * max(4 * num_gpus_per_node, - 18 * gpus_factor) * additional_buffer_factor - - else: - gpu_mem = largest_layer_memory + int(2 * total_params / total_gpus) - - if zero_init: - cpu_mem = total_params * 16 * gpus_factor * additional_buffer_factor - else: - cpu_mem = total_params * max(4 * num_gpus_per_node, - 16 * gpus_factor) * additional_buffer_factor - else: - gpu_mem = largest_layer_memory + int(18 * total_params / total_gpus) - if zero_init: - cpu_mem = largest_layer_params * 4 * num_gpus_per_node * additional_buffer_factor - else: - cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor - - return int(cpu_mem), int(gpu_mem), largest_layer_memory - - -def model_to_params(model): - # shared params calculated only once - total_params = sum( - dict((p.data_ptr(), - p.numel()) for p in model.parameters()).values()) - - largest_layer_params = 0 - for m in model.modules(): - # assuming no shared params within a single layer - layer_params = sum(p.numel() for p in m.parameters(recurse=False)) - largest_layer_params = max(largest_layer_params, layer_params) - - return total_params, largest_layer_params - - -def estimate_zero3_model_states_mem_needs_all_live(model, - num_gpus_per_node=1, - num_nodes=1, - additional_buffer_factor=1.5): - """ - Print out estimates on memory usage requirements for ZeRO 3 params, optim states and gradients - for a given ``model`` and hardware setup. - - If you have an actual model object, use this function and everything will be derived - automatically. - - If it's a hypothetical model, use ``estimate_zero3_model_states_mem_needs_all_cold`` where you have to pass - the ``total_params`` and ``largest_layer_params`` explicitly. 
- - Args: - - ``model``: ``nn.Module`` object - - ``num_gpus_per_node``: how many gpus per node (defaults to 1) - - ``num_nodes``: how many nodes (defaults to 1), - - ``additional_buffer_factor``: estimation factor (defaults to 1.5): - - """ - - total_params, largest_layer_params = model_to_params(model) - - estimate_zero3_model_states_mem_needs_all_cold( - total_params=total_params, - largest_layer_params=largest_layer_params, - num_gpus_per_node=num_gpus_per_node, - num_nodes=num_nodes, - additional_buffer_factor=additional_buffer_factor) - - -def estimate_zero3_model_states_mem_needs_all_cold(total_params, - largest_layer_params, - num_gpus_per_node=1, - num_nodes=1, - additional_buffer_factor=1.5): - """ - Print out estimates on memory usage requirements for ZeRO 3 params, optim states and gradients - for a given ``model`` and hardware setup. - - If it's a hypothetical model, use this function where you have to pass - the ``total_params`` and ``largest_layer_params`` explicitly. - - If you have an actual model object, use ``estimate_zero3_model_states_mem_needs_all_live`` and everything - will be derived automatically. - - Args: - - ``total_params``: total model params - - ``largest_layer_params``: largest layer's params - - ``num_gpus_per_node``: how many gpus per node (defaults to 1) - - ``num_nodes``: how many nodes (defaults to 1), - - ``additional_buffer_factor``: estimation factor (defaults to 1.5): - - """ - - def format_options(cpu_offload, cpu_offload_params, zero_init): - enabled = [] - enabled.append(f"cpu_offload={1 if cpu_offload else 0}") - enabled.append(f"cpu_offload_params={1 if cpu_offload_params else 0}") - enabled.append(f"zero_init={1 if zero_init else 0}") - return ", ".join(enabled) - - nodes_str = "nodes" if num_nodes > 1 else "node" - gpus_str = "GPUs" if num_gpus_per_node > 1 else "GPU" - print( - "Estimated memory needed for params, optim states and gradients for a:\n" - f"HW: Setup with {num_nodes} {nodes_str}, {num_gpus_per_node} {gpus_str} per node.\n" - f"SW: Model with {int(total_params / 1e6)}M total params, {int(largest_layer_params / 1e6)}M largest layer params." 
- ) - print(" per CPU | per GPU | Options") - for cpu_offload in [True, False]: - for cpu_offload_params in [True, False]: - if not cpu_offload and cpu_offload_params: - continue - for zero_init in [True, False]: - cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs( - total_params=total_params, - largest_layer_params=largest_layer_params, - num_gpus_per_node=num_gpus_per_node, - num_nodes=num_nodes, - cpu_offload=cpu_offload, - cpu_offload_params=cpu_offload_params, - zero_init=zero_init, - additional_buffer_factor=additional_buffer_factor - ) - - options_str = format_options(cpu_offload=cpu_offload, - cpu_offload_params=cpu_offload_params, - zero_init=zero_init) - print( - f" {cpu_mem / 2 ** 30:7.2f}GB | {gpu_mem / 2 ** 30:6.2f}GB | {options_str}") diff --git a/tests/test_utils/test_zero_gradient_clippling.py b/tests/test_utils/test_zero_gradient_clippling.py new file mode 100644 index 000000000..c20dcd89c --- /dev/null +++ b/tests/test_utils/test_zero_gradient_clippling.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import copy +import operator as op +from functools import partial, reduce +from typing import List + +import colossalai +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +from colossalai.logging import disable_existing_loggers +from colossalai.utils import checkpoint, clip_grad_norm_fp32, free_port +from colossalai.zero.sharded_model import ShardedModel +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.utils import clip_grad_norm_ + + +class Enumerator: + def __init__(self, arg_names: List[str], arg_values: List[tuple]) -> None: + self.arg_names = arg_names + self.enums = Enumerator.all_enumerate(arg_values) + + def __len__(self): + return len(self.enums) + + def __getitem__(self, idx): + return {name: self.enums[idx][i] for i, name in enumerate(self.arg_names)} + + @staticmethod + def all_enumerate(args: List[tuple]): + num_states = reduce(op.mul, map(lambda xs: len(xs), args)) + idxs = [0] * len(args) + states = [] + for _ in range(num_states): + states.append(tuple(args[j][idx] for j, idx in enumerate(idxs))) + if len(states) == num_states: + break + i = 0 + while idxs[i] + 1 == len(args[i]): + idxs[i] = 0 + i += 1 + idxs[i] += 1 + return states + + +def checkpoint_wrapper(module, enable=True): + if enable: + module.forward = partial(checkpoint, module.forward) + return module + + +class Net(nn.Module): + def __init__(self, checkpoint=False) -> None: + super().__init__() + self.fc1 = nn.Linear(5, 5) + self.fc2 = nn.Linear(5, 5) + self.fc3 = nn.Linear(5, 1) + if checkpoint: + self.fc1 = checkpoint_wrapper(self.fc1) + self.layers = [ + self.fc1, + self.fc2, + self.fc1, + self.fc2, + self.fc3 + ] + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +def run_step(model, optimizer, x, enable_autocast=False, norm_type=2.0): + model.train() + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=enable_autocast): + y = model(x) + loss = y.sum() + loss = loss.float() + loss.backward() + clip_grad(model, norm_type) + optimizer.step() + + +def clip_grad(model, norm_type): + if isinstance(model, DDP): + clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=norm_type) + else: + clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=norm_type) + + +def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool: + if loose: + return torch.allclose(tensor_a, 
tensor_b, atol=1e-3, rtol=1e-3) + return torch.allclose(tensor_a, tensor_b) + + +def check_grads(model, zero_model, loose=False): + rank = dist.get_rank() + for p, zero_p in zip(model.parameters(), zero_model.parameters()): + zero_grad = zero_p.grad.clone().to(p.device) + chunks = torch.flatten(p.grad).chunk(4) + if rank >= len(chunks): + continue + grad = chunks[rank] + if zero_p.zero_shard_padding > 0: + zero_grad = zero_grad[:-zero_p.zero_shard_padding] + assert grad.dtype == zero_grad.dtype + assert allclose(grad, zero_grad, loose=loose) + + +def check_params(model, zero_model, loose=False): + rank = dist.get_rank() + for p, zero_p in zip(model.parameters(), zero_model.parameters()): + zero_shard_padding = zero_p.zero_shard_padding + zero_p = zero_p.clone().to(p.device) + chunks = torch.flatten(p).chunk(4) + if rank >= len(chunks): + continue + p = chunks[rank] + if zero_shard_padding > 0: + zero_p = zero_p[:-zero_shard_padding] + assert p.dtype == zero_p.dtype + assert allclose(p, zero_p, loose=loose) + + +def check_config(checkpoint=False, fp16=False, offload=False, norm_type=2.0): + model = Net(checkpoint=checkpoint).cuda() + zero_model = copy.deepcopy(model) + ddp_model = DDP(model) + + offload_config = {} + if offload: + offload_config['device'] = 'cpu' + zero_model = zero_model.cpu() + zero_model = ShardedModel(zero_model, mixed_precision=fp16, offload_config=offload_config) + + optimizer = torch.optim.Adam(ddp_model.parameters(), lr=1e-3) + zero_optimizer = torch.optim.Adam(zero_model.parameters(), lr=1e-3) + for _ in range(5): + x = torch.rand(2, 5).cuda() + run_step(ddp_model, optimizer, x, enable_autocast=fp16, norm_type=norm_type) + run_step(zero_model, zero_optimizer, x, enable_autocast=fp16, norm_type=norm_type) + check_grads(ddp_model, zero_model) + check_params(ddp_model, zero_model) + for _ in range(5): + x = torch.rand(2, 5).cuda() + run_step(ddp_model, optimizer, x, enable_autocast=False, norm_type=norm_type) + run_step(zero_model, zero_optimizer, x, enable_autocast=False, norm_type=norm_type) + check_grads(ddp_model, zero_model, loose=True) + check_params(ddp_model, zero_model, loose=True) + + +def run_dist(rank, world_size, port): + disable_existing_loggers() + colossalai.launch(config={}, + rank=rank, + world_size=world_size, + host='localhost', + port=port, + backend='nccl') + + args = ['checkpoint', 'fp16', 'offload', 'norm_type'] + arg_values = [(False, True), (False, True), (False, True), (1.0, 2.0, float('inf'))] + arg_enumerator = Enumerator(args, arg_values) + + for kwargs in arg_enumerator: + if dist.get_rank() == 0: + print(kwargs) + check_config(**kwargs) + check_config() + + +@ pytest.mark.dist +def test_zero_clip_grad(): + world_size = 4 + run_func = partial(run_dist, world_size=world_size, port=free_port()) + mp.spawn(run_func, nprocs=world_size) + + +if __name__ == '__main__': + test_zero_clip_grad() diff --git a/tests/test_zero_data_parallel/common.py b/tests/test_zero_data_parallel/common.py new file mode 100644 index 000000000..353d759cb --- /dev/null +++ b/tests/test_zero_data_parallel/common.py @@ -0,0 +1,82 @@ +from functools import partial +from operator import imod +from colossalai.utils import checkpoint +import torch.nn as nn +import torch +from colossalai.logging import disable_existing_loggers, get_dist_logger + +LOGGER = get_dist_logger() + +CONFIG = dict( + fp16=dict( + mode=None, + ), + zero=dict( + level=3, + verbose=False, + offload_optimizer_config=dict( + device='cpu', + pin_memory=True, + buffer_count=5, + fast_init=False + ), + 
offload_param_config=dict( + device='cpu', + pin_memory=True, + buffer_count=5, + buffer_size=1e8, + max_in_cpu=1e9 + ) + ), + parallel=dict( + pipeline=dict(size=1), + tensor=dict(size=1, mode=None) + ) +) + +def checkpoint_wrapper(module, enable=True): + if enable: + module.forward = partial(checkpoint, module.forward) + return module + + +class Net(nn.Module): + def __init__(self, checkpoint=False) -> None: + super().__init__() + self.fc1 = nn.Linear(5, 5) + self.fc2 = nn.Linear(5, 5) + self.fc3 = nn.Linear(5, 1) + if checkpoint: + self.fc1 = checkpoint_wrapper(self.fc1) + self.layers = [ + self.fc1, + self.fc2, + self.fc1, + self.fc2, + self.fc3 + ] + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + +def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool: + if loose: + return torch.allclose(tensor_a, tensor_b, atol=1e-3, rtol=1e-3) + return torch.allclose(tensor_a, tensor_b) + + +def check_grads(model, zero_model, loose=False): + for p, zero_p in zip(model.parameters(), zero_model.parameters()): + zero_grad = zero_p.grad.clone().to(p.device) + assert p.grad.dtype == zero_grad.dtype + assert allclose(p.grad, zero_grad, loose=loose) + LOGGER.info(torch.sum(p.grad-zero_grad)) + +def check_params(model, zero_model, loose=False): + for p, zero_p in zip(model.parameters(), zero_model.parameters()): + zero_p = zero_p.clone().to(p.device) + assert p.dtype == zero_p.dtype + assert allclose(p, zero_p, loose=loose) + diff --git a/tests/test_zero_data_parallel/test_shard_model_v2.py b/tests/test_zero_data_parallel/test_shard_model_v2.py new file mode 100644 index 000000000..e25224dca --- /dev/null +++ b/tests/test_zero_data_parallel/test_shard_model_v2.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import copy +from functools import partial +from operator import mod +from pyexpat import model + +import colossalai +import pytest +import torch +import torch.multiprocessing as mp +from colossalai.logging import disable_existing_loggers +from colossalai.utils import free_port +from colossalai.zero.sharded_model import ShardedModelV2 +from colossalai.core import global_context as gpc +from colossalai.context.parallel_mode import ParallelMode +from tests.test_zero_data_parallel.common import Net, CONFIG, check_grads + + +def run_fwd_bwd(model, x, enable_autocast=False): + model.train() + with torch.cuda.amp.autocast(enabled=enable_autocast): + y = model(x) + loss = y.sum() + loss = loss.float() + loss.backward() + + +def run_dist(rank, world_size, port): + colossalai.launch(config=CONFIG, + rank=rank, + world_size=world_size, + host='localhost', + port=port, + backend='nccl') + + model = Net(checkpoint=True).cuda() + zero_model = copy.deepcopy(model) + zero_model = ShardedModelV2(zero_model, process_group=gpc.get_group(ParallelMode.DATA)) + + for _ in range(2): + x = torch.rand(2, 5).cuda() + run_fwd_bwd(zero_model, x, False) + run_fwd_bwd(model, x, False) + check_grads(model, zero_model) + + +@pytest.mark.dist +def test_shard_model_v2(): + world_size = 2 + run_func = partial(run_dist, world_size=world_size, port=free_port()) + mp.spawn(run_func, nprocs=world_size) + + +if __name__ == '__main__': + test_shard_model_v2() diff --git a/tests/test_zero_data_parallel/test_shard_param.py b/tests/test_zero_data_parallel/test_shard_param.py new file mode 100644 index 000000000..9973ee524 --- /dev/null +++ b/tests/test_zero_data_parallel/test_shard_param.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + 
+from asyncio.log import logger +from functools import partial + +import colossalai +import pytest +import torch +import torch.multiprocessing as mp +from colossalai.zero.shard_param import ShardParam +from colossalai.utils import free_port +from colossalai.logging import get_dist_logger, disable_existing_loggers +from tests.test_zero_data_parallel.common import Net, CONFIG + +def run_shard_param_check(rank, world_size, port): + colossalai.launch(config=CONFIG, + rank=rank, + world_size=world_size, + host='localhost', + port=port, + backend='nccl') + + logger = get_dist_logger() + model = Net() + + # add an attribute as ca_attr to hijack the access to param.data + for _, param in model.named_parameters(): + numel_ref = (param.numel() + world_size - 1) // world_size + param.ca_attr = ShardParam(param) + param.ca_attr.shard() + param_data = param.ca_attr.payload(torch.device('cpu')) + logger.info(f'shard {param_data.shape} {param_data}', ranks = [1]) + assert(numel_ref == param_data.numel()) + + for _, param in model.named_parameters(): + param.ca_attr.gather() + param_data = param.ca_attr.payload(torch.device('cpu')) + logger.info(f'gather {param_data.shape} {param_data}', ranks = [1]) + + disable_existing_loggers([logger]) + +@pytest.mark.dist +def test_run_shard_shape(): + world_size = 2 + run_func = partial(run_shard_param_check, world_size=world_size, port=free_port()) + mp.spawn(run_func, nprocs=world_size) + +if __name__ == '__main__': + test_run_shard_shape() \ No newline at end of file diff --git a/tests/test_zero_data_parallel/test_zero_dev_3.py b/tests/test_zero_data_parallel/test_zero_dev_3.py new file mode 100644 index 000000000..a6fd9df17 --- /dev/null +++ b/tests/test_zero_data_parallel/test_zero_dev_3.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import copy +from functools import partial + +import colossalai +import pytest +import torch +import torch.multiprocessing as mp +import torch.nn as nn +from colossalai.logging import disable_existing_loggers +from colossalai.utils import checkpoint, free_port +from colossalai.zero.sharded_model import ShardedModel +from common import Net, check_grads, check_params, check_params + +def checkpoint_wrapper(module, enable=True): + if enable: + module.forward = partial(checkpoint, module.forward) + return module + + +class Net(nn.Module): + def __init__(self, checkpoint=False) -> None: + super().__init__() + self.fc1 = nn.Linear(5, 5) + self.fc2 = nn.Linear(5, 5) + self.fc3 = nn.Linear(5, 1) + if checkpoint: + self.fc1 = checkpoint_wrapper(self.fc1) + self.layers = [ + self.fc1, + self.fc2, + self.fc1, + self.fc2, + self.fc3 + ] + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +def run_step(model, optimizer, x, enable_autocast=False): + model.train() + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=enable_autocast): + y = model(x) + loss = y.sum() + loss = loss.float() + loss.backward() + optimizer.step() + + +def decode_booleans(intval, bits): + res = [] + for bit in range(bits): + mask = 1 << bit + res.append((intval & mask) == mask) + return res + + +def check_config(checkpoint=False, fp16=False, offload=False): + model = Net(checkpoint=checkpoint).cuda() + zero_model = copy.deepcopy(model) + + offload_config = {} + if offload: + offload_config['device'] = 'cpu' + zero_model = zero_model.cpu() + zero_model = ShardedModel(zero_model, mixed_precision=fp16, offload_config=offload_config) + + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + 
zero_optimizer = torch.optim.Adam(zero_model.parameters(), lr=1e-3) + for _ in range(5): + x = torch.rand(2, 5).cuda() + run_step(model, optimizer, x, enable_autocast=fp16) + run_step(zero_model, zero_optimizer, x, enable_autocast=fp16) + check_grads(model, zero_model) + check_params(model, zero_model) + for _ in range(5): + x = torch.rand(2, 5).cuda() + run_step(model, optimizer, x, enable_autocast=False) + run_step(zero_model, zero_optimizer, x, enable_autocast=False) + check_grads(model, zero_model, loose=True) + check_params(model, zero_model, loose=True) + + +def run_dist(rank, world_size, port): + disable_existing_loggers() + colossalai.launch(config={}, + rank=rank, + world_size=world_size, + host='localhost', + port=port, + backend='nccl') + + args = ['checkpoint', 'fp16', 'offload'] + + def pack_args(i): + booleans = decode_booleans(i, len(args)) + return {arg: booleans[idx] for idx, arg in enumerate(args)} + + for j in range(2 ** len(args)): + kwargs = pack_args(j) + print(kwargs) + check_config(**kwargs) + + +@pytest.mark.dist +def test_zero_level_3(): + world_size = 1 + run_func = partial(run_dist, world_size=world_size, port=free_port()) + mp.spawn(run_func, nprocs=world_size) + + +if __name__ == '__main__': + test_zero_level_3() diff --git a/tests/test_zero_data_parallel/test_zero_dev_3_mp4.py b/tests/test_zero_data_parallel/test_zero_dev_3_mp4.py new file mode 100644 index 000000000..bfc805a89 --- /dev/null +++ b/tests/test_zero_data_parallel/test_zero_dev_3_mp4.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import copy +from functools import partial + +import colossalai +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +from colossalai.logging import disable_existing_loggers +from colossalai.utils import checkpoint, free_port +from colossalai.zero.sharded_model import ShardedModel +from torch.nn.parallel import DistributedDataParallel as DDP +from common import Net, allclose + +def run_step(model, optimizer, x, enable_autocast=False): + model.train() + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=enable_autocast): + y = model(x) + loss = y.sum() + loss = loss.float() + loss.backward() + optimizer.step() + +def check_grads_padding(model, zero_model, loose=False): + rank = dist.get_rank() + for p, zero_p in zip(model.parameters(), zero_model.parameters()): + zero_grad = zero_p.grad.clone().to(p.device) + chunks = torch.flatten(p.grad).chunk(4) + if rank >= len(chunks): + continue + grad = chunks[rank] + if zero_p.zero_shard_padding > 0: + zero_grad = zero_grad[:-zero_p.zero_shard_padding] + assert grad.dtype == zero_grad.dtype + assert allclose(grad, zero_grad, loose=loose) + + +def check_params_padding(model, zero_model, loose=False): + rank = dist.get_rank() + for p, zero_p in zip(model.parameters(), zero_model.parameters()): + zero_shard_padding = zero_p.zero_shard_padding + zero_p = zero_p.clone().to(p.device) + chunks = torch.flatten(p).chunk(4) + if rank >= len(chunks): + continue + p = chunks[rank] + if zero_shard_padding > 0: + zero_p = zero_p[:-zero_shard_padding] + assert p.dtype == zero_p.dtype + assert allclose(p, zero_p, loose=loose) + + +def decode_booleans(intval, bits): + res = [] + for bit in range(bits): + mask = 1 << bit + res.append((intval & mask) == mask) + return res + + +def check_config(checkpoint=False, fp16=False, offload=False): + model = Net(checkpoint=checkpoint).cuda() + zero_model = copy.deepcopy(model) + ddp_model = DDP(model) + + 
offload_config = {} + if offload: + offload_config['device'] = 'cpu' + zero_model = zero_model.cpu() + zero_model = ShardedModel(zero_model, mixed_precision=fp16, offload_config=offload_config) + + optimizer = torch.optim.Adam(ddp_model.parameters(), lr=1e-3) + zero_optimizer = torch.optim.Adam(zero_model.parameters(), lr=1e-3) + for _ in range(5): + x = torch.rand(2, 5).cuda() + run_step(ddp_model, optimizer, x, enable_autocast=fp16) + run_step(zero_model, zero_optimizer, x, enable_autocast=fp16) + check_grads_padding(ddp_model, zero_model) + check_params_padding(ddp_model, zero_model) + for _ in range(5): + x = torch.rand(2, 5).cuda() + run_step(ddp_model, optimizer, x, enable_autocast=False) + run_step(zero_model, zero_optimizer, x, enable_autocast=False) + check_grads_padding(ddp_model, zero_model, loose=True) + check_params_padding(ddp_model, zero_model, loose=True) + + +def run_dist(rank, world_size, port): + disable_existing_loggers() + colossalai.launch(config={}, + rank=rank, + world_size=world_size, + host='localhost', + port=port, + backend='nccl') + + args = ['checkpoint', 'fp16', 'offload'] + + def pack_args(i): + booleans = decode_booleans(i, len(args)) + return {arg: booleans[idx] for idx, arg in enumerate(args)} + + for j in range(2 ** len(args)): + kwargs = pack_args(j) + if dist.get_rank() == 0: + print(kwargs) + check_config(**kwargs) + + +@pytest.mark.dist +def test_zero_level_3(): + world_size = 4 + run_func = partial(run_dist, world_size=world_size, port=free_port()) + mp.spawn(run_func, nprocs=world_size) + + +if __name__ == '__main__': + test_zero_level_3() diff --git a/tests/test_zero_data_parallel/test_zero_level_2.py b/tests/test_zero_data_parallel/test_zero_level_2.py deleted file mode 100644 index 9bdd1b124..000000000 --- a/tests/test_zero_data_parallel/test_zero_level_2.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os -from functools import partial -from pathlib import Path - -import colossalai -import pytest -import torch -import torch.multiprocessing as mp -from colossalai.core import global_context as gpc -from colossalai.utils import free_port, get_dataloader -from torchvision import transforms -from torchvision.datasets import CIFAR10 -from torchvision.models import resnet18 - -BATCH_SIZE = 16 -IMG_SIZE = 224 - -CONFIG = dict( - fp16=dict( - mode=None, - ), - zero=dict( - level=2, - cpu_offload=True, - verbose=False, - ), - parallel=dict( - pipeline=dict(size=1), - tensor=dict(size=1, mode=None) - ) -) - - -def run_dist(rank, world_size, port): - colossalai.launch(config=CONFIG, - rank=rank, - world_size=world_size, - host='localhost', - port=port, - backend='nccl') - - # build model - model = resnet18(num_classes=10) - - # build dataloader# build dataloaders - train_dataset = CIFAR10( - root=Path(os.environ['DATA']), - download=True, - transform=transforms.Compose( - [ - transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), - transforms.ToTensor(), - transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) - ] - ) - ) - train_dataloader = get_dataloader(dataset=train_dataset, - shuffle=True, - batch_size=BATCH_SIZE, - pin_memory=True, - drop_last=True) - - # build optimizer and loss - # optimizer = build_optimizer(global_context.config.optimizer, model) - optimizer = torch.optim.Adam(model.parameters(), lr=0.001) - criterion = torch.nn.CrossEntropyLoss() - - engine, train_dataloader, *args = colossalai.initialize(model=model, - optimizer=optimizer, - criterion=criterion, - train_dataloader=train_dataloader) 
- - # train - model.train() - for idx, (data, label) in enumerate(train_dataloader): - engine.zero_grad() - data = data.cuda() - label = label.cuda() - - output = engine(data) - loss = engine.criterion(output, label) - - engine.backward(loss) - engine.step() - break - - gpc.destroy() - torch.cuda.empty_cache() - - -@pytest.mark.dist -def test_zero_level_2(): - world_size = 4 - run_func = partial(run_dist, world_size=world_size, port=free_port()) - mp.spawn(run_func, nprocs=world_size) - - -if __name__ == '__main__': - test_zero_level_2() diff --git a/tests/test_zero_data_parallel/test_zero_level_3.py b/tests/test_zero_data_parallel/test_zero_level_3.py deleted file mode 100644 index 2655210db..000000000 --- a/tests/test_zero_data_parallel/test_zero_level_3.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os -from functools import partial -from pathlib import Path - -import colossalai -import pytest -import torch -import torch.multiprocessing as mp -from colossalai.core import global_context as gpc -from colossalai.utils import free_port, get_dataloader -from torchvision import transforms -from torchvision.datasets import CIFAR10 -from torchvision.models import resnet18 - -BATCH_SIZE = 16 -IMG_SIZE = 224 - -CONFIG = dict( - fp16=dict( - mode=None, - ), - zero=dict( - level=3, - verbose=False, - offload_optimizer_config=dict( - device='cpu', - pin_memory=True, - buffer_count=5, - fast_init=False - ), - offload_param_config=dict( - device='cpu', - pin_memory=True, - buffer_count=5, - buffer_size=1e8, - max_in_cpu=1e9 - ) - ), - parallel=dict( - pipeline=dict(size=1), - tensor=dict(size=1, mode=None) - ) -) - - -def run_dist(rank, world_size, port): - colossalai.launch(config=CONFIG, - rank=rank, - world_size=world_size, - host='localhost', - port=port, - backend='nccl') - - # build model - model = resnet18(num_classes=10) - - # build dataloader# build dataloaders - train_dataset = CIFAR10( - root=Path(os.environ['DATA']), - download=True, - transform=transforms.Compose( - [ - transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), - transforms.ToTensor(), - transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) - ] - ) - ) - train_dataloader = get_dataloader(dataset=train_dataset, - shuffle=True, - batch_size=BATCH_SIZE, - pin_memory=True, - drop_last=True) - - # build optimizer and loss - # optimizer = build_optimizer(global_context.config.optimizer, model) - optimizer = torch.optim.Adam(model.parameters(), lr=0.001) - criterion = torch.nn.CrossEntropyLoss() - - engine, train_dataloader, *args = colossalai.initialize(model=model, - optimizer=optimizer, - criterion=criterion, - train_dataloader=train_dataloader) - - # train - model.train() - for idx, (data, label) in enumerate(train_dataloader): - engine.zero_grad() - data = data.cuda() - label = label.cuda() - - output = engine(data) - loss = engine.criterion(output, label) - - engine.backward(loss) - engine.step() - break - - gpc.destroy() - torch.cuda.empty_cache() - - -@pytest.mark.dist -def test_zero_level_3(): - world_size = 4 - run_func = partial(run_dist, world_size=world_size, port=free_port()) - mp.spawn(run_func, nprocs=world_size) - - -if __name__ == '__main__': - test_zero_level_3() diff --git a/tests/test_zero_data_parallel/test_zero_param_mgr.py b/tests/test_zero_data_parallel/test_zero_param_mgr.py new file mode 100644 index 000000000..a38ed9286 --- /dev/null +++ b/tests/test_zero_data_parallel/test_zero_param_mgr.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- 
+ +import os +from functools import partial +from pathlib import Path + +import colossalai +import pytest +import torch +import torch.multiprocessing as mp +from colossalai.zero.sharded_model.param_manager import Zero3ParameterManager +from colossalai.core import global_context as gpc +from colossalai.context.parallel_mode import ParallelMode +from colossalai.utils import free_port +from common import CONFIG + +def run_shard_shape_check(rank, world_size, port): + colossalai.launch(config=CONFIG, + rank=rank, + world_size=world_size, + host='localhost', + port=port, + backend='nccl') + + model = torch.nn.Linear(2, 4 * world_size) + gpc.init_parallel_groups() + Zero3ParameterManager(module=model, process_group=gpc.get_group(ParallelMode.DATA), offload_config=CONFIG.get('offload_param_config')) + + assert(model.weight.numel() == 4 * 2) + assert(model.bias.numel() == 4) + + +@pytest.mark.dist +def test_run_shard_shape(): + world_size = 2 + run_func = partial(run_shard_shape_check, world_size=world_size, port=free_port()) + mp.spawn(run_func, nprocs=world_size) + +if __name__ == '__main__': + test_run_shard_shape() diff --git a/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py b/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py index 9d215f5ae..f87ea7c68 100644 --- a/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py +++ b/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py @@ -88,6 +88,7 @@ def run_2d_parallel_vision_transformer_level_2(rank, world_size, port): @pytest.mark.dist +@pytest.mark.skip(reason="This test should be refactored for the reconstructed zero") def test_2d_vit_zero_level_2(): world_size = 8 run_func = partial(run_2d_parallel_vision_transformer_level_2, world_size=world_size, port=free_port()) diff --git a/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py b/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py index 149fefb72..2f6416a17 100644 --- a/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py +++ b/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py @@ -88,7 +88,7 @@ def run_2d_parallel_vision_transformer_level_3(rank, world_size, port): @pytest.mark.dist -@pytest.mark.skip("Level 3 has unknown bug so skip this test for now") +@pytest.mark.skip(reason="This test should be refactored for the reconstructed zero") def test_3d_vit_zero_level_3(): world_size = 8 run_func = partial(run_2d_parallel_vision_transformer_level_3, world_size=world_size, port=free_port())
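
For readers following the new sharded-parameter tests above, the sketch below restates the lifecycle they exercise: each parameter gets a ShardParam attached as param.ca_attr, shard() keeps only the local slice (padded so the split is even), and gather() rebuilds the full tensor before it is used. This is only an illustrative sketch based on the calls that appear in tests/test_zero_data_parallel/test_shard_param.py; it assumes a process group has already been initialized with colossalai.launch exactly as in that test, and the helper names (attach_and_shard, gather_all) are illustrative, not part of the patch.

import torch
import torch.nn as nn
from colossalai.zero.shard_param import ShardParam


def attach_and_shard(module: nn.Module, world_size: int) -> None:
    # Attach a ShardParam to every parameter (the tests use the attribute
    # name `ca_attr`) and keep only the local shard on this rank.
    for param in module.parameters():
        param.ca_attr = ShardParam(param)
        param.ca_attr.shard()
        local = param.ca_attr.payload(torch.device('cpu'))
        # Each rank keeps a padded shard of ceil(numel / world_size) elements,
        # matching the assertion in test_shard_param.py.
        assert local.numel() == (param.numel() + world_size - 1) // world_size


def gather_all(module: nn.Module) -> None:
    # Rebuild the full parameters before they are needed (e.g. ahead of a
    # forward pass); shard() can be called again afterwards to free memory.
    for param in module.parameters():
        param.ca_attr.gather()

A similar padded-shard convention appears in the gradient-clipping test above, which trims zero_shard_padding from each shard before comparing gradients and parameters against the DDP baseline.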