Refactored docstring to google style

This commit is contained in:
Liang Bowen 2022-03-25 13:02:39 +08:00 committed by アマデウス
parent 53b1b6e340
commit ec5086c49c
94 changed files with 3389 additions and 2982 deletions

View File

@@ -12,21 +12,27 @@ from .naive_amp import convert_to_naive_amp

def convert_to_amp(model: nn.Module, optimizer: Optimizer, criterion: _Loss, mode: AMP_TYPE, amp_config: Config = None):
    """A helper function to wrap training components with Torch AMP modules.

    Args:
        model (:class:`torch.nn.Module`): your model object.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
        criterion (:class:`torch.nn.modules.loss._Loss`): your loss function object.
        mode (:class:`colossalai.amp.AMP_TYPE`): amp mode.
        amp_config (:class:`colossalai.context.Config` or dict): configuration for different amp modes.

    Returns:
        A tuple (model, optimizer, criterion).

    Note:
        ``amp_config`` may vary with the mode you choose. You should check the corresponding amp mode
        for more details about ``amp_config``.
        For ``apex_amp``, please check
        `apex_amp config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
        For ``naive_amp``, please check
        `naive_amp config <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/amp/naive_amp/_fp16_optimizer.py#L42>`_.
        For ``torch_amp``, please check
        `torch_amp config <https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py#L97>`_.
    """
    assert isinstance(mode, AMP_TYPE), \
        f'expected the argument mode be AMP_TYPE, but got {type(mode)}'
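A minimal usage sketch of the wrapper documented above, assuming ``convert_to_amp`` and ``AMP_TYPE`` are importable from ``colossalai.amp``; the toy model, optimizer, loss and config values are illustrative and not part of the commit.

    import torch
    import torch.nn as nn
    from colossalai.amp import AMP_TYPE, convert_to_amp

    # any nn.Module / Optimizer / _Loss combination would be wrapped the same way
    model = nn.Linear(16, 4)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    # for torch_amp mode the amp_config keys follow torch.cuda.amp.GradScaler
    model, optimizer, criterion = convert_to_amp(model, optimizer, criterion,
                                                 mode=AMP_TYPE.TORCH,
                                                 amp_config=dict(init_scale=2.**16,
                                                                 growth_interval=2000))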

View File

@@ -4,17 +4,33 @@ from torch.optim import Optimizer

def convert_to_apex_amp(model: nn.Module, optimizer: Optimizer, amp_config):
    r"""A helper function to wrap training components with Apex AMP modules.

    Args:
        model (:class:`torch.nn.Module`): your model object.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
        amp_config (:class:`colossalai.context.Config` or dict): configuration for initializing apex_amp.

    The ``amp_config`` should include parameters below:
    ::

        enabled (bool, optional, default=True)
        opt_level (str, optional, default="O1")
        cast_model_type (``torch.dtype``, optional, default=None)
        patch_torch_functions (bool, optional, default=None)
        keep_batchnorm_fp32 (bool or str, optional, default=None)
        master_weights (bool, optional, default=None)
        loss_scale (float or str, optional, default=None)
        cast_model_outputs (torch.dtype, optional, default=None)
        num_losses (int, optional, default=1)
        verbosity (int, default=1)
        min_loss_scale (float, default=None)
        max_loss_scale (float, default=2.**24)

    Returns:
        A tuple (model, optimizer).

    For more details about ``amp_config``, please refer to
    `amp_config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
    """
    import apex.amp as apex_amp
    model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)

View File

@@ -21,8 +21,8 @@ class ApexAMPOptimizer(ColossalaiOptimizer):

    def backward(self, loss: Tensor):
        """Backward pass to get all gradients.

        Args:
            loss (torch.Tensor): Loss computed by a loss function.
        """
        with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()

@@ -30,10 +30,9 @@ class ApexAMPOptimizer(ColossalaiOptimizer):

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """Clip gradients' norm.

        Args:
            model (torch.nn.Module): Your model object.
            max_norm (float): The max norm value for gradient clipping.
        """
        if max_norm > 0:
            clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)
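A hedged sketch of how the two methods above fit into one training step; ``apex_model``, ``apex_optimizer``, ``criterion`` and the batch are assumed to come from an earlier ``convert_to_apex_amp`` call and a user data loader, and are not part of this commit.

    # after: apex_model, apex_optimizer = convert_to_apex_amp(model, optimizer, amp_config)
    outputs = apex_model(inputs)                               # forward in mixed precision
    loss = criterion(outputs, labels)

    apex_optimizer.zero_grad()
    apex_optimizer.backward(loss)                              # scales the loss via apex_amp.scale_loss
    apex_optimizer.clip_grad_norm(apex_model, max_norm=1.0)    # skipped when max_norm == 0
    apex_optimizer.step()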

View File

@@ -4,20 +4,30 @@ from torch.optim import Optimizer

from colossalai.utils import is_no_pp_or_last_stage
from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
from .grad_scaler import DynamicGradScaler, ConstantGradScaler
from ._fp16_optimizer import FP16Optimizer


def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
    """A helper function to wrap training components with naive AMP modules. In this mode,
    we forcibly cast the model weights and inputs to FP16, and cast the model outputs to FP32 to calculate loss,
    which is equivalent to Apex O3.

    Args:
        model (:class:`torch.nn.Module`): your model object.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
        amp_config (:class:`colossalai.context.Config` or dict): configuration for naive mode amp.

    The ``amp_config`` should contain parameters below:
    ::

        verbose (bool, optional): if set to `True`, will print debug info (Default: False).
        clip_grad_norm (float, optional): clip gradients with this global L2 norm (Default: 0).
            Note that clipping is ignored if clip_grad_norm == 0.
        dynamic_grad_scale (bool): whether to use dynamic grad scaler.

    Returns:
        A tuple (model, optimizer).
    """
    if isinstance(model, nn.ModuleList):
        # interleaved pipeline

@@ -46,4 +56,4 @@ def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):

    return model, optimizer


__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer', 'FP16Optimizer']
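The naive-mode ``amp_config`` documented above can be passed as a plain dict. A hedged sketch with illustrative values, assuming ``convert_to_naive_amp`` is importable from ``colossalai.amp.naive_amp`` and that ``model`` and ``optimizer`` already exist:

    from colossalai.amp.naive_amp import convert_to_naive_amp

    naive_cfg = dict(
        verbose=False,            # print debug info
        clip_grad_norm=1.0,       # 0 would disable clipping
        dynamic_grad_scale=True,  # DynamicGradScaler instead of ConstantGradScaler
    )
    model, optimizer = convert_to_naive_amp(model, optimizer, naive_cfg)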

View File

@@ -42,24 +42,13 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):

class FP16Optimizer(Optimizer):
    """Float16 optimizer for fp16 and bf16 data types.

    Args:
        optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD.
        grad_scaler (BaseGradScaler): grad scaler for gradients, chosen from
            ``constant_grad_scaler`` or ``dynamic_grad_scaler``.
        clip_grad_norm (float, optional): clip gradients with this global L2 norm. Default 0.
            Note that clipping is ignored if clip_grad_norm == 0.
        verbose (bool, optional): if set to `True`, will print debug info. Default False.
    """

    def __init__(self,

View File

@@ -18,11 +18,15 @@ from ._fp16_optimizer import FP16Optimizer

class NaiveAMPOptimizer(ColossalaiOptimizer):
    """A wrapper class for optimizer to cast all parameters to fp16.

    Args:
        optim (torch.optim.Optimizer): A normal optimizer like Adam or SGD.
        grad_scaler (BaseGradScaler): grad scaler for gradients, chosen from
            ``constant_grad_scaler`` or ``dynamic_grad_scaler``.
        clip_grad_norm (float, optional): clip gradients with this global L2 norm. Default 0.
        verbose (bool, optional): if set to `True`, will print debug info. Default False.

    Note:
        clipping is ignored if ``clip_grad_norm`` equals 0.
    """

    def __init__(self, optim: Optimizer, *args, **kwargs):

@@ -40,8 +44,19 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):

class NaiveAMPModel(nn.Module):
    r"""A wrapper class for model to cast the model into fp16 and
    automatically cast the input and output.

    Args:
        model (torch.nn.Module): torch.nn.Module to be wrapped.
        output_to_fp32 (bool, optional): Whether to cast the output of this module into fp32. (Default: True)
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this module.
            (Default: ``ParallelMode.DATA``)
        sync_buffer (bool, optional): whether to synchronize buffers. (Default: True)

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
    """

    def __init__(self,
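A brief sketch of wrapping a module directly with ``NaiveAMPModel`` as documented above; the import path and argument values are assumptions, and the global parallel context is assumed to have been initialized (e.g. via ``colossalai.launch``) before the data-parallel group is referenced.

    import torch.nn as nn
    from colossalai.amp.naive_amp import NaiveAMPModel
    from colossalai.context import ParallelMode

    fp16_model = NaiveAMPModel(nn.Linear(16, 4),
                               output_to_fp32=True,              # cast outputs back to fp32
                               parallel_mode=ParallelMode.DATA,  # group used for buffer sync
                               sync_buffer=True)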

View File

@@ -10,18 +10,25 @@ def convert_to_torch_amp(model: nn.Module,
                         optimizer: Optimizer,
                         criterion: Optional[_Loss] = None,
                         amp_config: Optional[Config] = None):
    """A helper function to wrap training components with PyTorch AMP modules.

    Args:
        model (:class:`torch.nn.Module`): your model object.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
        criterion (:class:`torch.nn.modules.loss._Loss`, optional): your loss function object.
        amp_config (:class:`colossalai.context.Config` or dict, optional): configuration for PyTorch AMP.

    The ``amp_config`` should include parameters below:
    ::

        init_scale (float, optional, default=2.**16)
        growth_factor (float, optional, default=2.0)
        backoff_factor (float, optional, default=0.5)
        growth_interval (int, optional, default=2000)
        enabled (bool, optional, default=True)

    Returns:
        A tuple (model, optimizer, criterion).
    """
    model = TorchAMPModel(model)
    if amp_config is None:

View File

@@ -14,13 +14,19 @@ from colossalai.utils import clip_grad_norm_fp32

class TorchAMPOptimizer(ColossalaiOptimizer):
    """A wrapper class that integrates PyTorch AMP with an optimizer.

    Args:
        optim (torch.optim.Optimizer): A normal optimizer like Adam or SGD.
        init_scale (float, optional, default=2.**16): Initial scale factor.
        growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied during
            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
        backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied during
            :meth:`update` if inf/NaN gradients occur in an iteration.
        growth_interval (int, optional, default=2000): Number of consecutive iterations without inf/NaN gradients
            that must occur for the scale to be multiplied by ``growth_factor``.
        enabled (bool, optional, default=True): If ``False``, disables gradient scaling. :meth:`step` simply
            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
    """

    def __init__(self, optim: Optimizer, *args, **kwargs):
@@ -30,8 +36,8 @@ class TorchAMPOptimizer(ColossalaiOptimizer):

    def backward(self, loss: Tensor):
        """Backward with torch amp gradient scaler.

        Args:
            loss (torch.Tensor): Loss computed by a loss function.
        """
        self.scaler.scale(loss).backward()

@@ -44,10 +50,9 @@ class TorchAMPOptimizer(ColossalaiOptimizer):

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """Apply gradient clipping to the model parameters.

        Args:
            model (torch.nn.Module): Your model object.
            max_norm (float): Max norm value for gradient clipping.
        """
        if max_norm > 0.0:
            self.scaler.unscale_(self.optim)

@@ -71,8 +76,8 @@ class TorchAMPModel(nn.Module):

class TorchAMPLoss(nn.Module):
    """A wrapper class for a criterion object which computes the loss in mixed-precision context.

    Args:
        loss (torch.nn.modules.loss._Loss): A loss function object.
    """

    def __init__(self, loss: _Loss):
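A minimal training-step sketch tying the wrappers above together. It assumes ``TorchAMPModel``, ``TorchAMPOptimizer`` and ``TorchAMPLoss`` are re-exported from ``colossalai.amp.torch_amp`` and that ``inputs``/``labels`` exist; it mirrors, rather than replaces, what ``convert_to_torch_amp`` sets up.

    import torch
    import torch.nn as nn
    from colossalai.amp.torch_amp import TorchAMPModel, TorchAMPOptimizer, TorchAMPLoss

    model = TorchAMPModel(nn.Linear(16, 4))            # forward runs under autocast
    optim = TorchAMPOptimizer(torch.optim.SGD(model.parameters(), lr=1e-3),
                              init_scale=2.**16, growth_interval=2000)
    criterion = TorchAMPLoss(nn.CrossEntropyLoss())

    def train_step(inputs, labels):
        optim.zero_grad()
        loss = criterion(model(inputs), labels)
        optim.backward(loss)                           # scaler.scale(loss).backward()
        optim.clip_grad_norm(model, max_norm=1.0)      # unscale first, then clip
        optim.step()                                   # scaler.step + scaler.update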

View File

@@ -10,34 +10,40 @@ from colossalai.registry import *

def build_from_config(module, config: dict):
    """Returns an object of :class:`module` constructed from `config`.

    Args:
        module: A python or user-defined class.
        config: A python dict containing information used in the construction of the return object.

    Returns:
        An ``object`` of interest.

    Raises:
        AssertionError: Raises an AssertionError if `module` is not a class.
    """
    assert inspect.isclass(module), 'module must be a class'
    return module(**config)
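A small illustration of ``build_from_config``; the target class and config dict are made-up examples, and the import assumes the builder is re-exported from ``colossalai.builder``.

    import torch.nn as nn
    from colossalai.builder import build_from_config

    # instantiate nn.Linear from a plain dict of constructor arguments
    layer = build_from_config(nn.Linear, {'in_features': 16, 'out_features': 4})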
def build_from_registry(config, registry: Registry):
    r"""Returns an object constructed from `config`, the type of the object
    is specified by `registry`.

    Note:
        The `config` is used to construct the return object from one of the supported registries
        such as `LAYERS` and `OPTIMIZERS`, and it should contain all required parameters of the
        corresponding object. Details of the supported types in `registry` and the `mod_type` field
        in `config` can be found in
        `registry <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/registry/__init__.py>`_.

    Args:
        config (dict or :class:`colossalai.context.Config`): information
            used in the construction of the return object.
        registry (:class:`Registry`): A registry specifying the type of the return object.

    Returns:
        A Python object specified by `registry`.

    Raises:
        Exception: Raises an Exception if an error occurred when building from registry.
    """
    config_ = config.copy()    # keep the original config untouched
    assert isinstance(
@@ -60,11 +66,13 @@ def build_from_registry(config, registry: Registry):

def build_layer(config):
    """Returns a layer object of :class:`nn.Module` constructed from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``LAYERS``.

    Returns:
        An object of :class:`torch.nn.Module`.
    """
    return build_from_registry(config, LAYERS)
@@ -73,11 +81,13 @@ def build_loss(config):

    """Returns a loss function object of :class:`torch.autograd.Function` constructed
    from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``LOSSES``.

    Returns:
        An object of :class:`torch.nn.modules.loss._Loss`.
    """
    return build_from_registry(config, LOSSES)
@@ -85,11 +95,13 @@ def build_loss(config):

def build_model(config):
    """Returns a model object of :class:`nn.Module` constructed from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``MODELS``.

    Returns:
        An object of :class:`torch.nn.Module`.
    """
    return build_from_registry(config, MODELS)
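A hedged sketch of the registry-driven builders above. The model name, config keys and the ``type`` selector field are placeholders; the exact field name and the available registered models are defined in the registry module linked earlier, not here.

    from colossalai.builder import build_model

    # 'type' is assumed to select the class registered in MODELS; remaining keys are constructor kwargs
    model_cfg = dict(type='VanillaResNet', num_cls=10, depth=18)
    model = build_model(model_cfg)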
@@ -98,11 +110,13 @@ def build_dataset(config):

    """Returns a dataset object of :class:`torch.utils.data.Dataset` constructed
    from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``DATASETS``.

    Returns:
        An object of :class:`torch.utils.data.Dataset`.
    """
    return build_from_registry(config, DATASETS)
@@ -111,13 +125,14 @@ def build_optimizer(config, model):

    """Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`,
    'model' and 'params'.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``OPTIMIZERS``.
        model (:class:`nn.Module`): A model containing parameters for the optimizer.

    Returns:
        An object of :class:`torch.optim.Optimizer`.
    """
    config_ = config.copy()
    config_['params'] = model.parameters()
@@ -128,15 +143,15 @@ def build_gradient_handler(config, model, optimizer):

    """Returns a gradient handler object of :class:`BaseGradientHandler` constructed from `config`,
    `model` and `optimizer`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``GRADIENT_HANDLER``.
        model (:class:`nn.Module`): A model containing parameters for the gradient handler.
        optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing parameters for the gradient handler.

    Returns:
        An object of :class:`colossalai.engine.BaseGradientHandler`.
    """
    config_ = config.copy()
    config_['model'] = model
@@ -147,13 +162,13 @@ def build_gradient_handler(config, model, optimizer):

def build_hooks(config, trainer):
    """Returns a hook object of :class:`BaseHook` constructed from `config` and `trainer`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``HOOKS``.
        trainer (:class:`Trainer`): A :class:`Trainer` object containing parameters for the hook.

    Returns:
        An object of :class:`colossalai.trainer.hooks.BaseHook`.
    """
    config_ = config.copy()
    config_['trainer'] = trainer
@@ -163,11 +178,13 @@ def build_hooks(config, trainer):

def build_ophooks(config):
    """Returns a hook object of :class:`BaseOpHook` constructed from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``OPHOOKS``.

    Returns:
        An object of :class:`colossalai.trainer.hooks.BaseOpHook`.
    """
    config_ = config.copy()
    return build_from_registry(config_, OPHOOKS)
@@ -177,11 +194,13 @@ def build_transform(config):

    """Returns a transformation object of :class:`torchvision.transforms` constructed
    from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``TRANSFORMS``.

    Returns:
        An object of :class:`torchvision.transforms`.
    """
    return build_from_registry(config, TRANSFORMS)
@@ -190,14 +209,15 @@ def build_data_sampler(config, dataset):

    """Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler`
    constructed from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``DATA_SAMPLERS``.
        dataset (:class:`torch.utils.data.Dataset`): An object of
            :class:`torch.utils.data.Dataset` containing information
            used in the construction of the return object.

    Returns:
        An object of :class:`colossalai.utils.data_sampler.BaseSampler`.
    """
    config_ = config.copy()
    config_['dataset'] = dataset
@@ -208,14 +228,15 @@ def build_lr_scheduler(config, optimizer):

    """Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
    constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``lr_schedule``.
        optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing
            parameters for the learning rate scheduler.

    Returns:
        An object of :class:`torch.optim.lr_scheduler`.
    """
    config_ = config.copy()
    config_['optimizer'] = optimizer
@@ -225,10 +246,12 @@ def build_lr_scheduler(config, optimizer):

def build_schedule(config):
    """Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``Schedule``.

    Returns:
        An object of :class:`colossalai.engine.schedule.BaseSchedule`.
    """
    return build_from_registry(config, SCHEDULE)

View File

@@ -13,14 +13,13 @@ def _binary_partition(weights, st, ed):

    """Returns the binary partition position of `weights`, given the start
    position `st` and the end position `ed`.

    Args:
        weights (list): A python list to be binary partitioned.
        st (int): the start position of the binary partition.
        ed (int): the end position of the binary partition.

    Returns:
        int: the binary partition position of `weights`.
    """
    w_sum = weights[ed - 1]
    prefix = 0
@@ -176,16 +175,13 @@ def build_pipeline_model_from_cfg(config, num_chunks: int = 1, partition_method:

        ...
    )

    Args:
        config (dict): Configuration of the model.
        num_chunks (int, optional): The number of chunks you want to have on the current stage.
            This value should be 1 in most cases unless you are using virtual pipeline parallelism.
        partition_method (str, optional): This parameter determines how you want to split your model
            layers into stages, you can set it as 'layer' or 'parameter'.
        verbose (bool, optional): Whether to print the logs.
    """
    ori_model = build_model(config)
    layers = ori_model.layers_cfg
@@ -240,13 +236,11 @@ def build_pipeline_model(layers: nn.Sequential, num_chunks: int = 1, verbose: bo

    """An initializer to split the model into different stages for pipeline parallelism.
    Note that `layers` must be a `torch.nn.Sequential`.

    Args:
        layers (`torch.nn.Sequential`): Layers of the model.
        num_chunks (int, optional): The number of chunks you want to have on the current stage.
            This value should be 1 in most cases unless you are using virtual pipeline parallelism.
        verbose (bool, optional): Whether to print the logs.
    """
    pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
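As a sketch of the builder above, assuming the distributed context (and thus ``ParallelMode.PIPELINE``) has already been initialized via ``colossalai.launch`` and that the function is importable from ``colossalai.builder``:

    import torch.nn as nn
    from colossalai.builder import build_pipeline_model

    # a toy torch.nn.Sequential; each pipeline stage receives a contiguous slice of these layers
    layers = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 64), nn.ReLU())
    stage_model = build_pipeline_model(layers, num_chunks=1, verbose=True)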

View File

@@ -12,21 +12,22 @@ from colossalai.utils import get_current_device

def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op: bool = False) -> Tensor:
    r"""Gathers all tensors from the parallel group and concatenates them in a
    specific dimension.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be gathered.
        dim (int): The dimension to concatenate along.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-gather only,
        if async_op is set to False. A tuple of the all-gather output and an async work handle, if async_op
        is set to True.
    """
    depth = gpc.get_world_size(parallel_mode)
    if depth == 1:
@@ -54,23 +55,26 @@ def reduce_scatter(tensor: Tensor,
                   parallel_mode: ParallelMode,
                   op: ReduceOp = ReduceOp.SUM,
                   async_op: bool = False) -> Tensor:
    r"""Reduces all tensors then scatters them in a specific dimension to all
    members in the parallel group.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be reduce-scattered.
        dim (int): The dimension to scatter along.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
            For more details about ``ReduceOp``, please refer to
            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce-scatter only,
        if async_op is set to False. A tuple of the reduce-scatter output and an async work handle, if async_op
        is set to True.
    """
    depth = gpc.get_world_size(parallel_mode)
    if depth == 1:
@@ -94,6 +98,25 @@ def all_reduce(tensor: Tensor,
               parallel_mode: ParallelMode,
               op: ReduceOp = ReduceOp.SUM,
               async_op: bool = False) -> Tensor:
    r"""Reduces the tensor data across the whole parallel group in such a way that all get the final result.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be all-reduced.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
            For more details about ``ReduceOp``, please refer to
            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-reduce only,
        if async_op is set to False. A tuple of the all-reduce output and an async work handle, if async_op
        is set to True.
    """
    depth = gpc.get_world_size(parallel_mode)
    if depth == 1:
        out = tensor
@@ -108,6 +131,23 @@ def all_reduce(tensor: Tensor,

def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: bool = False):
    r"""Broadcasts a tensor to the whole parallel group. The tensor must have the same
    number of elements in all processes participating in the collective.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be broadcast.
        src (int): Source rank.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The broadcast tensor only,
        if async_op is set to False. A tuple of the broadcast tensor and an async work handle, if async_op
        is set to True.
    """
    depth = gpc.get_world_size(parallel_mode)
    if depth == 1:
        out = tensor
@@ -122,6 +162,23 @@ def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: b

def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp = ReduceOp.SUM, async_op: bool = False):
    r"""Reduces tensors across the whole parallel group. Only the process with
    rank ``dst`` receives the final result.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be reduced.
        dst (int): Destination rank.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        op (torch.distributed.ReduceOp, optional): The type of reduce operation.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce only,
        if async_op is set to False. A tuple of the reduce output and an async work handle, if async_op
        is set to True.
    """
    depth = gpc.get_world_size(parallel_mode)
    if depth == 1:
        out = tensor
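A compact usage sketch for the collectives documented above, assuming the parallel context has been initialized (e.g. via ``colossalai.launch``) so that the data-parallel group exists, and that the functions are re-exported from ``colossalai.communication``:

    import torch
    from colossalai.communication import all_gather, all_reduce, broadcast, reduce
    from colossalai.context import ParallelMode

    x = torch.ones(4).cuda()

    gathered = all_gather(x, dim=0, parallel_mode=ParallelMode.DATA)   # concatenation along dim 0
    summed = all_reduce(x, parallel_mode=ParallelMode.DATA)            # every rank gets the sum
    same = broadcast(x, src=0, parallel_mode=ParallelMode.DATA)        # rank 0's tensor everywhere
    partial = reduce(x, dst=0, parallel_mode=ParallelMode.DATA)        # only rank 0 holds the sum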

View File

@@ -19,12 +19,12 @@ TensorShape = Union[torch.Size, List[int], Tuple[int]]

def _get_tensor_shape(tensor_shape: TensorShape, chunk_tensor: bool = False) -> Tuple[TensorShape, bool]:
    """Get the exact tensor shape when communicating and return whether the tensor is a chunk.

    Args:
        tensor_shape (:class:`torch.Size`): shape of tensor.
        chunk_tensor (bool, optional): whether to chunk tensor, defaults to False.

    Returns:
        Tuple[Union[torch.Size, List[int], Tuple[int]], bool]: exact tensor shape, whether to chunk tensor.
    """
    if chunk_tensor:
        tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
@@ -134,14 +134,14 @@ def _communicate(tensor_send_next=None,

def recv_forward(input_tensor_shape, prev_rank=None, dtype=torch.float, scatter_gather_tensors=False):
    """Copy the forward output from the previous stage in pipeline as the input tensor of this stage.

    Args:
        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.
        prev_rank (int, optional): The rank of the source of the tensor.

    Returns:
        :class:`torch.Tensor`: The input tensor.
    """
    if gpc.is_pipeline_first_stage():
        input_tensor = None

@@ -155,14 +155,14 @@ def recv_forward(input_tensor_shape, prev_rank=None, dtype=torch.float, scatter_

def recv_backward(output_grad_shape, next_rank=None, dtype=torch.float, scatter_gather_tensors=False):
    """Copy the gradient tensor from the next stage in pipeline as the input gradient of this stage.

    Args:
        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.
        next_rank (int, optional): The rank of the source of the tensor.

    Returns:
        :class:`torch.Tensor`: The input gradient tensor.
    """
    if gpc.is_pipeline_last_stage():
        output_tensor_grad = None
@@ -176,12 +176,11 @@ def recv_backward(output_grad_shape, next_rank=None, dtype=torch.float, scatter_

def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False):
    """Sends the input tensor to the next stage in pipeline.

    Args:
        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
        next_rank (int, optional): The rank of the recipient of the tensor.
    """
    if not gpc.is_pipeline_last_stage():
        _communicate(tensor_send_next=output_tensor,

@@ -190,12 +189,11 @@ def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False):

def send_backward(input_tensor_grad, prev_rank=None, scatter_gather_tensors=False):
    """Sends the gradient tensor to the previous stage in pipeline.

    Args:
        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent.
        prev_rank (int, optional): The rank of the recipient of the tensor.
    """
    if not gpc.is_pipeline_first_stage():
        _communicate(tensor_send_prev=input_tensor_grad,
@@ -210,15 +208,15 @@ def send_forward_recv_backward(output_tensor,
                               dtype=torch.float,
                               scatter_gather_tensors=False):
    """Batched communication operation. Sends the input tensor to the
    next stage in pipeline, while receiving the gradient tensor from the
    next stage in pipeline as the input gradient tensor of this stage.

    Args:
        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.

    Returns:
        :class:`torch.Tensor`: The input gradient tensor.
    """
    if gpc.is_pipeline_last_stage():
        output_tensor_grad = None

@@ -238,16 +236,16 @@ def send_backward_recv_forward(input_tensor_grad,
                               prev_rank=None,
                               dtype=torch.float,
                               scatter_gather_tensors=False):
    """Batched communication operation. Sends the gradient tensor to the
    previous stage in pipeline, while receiving the output tensor from the
    previous stage in pipeline as the input of this stage.

    Args:
        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent.
        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.

    Returns:
        :class:`torch.Tensor`: The input tensor.
    """
    if gpc.is_pipeline_first_stage():
        input_tensor = None
@@ -269,15 +267,15 @@ def send_forward_recv_forward(output_tensor,
                              dtype=torch.float,
                              scatter_gather_tensors=False):
    """Batched communication operation. Sends the input tensor to the
    next stage in pipeline, while receiving the output tensor from the
    previous stage in pipeline as the input of this stage.

    Args:
        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.

    Returns:
        :class:`torch.Tensor`: The input tensor.
    """
    input_tensor, _ = _communicate(tensor_send_next=output_tensor,
                                   recv_prev=recv_prev,
@@ -296,16 +294,16 @@ def send_backward_recv_backward(input_tensor_grad,
                                next_rank=None,
                                dtype=torch.float,
                                scatter_gather_tensors=False):
    """Batched communication operation. Sends the gradient tensor to the
    previous stage in pipeline, while receiving the gradient tensor from the
    next stage in pipeline as the input of this stage.

    Args:
        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent.
        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.

    Returns:
        :class:`torch.Tensor`: The input gradient tensor.
    """
    _, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad,
                                         recv_next=recv_next,
@@ -327,20 +325,18 @@ def send_forward_backward_recv_forward_backward(output_tensor,
                                                next_rank=None,
                                                dtype=torch.float,
                                                scatter_gather_tensors=False):
    """Batched communication operation. Sends the input tensor to the next stage in pipeline and
    the gradient tensor to the previous stage, while receiving the input gradient tensor from the
    next stage and the input tensor from the previous stage.

    Args:
        output_tensor (:class:`torch.Tensor`): Tensor sent to the next stage.
        input_tensor_grad (:class:`torch.Tensor`): Tensor sent to the previous stage.
        input_tensor_shape (:class:`torch.Size`): The shape of the tensor received from the previous stage.
        output_grad_shape (:class:`torch.Size`): The shape of the tensor received from the next stage.

    Returns:
        Tuple(Tensor, Tensor): (the input tensor, the input gradient tensor).
    """
    input_tensor, output_tensor_grad = _communicate(
        tensor_send_next=output_tensor,

View File

@@ -9,15 +9,19 @@ from colossalai.utils import get_current_device, synchronize

def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):
    """Sends a tensor to the next member and receives a tensor from the previous member.
    This function returns the received tensor from the previous member.

    Args:
        tensor_send_next (:class:`torch.Tensor`): Tensor sent to the next member.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.

    Returns:
        :class:`torch.Tensor`: The tensor received from the previous member.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
    """
    buffer_shape = tensor_send_next.size()

View File

@ -12,14 +12,13 @@ def send_tensor_meta(tensor, need_meta=True, next_rank=None):
meta information of the tensor should be sent before communications. This function meta information of the tensor should be sent before communications. This function
synchronizes with :func:`recv_tensor_meta`. synchronizes with :func:`recv_tensor_meta`.
:param tensor: Tensor to be sent Args:
:param need_meta: If False, meta information won't be sent tensor (torch.Tensor): Tensor to be sent.
:param next_rank: The rank of the next member in pipeline parallel group need_meta (bool, optional): If False, meta information won't be sent.
:type tensor: Tensor next_rank (int): The rank of the next member in pipeline parallel group.
:type need_meta: bool, optional
:type next_rank: int Returns:
:return: False bool: False
:rtype: bool
""" """
if need_meta: if need_meta:
if next_rank is None: if next_rank is None:
@ -36,17 +35,17 @@ def send_tensor_meta(tensor, need_meta=True, next_rank=None):
def recv_tensor_meta(tensor_shape, prev_rank=None): def recv_tensor_meta(tensor_shape, prev_rank=None):
"""Recieves tensor meta information before recieving a specific tensor. """Receives tensor meta information before receiving a specific tensor.
Since the recipient must know the shape of the tensor in p2p communications, Since the recipient must know the shape of the tensor in p2p communications,
meta information of the tensor should be recieved before communications. This function meta information of the tensor should be received before communications. This function
synchronizes with :func:`send_tensor_meta`. synchronizes with :func:`send_tensor_meta`.
:param tensor_shape: The shape of the tensor to be recieved Args:
:param prev_rank: The rank of the source of the tensor tensor_shape (torch.Size): The shape of the tensor to be received.
:type tensor_shape: torch.Size prev_rank (int): The rank of the source of the tensor.
:type prev_rank: int, optional
:return: The shape of the tensor to be recieved Returns:
:rtype: torch.Size torch.Size: The shape of the tensor to be received.
""" """
if tensor_shape is None: if tensor_shape is None:
if prev_rank is None: if prev_rank is None:
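The shape handshake that ``send_tensor_meta`` / ``recv_tensor_meta`` document can likewise be sketched with bare ``torch.distributed`` calls: send the number of dimensions first, then the shape itself, so the receiver can allocate a matching buffer. The function names here are hypothetical and an initialized process group is assumed::

    import torch
    import torch.distributed as dist

    def send_meta_sketch(tensor: torch.Tensor, next_rank: int) -> None:
        # first the number of dimensions, then the shape itself
        dist.send(torch.tensor([tensor.dim()], dtype=torch.long), next_rank)
        dist.send(torch.tensor(tensor.size(), dtype=torch.long), next_rank)

    def recv_meta_sketch(prev_rank: int) -> torch.Size:
        ndim = torch.empty(1, dtype=torch.long)
        dist.recv(ndim, prev_rank)
        shape = torch.empty(int(ndim.item()), dtype=torch.long)
        dist.recv(shape, prev_rank)
        return torch.Size(shape.tolist())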
@ -67,14 +66,12 @@ def recv_tensor_meta(tensor_shape, prev_rank=None):
def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False): def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
"""Break a tensor into equal 1D chunks. """Break a tensor into equal 1D chunks.
:param tensor: Tensor to be splitted before communication Args:
:param new_buffer: Whether uses a new buffer to store sliced tensor tensor (torch.Tensor): Tensor to be split before communication.
new_buffer (bool, optional): Whether to use a new buffer to store sliced tensor.
:type tensor: torch.Tensor Returns:
:type new_buffer: bool, optional torch.Tensor: The split tensor
:return splitted_tensor: The splitted tensor
:rtype splitted_tensor: torch.Tensor
""" """
partition_size = torch.numel(tensor) // gpc.get_world_size(ParallelMode.PARALLEL_1D) partition_size = torch.numel(tensor) // gpc.get_world_size(ParallelMode.PARALLEL_1D)
start_index = partition_size * gpc.get_local_rank(ParallelMode.PARALLEL_1D) start_index = partition_size * gpc.get_local_rank(ParallelMode.PARALLEL_1D)
@ -92,11 +89,10 @@ def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
def gather_split_1d_tensor(tensor): def gather_split_1d_tensor(tensor):
"""Opposite of above function, gather values from model parallel ranks. """Opposite of above function, gather values from model parallel ranks.
:param tensor: Tensor to be gathered after communication Args:
:type tensor: torch.Tensor tensor (torch.Tensor): Tensor to be gathered after communication.
Returns:
:return gathered: The gathered tensor torch.Tensor: The gathered tensor.
:rtype gathered: torch.Tensor
""" """
world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D) world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
numel = torch.numel(tensor) numel = torch.numel(tensor)
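A single-process sketch of what the split/gather pair above does to a tensor's layout; ``world_size`` and ``rank`` are hypothetical stand-ins for the values the real helpers read from ``gpc``, and ``torch.cat`` stands in for the all-gather::

    import torch

    world_size, rank = 4, 1                      # hypothetical; the real code reads these from gpc
    tensor = torch.arange(16.0)

    # split_tensor_into_1d_equal_chunks: keep only this rank's 1D slice
    partition_size = torch.numel(tensor) // world_size
    start = partition_size * rank
    chunk = tensor.view(-1)[start:start + partition_size]

    # gather_split_1d_tensor: reassemble the flat tensor from every rank's slice
    # (torch.cat here; the real helper uses an all-gather across PARALLEL_1D)
    chunks = [tensor.view(-1)[r * partition_size:(r + 1) * partition_size] for r in range(world_size)]
    gathered = torch.cat(chunks)
    assert torch.equal(gathered, tensor.view(-1))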


@ -12,8 +12,8 @@ class Config(dict):
"""This is a wrapper class for dict objects so that values of which can be """This is a wrapper class for dict objects so that values of which can be
accessed as attributes. accessed as attributes.
:param config: The dict object to be wrapped Args:
:type config: dict config (dict): The dict object to be wrapped.
""" """
def __init__(self, config: dict = None): def __init__(self, config: dict = None):
@ -50,12 +50,14 @@ class Config(dict):
def from_file(filename: str): def from_file(filename: str):
"""Reads a python file and constructs a corresponding :class:`Config` object. """Reads a python file and constructs a corresponding :class:`Config` object.
:param filename: Name of the file to construct the return object Args:
:type filename: str filename (str): Name of the file to construct the return object.
:raises AssertionError: Raises an AssertionError if the file does not exist, or the file
is not .py file Returns:
:return: A :class:`Config` object constructed with information in the file :class:`Config`: A :class:`Config` object constructed with information in the file.
:rtype: :class:`Config`
Raises:
AssertionError: Raises an AssertionError if the file does not exist, or the file is not a .py file.
""" """
# check config path # check config path
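``Config`` is essentially a dict whose keys can also be read as attributes. A minimal, hypothetical stand-in (the real class additionally wraps nested dicts and loads ``.py`` files via ``from_file``) looks like this::

    class AttrDict(dict):
        """Toy dict with attribute access, illustrating the idea behind Config."""

        def __getattr__(self, name):
            try:
                return self[name]
            except KeyError as exc:
                raise AttributeError(name) from exc

        def __setattr__(self, name, value):
            self[name] = value

    cfg = AttrDict(parallel=AttrDict(pipeline=2, tensor=AttrDict(size=4, mode='2d')))
    assert cfg.parallel.tensor.size == 4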


@ -22,6 +22,10 @@ class ParallelContext(metaclass=SingletonMeta):
"""This class provides interface functions for users to get the parallel context, """This class provides interface functions for users to get the parallel context,
such as the global rank, the local rank, the world size, etc. of each device. such as the global rank, the local rank, the world size, etc. of each device.
Note:
The parallel_mode used in this class should be concluded in ``ParallelMode``.
More details about ``ParallelMode`` could be found in
`parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
def __init__(self): def __init__(self):
@ -62,10 +66,12 @@ class ParallelContext(metaclass=SingletonMeta):
def load_config(self, config: Union[dict, str]): def load_config(self, config: Union[dict, str]):
"""Loads the configuration from either a dict or a file. """Loads the configuration from either a dict or a file.
:param config: Either a dict containing the configuration information or the filename Args:
of a file containing the configuration information config (dict or str): Either a dict containing the configuration information or the filename
:type config: dict or str of a file containing the configuration information.
:raises TypeError: Raises a TypeError if `config` is neither a dict or a str
Raises:
TypeError: Raises a TypeError if `config` is neither a dict nor a str.
""" """
if isinstance(config, str): if isinstance(config, str):
self._config = Config.from_file(config) self._config = Config.from_file(config)
@ -81,20 +87,21 @@ class ParallelContext(metaclass=SingletonMeta):
def get_global_rank(self): def get_global_rank(self):
"""Returns the global rank of the current device. """Returns the global rank of the current device.
:return: The global rank of the current device Returns:
:rtype: int int: The global rank of the current device
""" """
return self._global_ranks[ParallelMode.GLOBAL] return self._global_ranks[ParallelMode.GLOBAL]
def add_global_rank(self, parallel_mode: ParallelMode, rank: int): def add_global_rank(self, parallel_mode: ParallelMode, rank: int):
"""Adds the global rank of the current device for `parallel_mode` to the context. """Adds the global rank of the current device for `parallel_mode` to the context.
:param parallel_mode: The parallel mode for the rank Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank.
:param rank: The rank to be added rank (int): The rank to be added
:type rank: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance Raises:
of :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
self._global_ranks[parallel_mode] = rank self._global_ranks[parallel_mode] = rank
@ -102,12 +109,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_local_rank(self, parallel_mode: ParallelMode): def get_local_rank(self, parallel_mode: ParallelMode):
"""Returns the local rank of the current device. """Returns the local rank of the current device.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: The local rank of the current device for `parallel_mode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: int of :class:`colossalai.context.ParallelMode`.
Returns:
int: The local rank of the current device for `parallel_mode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
return self._local_ranks[parallel_mode] return self._local_ranks[parallel_mode]
@ -115,12 +125,13 @@ class ParallelContext(metaclass=SingletonMeta):
def add_local_rank(self, parallel_mode: ParallelMode, rank: int): def add_local_rank(self, parallel_mode: ParallelMode, rank: int):
"""Adds the local rank of the current device for `parallel_mode` to the context. """Adds the local rank of the current device for `parallel_mode` to the context.
:param parallel_mode: The parallel mode for the rank Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank.
:param rank: The rank to be added rank (int): The rank to be added.
:type rank: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance Raises:
of :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
self._local_ranks[parallel_mode] = rank self._local_ranks[parallel_mode] = rank
@ -128,12 +139,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_next_global_rank(self, parallel_mode: ParallelMode): def get_next_global_rank(self, parallel_mode: ParallelMode):
"""Returns the global rank of the next device. """Returns the global rank of the next device.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: The global rank of the next device for `parallel_mode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: int of :class:`colossalai.context.ParallelMode`.
Returns:
int: The global rank of the next device for `parallel_mode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
@ -147,12 +161,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_prev_global_rank(self, parallel_mode: ParallelMode): def get_prev_global_rank(self, parallel_mode: ParallelMode):
"""Returns the global rank of the previous device. """Returns the global rank of the previous device.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: The global rank of the previous device for `parallel_mode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: int of :class:`colossalai.context.ParallelMode`.
Returns:
int: The global rank of the previous device for `parallel_mode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
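The next/previous lookups above amount to modular arithmetic over the group's rank list; with hypothetical values::

    ranks_in_group = [0, 2, 4, 6]          # hypothetical pipeline group
    local_rank = 1                          # this process is global rank 2
    world_size = len(ranks_in_group)

    next_global_rank = ranks_in_group[(local_rank + 1) % world_size]   # 4
    prev_global_rank = ranks_in_group[(local_rank - 1) % world_size]   # 0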
@ -167,13 +184,16 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns a boolean value indicating whether the current device is the first one """Returns a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`. among its group for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: a boolean value indicating whether the current device is the first one AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
among its group for `parallel_mode` of :class:`colossalai.context.ParallelMode`.
:rtype: bool
Returns:
bool: a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`.
""" """
rank = self.get_local_rank(parallel_mode) rank = self.get_local_rank(parallel_mode)
return rank == 0 return rank == 0
@ -182,13 +202,16 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns a boolean value indicating whether the current device is the last one """Returns a boolean value indicating whether the current device is the last one
among its group for `parallel_mode`. among its group for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: a boolean value indicating whether the current device is the last one AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
among its group for `parallel_mode` of :class:`colossalai.context.ParallelMode`.
:rtype: bool
Returns:
bool: a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`.
""" """
rank = self.get_local_rank(parallel_mode) rank = self.get_local_rank(parallel_mode)
world_size = self.get_world_size(parallel_mode) world_size = self.get_world_size(parallel_mode)
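Similarly, ``is_first_rank`` and ``is_last_rank`` reduce to comparing the local rank against the group boundaries::

    local_rank, world_size = 3, 4           # hypothetical values for one parallel mode

    is_first_rank = local_rank == 0                  # False
    is_last_rank = local_rank == world_size - 1      # True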
@ -210,12 +233,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_world_size(self, parallel_mode: ParallelMode): def get_world_size(self, parallel_mode: ParallelMode):
"""Returns the world size for `parallel_mode`. """Returns the world size for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: The world size for `parallel_mode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: int of :class:`colossalai.context.ParallelMode`.
Returns:
int: The world size for `parallel_mode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
return self._world_sizes[parallel_mode] return self._world_sizes[parallel_mode]
@ -223,12 +249,13 @@ class ParallelContext(metaclass=SingletonMeta):
def add_world_size(self, parallel_mode: ParallelMode, world_size: int): def add_world_size(self, parallel_mode: ParallelMode, world_size: int):
"""Adds world size for `parallel_mode`. """Adds world size for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param world_size: The world size to be added world_size (int): The world size to be added
:type world_size: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance Raises:
of :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
self._world_sizes[parallel_mode] = world_size self._world_sizes[parallel_mode] = world_size
@ -236,12 +263,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_group(self, parallel_mode: ParallelMode): def get_group(self, parallel_mode: ParallelMode):
"""Returns the group of the current device for `parallel_mode`. """Returns the group of the current device for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: The group of the current device for `parallel_mode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: torch.distributed.ProcessGroup of :class:`colossalai.context.ParallelMode`.
Returns:
torch.distributed.ProcessGroup: The group of the current device for `parallel_mode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
return self._groups[parallel_mode] return self._groups[parallel_mode]
@ -249,12 +279,13 @@ class ParallelContext(metaclass=SingletonMeta):
def add_group(self, parallel_mode: ParallelMode, group: dist.ProcessGroup): def add_group(self, parallel_mode: ParallelMode, group: dist.ProcessGroup):
"""Adds the group of the current device for `parallel_mode`. """Adds the group of the current device for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param group: The group to be added group (torch.distributed.ProcessGroup): The group to be added
:type group: torch.distributed.ProcessGroup
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance Raises:
of :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
self._groups[parallel_mode] = group self._groups[parallel_mode] = group
@ -262,12 +293,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_ranks_in_group(self, parallel_mode: ParallelMode): def get_ranks_in_group(self, parallel_mode: ParallelMode):
"""Returns the rank of the current device for `parallel_mode` in the group. """Returns the rank of the current device for `parallel_mode` in the group.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: the rank of the current device for `parallel_mode` in the group AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: int of :class:`colossalai.context.ParallelMode`.
Returns:
int: The rank of the current device for `parallel_mode` in the group.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
return self._ranks_in_group[parallel_mode] return self._ranks_in_group[parallel_mode]
@ -275,28 +309,26 @@ class ParallelContext(metaclass=SingletonMeta):
def add_ranks_in_group(self, parallel_mode: ParallelMode, ranks: list): def add_ranks_in_group(self, parallel_mode: ParallelMode, ranks: list):
"""Adds the ranks of the current device for `parallel_mode` in the group. """Adds the ranks of the current device for `parallel_mode` in the group.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param ranks: List of ranks to be added ranks (list): List of ranks to be added
:type ranks: list
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance Raises:
of :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
self._ranks_in_group[parallel_mode] = ranks self._ranks_in_group[parallel_mode] = ranks
def init_global_dist(self, rank: int, world_size: int, backend: str, host: str, port: int): def init_global_dist(self, rank: int, world_size: int, backend: str, host: str, port: int):
"""Initializes the global distributed environment """Initializes the global distributed environment
:param rank: rank for the default process group
:type rank: int Args:
:param world_size: world size of the default process group rank (int): rank for the default process group.
:type world_size: int world_size (int): world size of the default process group.
:param host: the master address for distributed training backend (str): backend for ``torch.distributed``
:type host: str host (str): the master address for distributed training.
:param port: the master port for distributed training port (int): the master port for distributed training.
:type port: str
:param backend: backend for torch.distributed
:type backend: str
""" """
# initialize the default process group # initialize the default process group
init_method = f'tcp://{host}:{port}' init_method = f'tcp://{host}:{port}'
@ -315,8 +347,9 @@ class ParallelContext(metaclass=SingletonMeta):
def check_sanity(self): def check_sanity(self):
"""Checks sanity of the parallel context. """Checks sanity of the parallel context.
:raises AssertionError: Raises an AssertionError if the world size does not equal to the product Raises:
of data paralle size, pipeline parallel size and tensor parallel size AssertionError: Raises an AssertionError if the world size does not equal the product
of data parallel size, pipeline parallel size and tensor parallel size.
""" """
dps = self.data_parallel_size dps = self.data_parallel_size
pps = self.pipeline_parallel_size pps = self.pipeline_parallel_size
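The condition ``check_sanity`` enforces is simply that the three parallel sizes factor the world size, e.g. with hypothetical sizes for an 8-GPU job::

    world_size = 8
    data_parallel_size, pipeline_parallel_size, tensor_parallel_size = 2, 2, 2

    assert world_size == data_parallel_size * pipeline_parallel_size * tensor_parallel_size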
@ -341,7 +374,8 @@ class ParallelContext(metaclass=SingletonMeta):
def init_parallel_groups(self): def init_parallel_groups(self):
"""Initializes the parallel groups. """Initializes the parallel groups.
:raises AssertionError: Raises an AssertionError if the field paralle is not present in the config file Raises:
AssertionError: Raises an AssertionError if the field parallel is not present in the config file.
""" """
# get rank and world size # get rank and world size
@ -411,11 +445,11 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns a boolean value indicating whether `parallel_mode` is initialized """Returns a boolean value indicating whether `parallel_mode` is initialized
in the current system. in the current system.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:return: a boolean value indicating whether `parallel_mode` is initialized
in the current system Returns:
:rtype: bool bool: a boolean value indicating whether `parallel_mode` is initialized in the current system.
""" """
return parallel_mode in self._groups return parallel_mode in self._groups
@ -432,8 +466,8 @@ class ParallelContext(metaclass=SingletonMeta):
def set_device(self, device_ordinal: int = None): def set_device(self, device_ordinal: int = None):
"""Sets distributed processes to be bound to devices. """Sets distributed processes to be bound to devices.
:param device_ordinal: the device id to be bound to Args:
:type device_ordinal: int, optional device_ordinal (int, optional): the device id to be bound to
""" """
global_rank = self.get_global_rank() global_rank = self.get_global_rank()
if device_ordinal is None: if device_ordinal is None:
@ -447,8 +481,8 @@ class ParallelContext(metaclass=SingletonMeta):
def set_seed(self, seed: int): def set_seed(self, seed: int):
"""Sets seeds for all random libraries. """Sets seeds for all random libraries.
:param seed: seed for random states Args:
:type seed: int seed (int): seed for random states
""" """
random.seed(seed) random.seed(seed)
np.random.seed(seed) np.random.seed(seed)


@ -11,8 +11,16 @@ from .process_group_initializer import ProcessGroupInitializer
@DIST_GROUP_INITIALIZER.register_module @DIST_GROUP_INITIALIZER.register_module
class Initializer_1D(ProcessGroupInitializer): class Initializer_1D(ProcessGroupInitializer):
'''A ProcessGroupInitializer for 1d tensor parallelism. """A ProcessGroupInitializer for 1d tensor parallelism.
'''
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -20,8 +28,10 @@ class Initializer_1D(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 1D tensor parallel groups, and assign local_ranks and groups to each gpu. """Initialize 1D tensor parallel groups, and assign local_ranks and groups to each gpu.
:return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
:rtype: Tuple Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
1D tensor parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None


@ -22,12 +22,16 @@ def _check_summa_env_var(summa_dim):
class Initializer_2D_Row(ProcessGroupInitializer): class Initializer_2D_Row(ProcessGroupInitializer):
"""2d tensor parallel initialization among rows. """2d tensor parallel initialization among rows.
:param num_group: The number of all tensor groups
:param summa_dim: The dimension of SUMMA Args:
:param args: Args used to initialize base class num_group (int): The number of all tensor groups.
:param kwargs: Kwargs used to initialize base class summa_dim (int): The dimension of SUMMA.
:type num_group: int rank (int): The rank of current process.
:type summa_dim: int world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, num_group, summa_dim, *args, **kwargs): def __init__(self, num_group, summa_dim, *args, **kwargs):
@ -37,9 +41,9 @@ class Initializer_2D_Row(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu.
Returns:
:return: 2D tensor row parallelism's information Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) 2D tensor row parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -64,13 +68,15 @@ class Initializer_2D_Row(ProcessGroupInitializer):
class Initializer_2D_Col(ProcessGroupInitializer): class Initializer_2D_Col(ProcessGroupInitializer):
"""2d tensor parallel initialization among cols. """2d tensor parallel initialization among cols.
:param num_group: The number of all tensor groups Args:
:param summa_dim: The dimension of SUMMA num_group (int): The number of all tensor groups.
:param args: Args used to initialize base class summa_dim (int): The dimension of SUMMA.
:param kwargs: Kwargs used to initialize base class rank (int): The rank of current process.
world_size (int): Size of whole communication world.
:type num_group: int config (Config): Running configuration.
:type summa_dim: int data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, num_group, summa_dim, *args, **kwargs): def __init__(self, num_group, summa_dim, *args, **kwargs):
@ -81,8 +87,9 @@ class Initializer_2D_Col(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu.
:return: 2D tensor col parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2D tensor col parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -109,8 +116,13 @@ class Initializer_2D(ProcessGroupInitializer):
""" """
Serve as the single entry point to 2D parallel initialization. Serve as the single entry point to 2D parallel initialization.
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -127,8 +139,10 @@ class Initializer_2D(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2D tensor row and col parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2D tensor row and col parallel groups, and assign local_ranks and groups to each gpu.
:return: 2D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode) Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
2D tensor parallelism's information in a list of tuples.
""" """
parallel_setting = [self.row_initializer.init_dist_group(), self.col_initializer.init_dist_group()] parallel_setting = [self.row_initializer.init_dist_group(), self.col_initializer.init_dist_group()]
return parallel_setting return parallel_setting
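To make the 2D (SUMMA) grouping concrete, here is one plausible rank layout for ``tensor_parallel_size = 4`` on 8 GPUs; the exact ordering ColossalAI uses may differ, so the numbers are illustrative only::

    import math

    world_size, tensor_parallel_size = 8, 4           # hypothetical
    summa_dim = int(math.sqrt(tensor_parallel_size))  # 2
    num_group = world_size // tensor_parallel_size    # 2 independent tensor groups

    for g in range(num_group):
        offset = g * tensor_parallel_size
        row_groups = [[offset + r * summa_dim + c for c in range(summa_dim)] for r in range(summa_dim)]
        col_groups = [[offset + r * summa_dim + c for r in range(summa_dim)] for c in range(summa_dim)]
        print(g, row_groups, col_groups)
        # group 0 -> rows [[0, 1], [2, 3]], cols [[0, 2], [1, 3]]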


@ -31,14 +31,17 @@ def _check_tesseract_env_var(tesseract_dim: int, tesseract_dep: int):
# i row j col k dep # i row j col k dep
class Initializer_2p5D_ROW(ProcessGroupInitializer): class Initializer_2p5D_ROW(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among rows. """2.5d tensor parallel initialization among rows.
:param tesseract_dim: The dimension of tesseract Args:
:param tesseract_dep: The dimension of depth tesseract_dim (int): The dimension of tesseract.
:param args: Args used to initialize base class tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
:type tesseract_dim: int world_size (int): Size of whole communication world.
:type tesseract_dep: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -50,10 +53,11 @@ class Initializer_2p5D_ROW(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel" "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2p5D tensor row parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2.5D tensor row parallel groups, and assign local_ranks and groups to each gpu.
:return: 2p5D tensor row parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor row parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -80,14 +84,17 @@ class Initializer_2p5D_ROW(ProcessGroupInitializer):
class Initializer_2p5D_Col(ProcessGroupInitializer): class Initializer_2p5D_Col(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among cols. """2.5d tensor parallel initialization among cols.
:param tesseract_dim: The dimension of tesseract Args:
:param tesseract_dep: The dimension of depth tesseract_dim (int): The dimension of tesseract.
:param args: Args used to initialize base class tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
:type tesseract_dim: int world_size (int): Size of whole communication world.
:type tesseract_dep: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -99,10 +106,11 @@ class Initializer_2p5D_Col(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel" "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2p5D tensor col parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2.5D tensor col parallel groups, and assign local_ranks and groups to each gpu.
:return: 2p5D tensor col parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor col parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -129,14 +137,17 @@ class Initializer_2p5D_Col(ProcessGroupInitializer):
class Initializer_2p5D_Dep(ProcessGroupInitializer): class Initializer_2p5D_Dep(ProcessGroupInitializer):
"""2p5D tensor parallel initialization among depths. """2.5D tensor parallel initialization among depths.
:param tesseract_dim: The dimension of tesseract Args:
:param tesseract_dep: The dimension of depth tesseract_dim (int): The dimension of tesseract.
:param args: Args used to initialize base class tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
:type tesseract_dim: int world_size (int): Size of whole communication world.
:type tesseract_dep: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -148,10 +159,11 @@ class Initializer_2p5D_Dep(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel" "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2p5D tensor depth parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2.5D tensor depth parallel groups, and assign local_ranks and groups to each gpu.
:return: 2p5D tensor depth parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor depth parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -179,14 +191,17 @@ class Initializer_2p5D_Dep(ProcessGroupInitializer):
# i row j col k dep # i row j col k dep
class Initializer_2p5D_XZ(ProcessGroupInitializer): class Initializer_2p5D_XZ(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among cols times dep. """2.5d tensor parallel initialization among cols times dep.
:param tesseract_dim: The dimension of tesseract Args:
:param tesseract_dep: The dimension of depth tesseract_dim (int): The dimension of tesseract.
:param args: Args used to initialize base class tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
:type tesseract_dim: int world_size (int): Size of whole communication world.
:type tesseract_dep: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -198,10 +213,11 @@ class Initializer_2p5D_XZ(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel" "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2p5D tensor colXdepth parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2.5D tensor colXdepth parallel groups, and assign local_ranks and groups to each gpu.
:return: 2p5D tensor colXdepth parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor colXdepth parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -232,20 +248,14 @@ class Initializer_2p5D(ProcessGroupInitializer):
""" """
Serve as the single entry point to Tesseract parallel initialization. Serve as the single entry point to Tesseract parallel initialization.
:param rank: The rank of current process Args:
:param world_size: Size of whole communication world rank (int): The rank of current process.
:param config: Running configuration world_size (int): Size of whole communication world.
:param data_parallel_size: Size of data parallel config (Config): Running configuration.
:param pipeline_parallel_size: Size of pipeline parallel data_parallel_size (int): Size of data parallel.
:param tensor_parallel_size: Size of tensor parallel pipeline_parallel_size (int): Size of pipeline parallel.
:param depth: The depth of 2p5d parallel tensor_parallel_size (int): Size of tensor parallel.
:type rank: int depth (int): The depth of 2.5d parallel.
:type world_size: int
:type config: Config
:type data_parallel_size: int
:type pipeline_parallel_size: int
:type tensor_parallel_size: int
:type depth: int
""" """
def __init__(self, rank: int, world_size: int, config: Config, data_parallel_size: int, pipeline_parallel_size: int, def __init__(self, rank: int, world_size: int, config: Config, data_parallel_size: int, pipeline_parallel_size: int,
@ -266,9 +276,11 @@ class Initializer_2p5D(ProcessGroupInitializer):
self.xz_initializer = Initializer_2p5D_XZ(self.tesseract_dim, self.tesseract_dep, *args) self.xz_initializer = Initializer_2p5D_XZ(self.tesseract_dim, self.tesseract_dep, *args)
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2p5D tensor row, col, depth, and colXdepth parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2.5D tensor row, col, depth, and colXdepth parallel groups, and assign local_ranks and groups to each gpu.
:return: Whole 2p5D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode) Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Whole 2.5D tensor parallelism's information in a list of tuples.
""" """
parallel_setting = [ parallel_setting = [
self.col_initializer.init_dist_group(), self.col_initializer.init_dist_group(),
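For the 2.5D case, the tesseract dimension follows from the tensor parallel size and the chosen depth; the assertion mirrors the one shown in the initializers above. With hypothetical numbers::

    import math

    tensor_parallel_size, depth = 8, 2                           # hypothetical 2.5D setting
    tesseract_dim = int(math.sqrt(tensor_parallel_size / depth))  # 2

    assert tensor_parallel_size == depth * tesseract_dim ** 2, \
        "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"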


@ -26,12 +26,15 @@ def _check_depth_env_var(depth):
class Initializer_3D_Input(ProcessGroupInitializer): class Initializer_3D_Input(ProcessGroupInitializer):
"""3D tensor parallel initialization among input. """3D tensor parallel initialization among input.
:param num_group: The number of all tensor groups Args:
:param depth: Depth of 3D parallelism num_group (int): The number of all tensor groups.
:param args: Args used in base class depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
:type num_group: int world_size (int): Size of whole communication world.
:type depth: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, num_group: int, depth: int, *args): def __init__(self, num_group: int, depth: int, *args):
@ -42,8 +45,9 @@ class Initializer_3D_Input(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 3D tensor parallel groups among input, and assign local_ranks and groups to each gpu. """Initialize 3D tensor parallel groups among input, and assign local_ranks and groups to each gpu.
:return: 3D tensor parallelism's information among input Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among input in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -70,12 +74,15 @@ class Initializer_3D_Input(ProcessGroupInitializer):
class Initializer_3D_Weight(ProcessGroupInitializer): class Initializer_3D_Weight(ProcessGroupInitializer):
"""3D tensor parallel initialization among weight. """3D tensor parallel initialization among weight.
:param num_group: The number of all tensor groups Args:
:param depth: Depth of 3D parallelism num_group (int): The number of all tensor groups.
:param args: Args used in base class depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
:type num_group: int world_size (int): Size of whole communication world.
:type depth: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, num_group: int, depth: int, *args): def __init__(self, num_group: int, depth: int, *args):
@ -86,8 +93,9 @@ class Initializer_3D_Weight(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 3D tensor parallel groups among weight, and assign local_ranks and groups to each gpu. """Initialize 3D tensor parallel groups among weight, and assign local_ranks and groups to each gpu.
:return: 3D tensor parallelism's information among weight Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among weight in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -114,12 +122,15 @@ class Initializer_3D_Weight(ProcessGroupInitializer):
class Initializer_3D_Output(ProcessGroupInitializer): class Initializer_3D_Output(ProcessGroupInitializer):
"""3D tensor parallel initialization among output. """3D tensor parallel initialization among output.
:param num_group: The number of all tensor groups Args:
:param depth: Depth of 3D parallelism num_group (int): The number of all tensor groups.
:param args: Args used in base class depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
:type num_group: int world_size (int): Size of whole communication world.
:type depth: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, num_group: int, depth: int, *args): def __init__(self, num_group: int, depth: int, *args):
@ -130,8 +141,9 @@ class Initializer_3D_Output(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 3D tensor parallel groups among output, and assign local_ranks and groups to each gpu. """Initialize 3D tensor parallel groups among output, and assign local_ranks and groups to each gpu.
:return: 3D tensor parallelism's information among output Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among output in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -158,7 +170,14 @@ class Initializer_3D_Output(ProcessGroupInitializer):
@DIST_GROUP_INITIALIZER.register_module @DIST_GROUP_INITIALIZER.register_module
class Initializer_3D(ProcessGroupInitializer): class Initializer_3D(ProcessGroupInitializer):
"""Serve as the single entry point to 3D parallel initialization. """Serve as the single entry point to 3D parallel initialization.
:param args: Args used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, *args): def __init__(self, *args):
@ -175,8 +194,10 @@ class Initializer_3D(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 3D tensor parallel groups, and assign local_ranks and groups to each gpu. """Initialize 3D tensor parallel groups, and assign local_ranks and groups to each gpu.
:return: 3D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode) Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Whole 3D tensor parallelism's information in a list of tuples.
""" """
parallel_setting = [ parallel_setting = [
self.input_initializer.init_dist_group(), self.input_initializer.init_dist_group(),
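For 3D parallelism the depth is the cube root of the tensor parallel size, with one input, weight and output group per tensor group. Again with hypothetical numbers::

    world_size, tensor_parallel_size = 8, 8           # hypothetical
    depth = round(tensor_parallel_size ** (1 / 3))    # 2

    assert depth ** 3 == tensor_parallel_size
    num_group = world_size // tensor_parallel_size    # a single 3D tensor group here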


@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Data(ProcessGroupInitializer): class Initializer_Data(ProcessGroupInitializer):
"""A ProcessGroupInitializer for data parallelism. """A ProcessGroupInitializer for data parallelism.
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -22,8 +27,9 @@ class Initializer_Data(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize data parallel groups, and assign local_ranks and groups to each gpu. """Initialize data parallel groups, and assign local_ranks and groups to each gpu.
:return: Data parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Data parallelism's information tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None


@ -12,8 +12,13 @@ class Initializer_Model(ProcessGroupInitializer):
"""A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel """A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel
groups). groups).
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -24,8 +29,9 @@ class Initializer_Model(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize model parallel groups, and assign local_ranks and groups to each gpu. """Initialize model parallel groups, and assign local_ranks and groups to each gpu.
:return: (local_rank, group_world_size, process_group, ranks_in_group, mode) Returns:
:rtype: Tuple Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Model parallelism's information tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None


@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Pipeline(ProcessGroupInitializer): class Initializer_Pipeline(ProcessGroupInitializer):
"""A ProcessGroupInitializer for pipeline parallelism. """A ProcessGroupInitializer for pipeline parallelism.
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process
world_size (int): Size of whole communication world
config (Config): Running configuration
data_parallel_size (int): Size of data parallel
pipeline_parallel_size (int): Size of pipeline parallel
tensor_parallel_size (int): Size of tensor parallel
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -23,8 +28,9 @@ class Initializer_Pipeline(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu. """Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu.
:return: Pipeline parallelism's information Returns:
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode) List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Pipeline parallelism's information in a list of tuples.
""" """
dist_settings = list() dist_settings = list()
for i in range(self.data_parallel_size): for i in range(self.data_parallel_size):


@ -15,8 +15,13 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
In Sequence Parallelism, each GPU holds the full copy of model weights, In Sequence Parallelism, each GPU holds the full copy of model weights,
thus, gradient all-reduce occurs across all processes in the same pipeline stage thus, gradient all-reduce occurs across all processes in the same pipeline stage
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process
world_size (int): Size of whole communication world
config (Config): Running configuration
data_parallel_size (int): Size of data parallel
pipeline_parallel_size (int): Size of pipeline parallel
tensor_parallel_size (int): Size of tensor parallel
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -27,8 +32,8 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize Sequence Parallel process groups used for gradient all-reduce. """Initialize Sequence Parallel process groups used for gradient all-reduce.
:return: (local_rank, group_world_size, process_group, ranks_in_group, mode) Returns:
:rtype: Tuple Tuple: A tuple (local_rank, group_world_size, process_group, ranks_in_group, mode).
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -52,8 +57,13 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
class Initializer_Sequence(ProcessGroupInitializer): class Initializer_Sequence(ProcessGroupInitializer):
"""A ProcessGroupInitializer for sequence parallelism. """A ProcessGroupInitializer for sequence parallelism.
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, def __init__(self,
*args, **kwargs): *args, **kwargs):
@ -66,11 +76,12 @@ class Initializer_Sequence(ProcessGroupInitializer):
"""Initialize Sequence parallel process groups and assign local_ranks and groups to each gpu. """Initialize Sequence parallel process groups and assign local_ranks and groups to each gpu.
Sequence parallelism requires 2 process groups. The first is for model forward where several processes Sequence parallelism requires 2 process groups. The first is for model forward where several processes
exchange paritial query, key and value embedding to compute self attention values. The second is for exchange partial query, key and value embedding to compute self attention values. The second is for
all-reduce to synchronize the model parameters. all-reduce to synchronize the model parameters.
:return: Sequence parallelism's information Returns:
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode) List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Sequence parallelism's information in a list of tuples.
""" """
parallel_setting = [] parallel_setting = []


@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Tensor(ProcessGroupInitializer): class Initializer_Tensor(ProcessGroupInitializer):
"""A ProcessGroupInitializer for tensor parallelism. """A ProcessGroupInitializer for tensor parallelism.
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -22,8 +27,9 @@ class Initializer_Tensor(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize tensor parallel groups, and assign local_ranks and groups to each gpu. """Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.
:return: Tensor parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Tensor parallelism's information tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None


@ -9,19 +9,13 @@ from colossalai.context import Config
class ProcessGroupInitializer(ABC): class ProcessGroupInitializer(ABC):
"""An object, knowing the parallelism configuration, that initializes parallel groups. """An object, knowing the parallelism configuration, that initializes parallel groups.
:param rank: The rank of current process Args:
:param world_size: Size of whole communication world rank (int): The rank of current process.
:param config: Running configuration world_size (int): Size of whole communication world.
:param data_parallel_size: Size of data parallel config (Config): Running configuration.
:param pipeline_parallel_size: Size of pipeline parallel data_parallel_size (int): Size of data parallel.
:param tensor_parallel_size: Size of tensor parallel pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
:type rank: int
:type world_size: int
:type config: Config
:type data_parallel_size: int
:type pipeline_parallel_size: int
:type tensor_parallel_size: int
""" """
def __init__(self, def __init__(self,
rank: int, rank: int,


@ -16,8 +16,8 @@ _SEED_MANAGER = SeedManager()
def get_seeds(): def get_seeds():
"""Returns the seeds of the seed manager. """Returns the seeds of the seed manager.
:return: The seeds of the seed manager Returns:
:rtype: dict dict: The seeds of the seed manager.
""" """
return _SEED_MANAGER.seeds return _SEED_MANAGER.seeds
@ -25,8 +25,8 @@ def get_seeds():
def get_states(copy=False): def get_states(copy=False):
"""Returns the seed states of the seed manager. """Returns the seed states of the seed manager.
:return: The seed states of the seed manager Returns:
:rtype: dict dict: The seed states of the seed manager.
""" """
states = _SEED_MANAGER.seed_states states = _SEED_MANAGER.seed_states
@ -43,8 +43,8 @@ def get_states(copy=False):
def get_current_mode(): def get_current_mode():
"""Returns the current mode of the seed manager. """Returns the current mode of the seed manager.
:return: The current mode of the seed manager. Returns:
:rtype: :class:`torch.ByteTensor` :class:`torch.ByteTensor`: The current mode of the seed manager.
""" """
return _SEED_MANAGER.current_mode return _SEED_MANAGER.current_mode
@ -52,12 +52,16 @@ def get_current_mode():
def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False): def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`. """Adds a seed to the seed manager for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param seed: The seed to be added seed (int): The seed to be added
:type seed: int Raises:
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
_SEED_MANAGER.add_seed(parallel_mode, seed, overwrite) _SEED_MANAGER.add_seed(parallel_mode, seed, overwrite)
@ -65,8 +69,12 @@ def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
def set_mode(parallel_mode: ParallelMode): def set_mode(parallel_mode: ParallelMode):
"""Sets the current mode of the seed manager. """Sets the current mode of the seed manager.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
_SEED_MANAGER.set_mode(parallel_mode) _SEED_MANAGER.set_mode(parallel_mode)
@ -74,11 +82,12 @@ def set_mode(parallel_mode: ParallelMode):
def set_seed_states(parallel_mode: ParallelMode, state: Tensor): def set_seed_states(parallel_mode: ParallelMode, state: Tensor):
"""Sets the state of the seed manager for `parallel_mode`. """Sets the state of the seed manager for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param state: the state to be set state (:class:`torch.Tensor`): the state to be set.
:type state: :class:`torch.Tensor`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager.
""" """
_SEED_MANAGER.set_state(parallel_mode, state) _SEED_MANAGER.set_state(parallel_mode, state)
@ -98,6 +107,9 @@ def seed(parallel_mode: ParallelMode):
with seed(ParallelMode.DATA): with seed(ParallelMode.DATA):
output = F.dropout(input) output = F.dropout(input)
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
try: try:
# set to new mode # set to new mode
@ -125,6 +137,9 @@ def with_seed(func, parallel_mode: ParallelMode):
wrapper_forward = with_seed(forward, ParallelMode.DATA) wrapper_forward = with_seed(forward, ParallelMode.DATA)
out = wrapped_forward(input) out = wrapped_forward(input)
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
@functools.wraps(func) @functools.wraps(func)
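A minimal, hedged sketch of how these seed utilities fit together; the import path follows this file's location, the seed value is illustrative, ``input_tensor`` is a placeholder, and a CUDA setup is assumed since the manager tracks per-mode RNG states:

>>> import torch.nn.functional as F
>>> from colossalai.context import ParallelMode
>>> from colossalai.context.random import add_seed, seed, set_mode
>>> add_seed(ParallelMode.DATA, 1024)     # register an RNG state for the DATA mode
>>> set_mode(ParallelMode.DATA)           # make it the current state
>>> with seed(ParallelMode.DATA):         # switch temporarily, then restore the previous state
...     output = F.dropout(input_tensor)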


@ -9,6 +9,10 @@ from colossalai.context.parallel_mode import ParallelMode
class SeedManager: class SeedManager:
"""This class is a manager of all random seeds involved in the system. """This class is a manager of all random seeds involved in the system.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
def __init__(self): def __init__(self):
@ -30,12 +34,12 @@ class SeedManager:
def set_state(self, parallel_mode: ParallelMode, state: Tensor): def set_state(self, parallel_mode: ParallelMode, state: Tensor):
"""Sets the state of the seed manager for `parallel_mode`. """Sets the state of the seed manager for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
state (:class:`torch.Tensor`): the state to be set.
:param parallel_mode: The chosen parallel mode Raises:
:type parallel_mode: :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager.
:param state: the state to be set
:type state: :class:`torch.Tensor`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
""" """
assert parallel_mode in self._seed_states, f'Parallel mode {parallel_mode} is not found in the seed manager' assert parallel_mode in self._seed_states, f'Parallel mode {parallel_mode} is not found in the seed manager'
self._seed_states[parallel_mode] = state self._seed_states[parallel_mode] = state
@ -43,8 +47,8 @@ class SeedManager:
def set_mode(self, parallel_mode: ParallelMode): def set_mode(self, parallel_mode: ParallelMode):
"""Sets the current mode of the seed manager. """Sets the current mode of the seed manager.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
""" """
if self.current_mode: if self.current_mode:
# save the current state for current mode # save the current state for current mode
@ -57,14 +61,14 @@ class SeedManager:
def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False): def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`. """Adds a seed to the seed manager for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param seed: The seed to be added seed (int): The seed to be added.
:type seed: int overwrtie (bool, optional): Whether it is allowed to overwrite the seed that has been set already.
:param overwrtie: Whether allows to overwrite the seed that has been set already
:type overwrtie: bool, optional Raises:
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added.
""" """
assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided' assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
if overwrtie is False: if overwrtie is False:


@ -19,20 +19,37 @@ class Engine:
:meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset. :meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.
It controls a iteration in training. It controls a iteration in training.
:param model: The neural network model Args:
:type model: ``torch.nn.Module`` model (``torch.nn.Module``): The neural network model.
:param optimizer: Optimizer for updating the parameters optimizer (``torch.optim.Optimizer``): Optimizer for updating the parameters.
:type optimizer: ``torch.optim.Optimizer`` criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss.
:param criterion: Loss function for calculating loss gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handler used in backward.
:type criterion: ``torch.nn.modules.loss._Loss``, optional clip_grad_norm (float, optional): The norm of gradient clipping.
:param gradient_handlers: A list of gradient handler used in backward ophook_list (list): List of ophook.
:type gradient_handlers: a list of ``BaseGradientHandler``, optional verbose (bool): whether to display log info.
:param clip_grad_norm: The norm of gradient clipping
:type clip_grad_norm: float, optional Examples:
:param ophook_list: List of ophook >>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
:type ophook_list: list >>> model = ...
:param verbose: whether to display log info >>> criterion = ...
:type verbose: bool >>> optimizer = ...
>>> train_dataloader = ...
>>> engine, _, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> engine.train()
>>> for inputs, labels in train_dataloader:
>>> # set gradients to zero
>>> engine.zero_grad()
>>> # run forward pass
>>> outputs = engine(inputs)
>>> # compute loss value and run backward pass
>>> loss = engine.criterion(outputs, labels)
>>> engine.backward(loss)
>>> # update parameters
>>> engine.step()
An example of using Engine in training can be found in
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_ and
`Run resnet cifar10 with engine <https://github.com/hpcaitech/ColossalAI-Examples/blob/main/image/resnet/run_resnet_cifar10_with_engine.py>`_.
""" """
def __init__(self, def __init__(self,
@ -113,10 +130,10 @@ class Engine:
return self.optimizer.step() return self.optimizer.step()
def backward(self, loss: Tensor): def backward(self, loss: Tensor):
"""Start backward propagation given the loss value computed by a loss function """Start backward propagation given the loss value computed by a loss function.
:param loss: Loss value computed by a loss function Args:
:type loss: :class:`torch.Tensor` loss (:class:`torch.Tensor`): Loss value computed by a loss function.
""" """
ret = self.optimizer.backward(loss) ret = self.optimizer.backward(loss)
for ophook in self._ophook_list: for ophook in self._ophook_list:
@ -124,34 +141,22 @@ class Engine:
return ret return ret
def backward_by_grad(self, tensor, grad): def backward_by_grad(self, tensor, grad):
"""Start backward propagation given the gradient of the output tensor """Start backward propagation given the gradient of the output tensor.
:param tensor: Output tensor Args:
:type tensor: :class:`torch.Tensor` tensor (:class:`torch.Tensor`): Output tensor.
:param grad: Gradient passed back to the output grad (:class:`torch.Tensor`): Gradient passed back to the output.
:type grad: :class:`torch.Tensor`
""" """
ret = self.optimizer.backward_by_grad(tensor, grad) ret = self.optimizer.backward_by_grad(tensor, grad)
for ophook in self._ophook_list: for ophook in self._ophook_list:
ophook.post_iter() ophook.post_iter()
return ret return ret
def calc_loss(self, *args, **kwargs):
"""Compute the loss value
:param args: Args used in criterion function
:param kwargs: Kwargs used in criterion function
:return: The loss value
:rtype: :class:`torch.Tensor`
"""
return self.criterion(*args, **kwargs)
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
"""Run the forward step for the model """Run the forward step for the model.
:return: Output the model Returns:
:rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor` Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`: Output of the model.
""" """
return self.model(*args, **kwargs) return self.model(*args, **kwargs)


@ -8,10 +8,9 @@ class BaseGradientHandler(ABC):
"""A basic helper class to handle all-reduce operations of gradients across different parallel groups """A basic helper class to handle all-reduce operations of gradients across different parallel groups
before optimization. before optimization.
:param model: Model where the gradients accumulate Args:
:param optimizer: Optimizer for updating the parameters model (Module): Model where the gradients accumulate.
:type model: Module optimizer (Optimizer): Optimizer for updating the parameters.
:type optimizer: Optimizer
""" """
def __init__(self, model, optimizer): def __init__(self, model, optimizer):
self._model = model self._model = model


@ -17,12 +17,11 @@ import math
class MemTracerOpHook(BaseOpHook): class MemTracerOpHook(BaseOpHook):
""" """
Collect GPU memory usage information Collect GPU memory usage information
:param warmup: This parameter indicates how many iterations to truncate before profiling, defaults to 50
:type warmup: int Args:
:param refreshrate: This parameter decides the frequency of write file, defaults to 10 warmup (int): This parameter indicates how many iterations to truncate before profiling, defaults to 50.
:type refreshrate: int refreshrate (int): This parameter decides how often the stats file is written, defaults to 10.
:param data_prefix: The prefix of the stats data file, defaults to "memstats" data_prefix (string): The prefix of the stats data file, defaults to "memstats".
:type data_prefix: string
""" """
def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"): def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"):
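A hedged sketch of wiring the hook into an Engine; the keyword name ``ophook_list`` follows the Engine docstring above, and ``model``/``optimizer``/``criterion`` are placeholders:

>>> mem_hook = MemTracerOpHook(warmup=50, refreshrate=10, data_prefix="memstats")
>>> engine = Engine(model=model, optimizer=optimizer, criterion=criterion,
...                 ophook_list=[mem_hook])   # memory stats are flushed to memstats* files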


@ -15,8 +15,12 @@ class BaseSchedule(ABC):
"""A basic helper class to control the process of training or evaluation. """A basic helper class to control the process of training or evaluation.
It mainly composes of forward_backward_step for gradient backward and It mainly composes of forward_backward_step for gradient backward and
optimizer_step for parameters update. optimizer_step for parameters update.
For the convenience to enable FP16, we aggreate all codes that contain the For the convenience to enable FP16, we aggregate all codes that contain the
control of FP16 in class schedule. control of FP16 in class schedule.
Args:
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
and it will be executed in load_batch.
""" """
def __init__(self, batch_data_process_func: Callable = None): def __init__(self, batch_data_process_func: Callable = None):
@ -46,13 +50,12 @@ class BaseSchedule(ABC):
"""Loads a batch from data iterator. It returns the data and labels which are """Loads a batch from data iterator. It returns the data and labels which are
already in the same GPU as where the model's. already in the same GPU as where the model's.
:param data_iter: Data iterator from which get a batch of data Args:
:type data_iter: DataIter data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
:param to_gpu: Whether the data should be moved to GPU to_gpu (bool, optional): Whether the data should be moved to GPU
:type to_gpu: bool, optional
:return: (data, label) Returns:
:rtype: (:class:`Tensor`, :class:`torch.Tensor`) Tuple (:class:`torch.Tensor`, :class:`torch.Tensor`): A tuple of (data, label).
""" """
if data_iter is None: if data_iter is None:
raise RuntimeError('Dataloader is not defined.') raise RuntimeError('Dataloader is not defined.')
@ -87,16 +90,12 @@ class BaseSchedule(ABC):
): ):
"""The process function over a batch of dataset for training or evaluation. """The process function over a batch of dataset for training or evaluation.
:param engine: Colossalai training engine Args:
:type engine: colossalai.engine.Engine engine (colossalai.engine.Engine): Colossalai engine for training and inference.
:param data_iter: Data iterator from which get a batch of data data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
:type data_iter: DataIter forward_only (bool): If True, the process won't include backward.
:param forward_only: If True, the process won't include backward return_loss (bool, optional): If False, the loss won't be returned.
:type forward_only: bool return_output_label (bool, optional): If False, the output and label won't be returned.
:param return_loss: If False, the loss won't be returned
:type return_loss: bool, optional
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool, optional
""" """
pass pass
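A hedged sketch of the documented call pattern, assuming ``schedule`` is a concrete subclass instance and ``train_dataloader`` already exists:

>>> data_iter = iter(train_dataloader)
>>> data, label = schedule.load_batch(data_iter, to_gpu=True)   # batch moved to the model's device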


@ -15,6 +15,10 @@ class NonPipelineSchedule(BaseSchedule):
During one process, it loads a batch of dataset and feeds it to the model. During one process, it loads a batch of dataset and feeds it to the model.
After getting the output and calculating the loss, it will use :meth:`step` After getting the output and calculating the loss, it will use :meth:`step`
to update the parameters if it is in training mode. to update the parameters if it is in training mode.
Args:
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
and it will be executed in load_batch.
""" """
def forward_backward_step(self, def forward_backward_step(self,
@ -23,22 +27,19 @@ class NonPipelineSchedule(BaseSchedule):
forward_only: bool = False, forward_only: bool = False,
return_loss: bool = True, return_loss: bool = True,
return_output_label: bool = True): return_output_label: bool = True):
"""The process function that loads loads a batch of dataset and feeds it to the model. """The process function that loads a batch of dataset and feeds it to the model.
The returned labels and loss will be None if :attr:`return_loss` is False. The returned labels and loss will be None if :attr:`return_loss` is False.
:param engine: Model for training and inference Args:
:param data_iter: Data iterator of the dataloader, e.g. iter(dataloader) engine (colossalai.engine.Engine): Colossalai engine for training and inference.
:param forward_only: If True, the model is run for the forward pass, else back propagation will be executed data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
:param return_loss: Loss will be returned if True forward_only (bool, optional):
:param return_output_label: Output and label will be returned if True If True, the model is run for the forward pass, else back propagation will be executed.
:type engine: Iterator return_loss (bool, optional): Loss will be returned if True.
:type data_iter: Iterator return_output_label (bool, optional): Output and label will be returned if True.
:type forward_only: bool, optional
:type return_loss: bool, optional
:type return_output_label: bool, optional
:return: (output, label, loss) Returns:
:rtype: Tuple[:class:`torch.Tensor`] Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
""" """
assert forward_only or return_loss, \ assert forward_only or return_loss, \
"The argument 'return_loss' has to be True when 'forward_only' is False, but got False." "The argument 'return_loss' has to be True when 'forward_only' is False, but got False."


@ -41,14 +41,13 @@ class PipelineSchedule(BaseSchedule):
It uses non-interleaved 1F1B strategy. Other properties are similar as It uses non-interleaved 1F1B strategy. Other properties are similar as
:class:`NonPipelineSchedule`. :class:`NonPipelineSchedule`.
:param num_microbatches: The number of microbatches Args:
:type num_microbatches: int num_microbatches (int): The number of microbatches.
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch` batch_data_process_func (Callable, optional):
:type batch_data_process_func: Callable, optional The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
:param tensor_shape: Specified shape in pipeline communication tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
:type tensor_shape: torch.Size, optional scatter_gather_tensors (bool, optional):
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
:type scatter_gather_tensors: bool, optional
""" """
def __init__(self, def __init__(self,
@ -131,19 +130,14 @@ class PipelineSchedule(BaseSchedule):
is obtained from data_iterator, otherwise the passed-in input_tensor is used. is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users. Returns output tensor. This is a helper function and can be ignored by users.
:param engine: Your engine object Args:
:type engine: colossalai.engine.Engine engine (colossalai.engine.Engine): Colossalai engine for training and inference.
:param input_tensor: Input tensor for this pipeline stage input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
:type input_tensor: :class:`torch.Tensor` return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
:param return_tensors: A list of tensors to return return_output_label (bool, optional): Whether returns output labels.
:type return_tensors: List[:class:`torch.Tensor`] accum_loss (optional): Where the accumulated loss is stored.
:param return_output_label: Whether returns output labels Returns:
:type return_output_label: bool, optional :class:`torch.Tensor`: output or the loss value of the current pipeline stage.
:param accum_loss: Where accumulated loss stores
:type accum_loss: optional
:return: output or the loss value of the current pipeline stage
:rtype: :class:`torch.Tensor`
""" """
data, label = self.load_micro_batch() data, label = self.load_micro_batch()
output_tensor = self._call_engine(engine.model, input_tensor, data) output_tensor = self._call_engine(engine.model, input_tensor, data)
@ -173,17 +167,14 @@ class PipelineSchedule(BaseSchedule):
Returns the gradients with respect to the input tensor (None if first stage). Returns the gradients with respect to the input tensor (None if first stage).
This is a helper function and can be ignored by users. This is a helper function and can be ignored by users.
:param engine: your engine object Args:
:type engine: colossalai.engine.Engine engine (colossalai.engine.Engine): Colossalai engine for training and inference.
:param input_tensor: input tensor for this pipeline stage input_tensor (:class:`torch.Tensor`): input tensor for this pipeline stage.
:type input_tensor: :class:`torch.Tensor` output_tensor (:class:`torch.Tensor`): output tensor for this pipeline stage.
:param output_tensor: output tensor for this pipeline stage output_tensor_grad (:class:`torch.Tensor`): gradient of output tensor for this pipeline stage.
:type output_tensor: :class:`torch.Tensor`
:param output_tensor_grad: gradient of output tensor for this pipeline stage
:type output_tensor_grad: :class:`torch.Tensor`
:return: gradient of input tensor Returns:
:rtype: :class:`torch.Tensor` :class:`torch.Tensor`: gradient of input tensor.
""" """
# Retain the grad on the input_tensor. # Retain the grad on the input_tensor.
@ -207,19 +198,16 @@ class PipelineSchedule(BaseSchedule):
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages. """Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
Returns a tuple with losses if the last stage, an empty tuple otherwise. Returns a tuple with losses if the last stage, an empty tuple otherwise.
:param engine: Your engine object Args:
:type engine: colossalai.engine.Engine engine (colossalai.engine.Engine): Colossalai engine for training and inference.
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader) data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
:type data_iter: Iterable forward_only (bool, optional):
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run. Whether to run the forward step only. Default is false. If true, no backward will be run.
:type forward_only: bool return_loss (bool, optional): Whether returns the loss value. Default is true.
:param return_loss: Whether returns the loss value. Default is true. return_output_label (bool, optional): If False, the output and label won't be returned.
:type return_loss: bool
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool
:return: (output, label, loss) Returns:
:rtype: Tuple[:class:`torch.Tensor`] Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
""" """
assert forward_only or return_loss, \ assert forward_only or return_loss, \
@ -354,16 +342,14 @@ class InterleavedPipelineSchedule(PipelineSchedule):
It uses interleaved 1F1B strategy. Other properties are similar as It uses interleaved 1F1B strategy. Other properties are similar as
:class:`NonPipelineSchedule`. :class:`NonPipelineSchedule`.
:param num_microbatches: The number of microbatches Args:
:type num_microbatches: int num_microbatches (int): The number of microbatches.
:param num_model_chunks: The number of model chunks num_model_chunks (int): The number of model chunks.
:type num_model_chunks: int batch_data_process_func (Callable, optional):
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch` The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
:type batch_data_process_func: Callable, optional tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
:param tensor_shape: Specified shape in pipeline communication scatter_gather_tensors (bool, optional):
:type tensor_shape: torch.Size, optional If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization
:type scatter_gather_tensors: bool, optional
""" """
assert num_microbatches % gpc.get_world_size(ParallelMode.PIPELINE) == 0, \ assert num_microbatches % gpc.get_world_size(ParallelMode.PIPELINE) == 0, \
'num_microbatches must be an integer multiple of pipeline parallel world size' 'num_microbatches must be an integer multiple of pipeline parallel world size'
@ -408,6 +394,16 @@ class InterleavedPipelineSchedule(PipelineSchedule):
"""Forward step for passed-in model. If it is the first stage, the input tensor """Forward step for passed-in model. If it is the first stage, the input tensor
is obtained from data_iterator, otherwise the passed-in input_tensor is used. is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users. Returns output tensor. This is a helper function and can be ignored by users.
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
model_chunk_id (int): The id of model chunks.
input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
return_output_label (bool, optional): Whether returns output labels.
accum_loss (optional): Where the accumulated loss is stored.
Returns:
:class:`torch.Tensor`: output or the loss value of the current pipeline stage.
""" """
data, label = self.load_micro_batch(model_chunk_id) data, label = self.load_micro_batch(model_chunk_id)
output_tensor = self._call_engine(engine.model[model_chunk_id], input_tensor, data) output_tensor = self._call_engine(engine.model[model_chunk_id], input_tensor, data)
@ -435,18 +431,17 @@ class InterleavedPipelineSchedule(PipelineSchedule):
"""Run interleaved 1F1B schedule (model split into model chunks), with """Run interleaved 1F1B schedule (model split into model chunks), with
communication between pipeline stages as needed. communication between pipeline stages as needed.
Returns dictionary with losses if the last stage, empty dict otherwise. Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether to run the forward step only. Default is false. If true, no backward will be run.
return_loss (bool, optional): Whether returns the loss value. Default is true.
return_output_label (bool, optional): If False, the output and label won't be returned.
:param engine: Your engine object Returns:
:type engine: colossalai.engine.Engine Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader) The loss would be returned only in the last stage.
:type data_iter: Iterable
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run.
:type forward_only: bool
:param return_loss: Whether returns the loss value. Default is true.
:type return_loss: bool
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool
""" """
assert forward_only or return_loss, \ assert forward_only or return_loss, \
'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.' 'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
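A hedged sketch of driving one batch through the non-interleaved pipeline schedule; the import path, ``engine`` and ``train_dataloader`` are assumed from the surrounding setup, and ``num_microbatches`` is illustrative:

>>> from colossalai.engine.schedule import PipelineSchedule
>>> schedule = PipelineSchedule(num_microbatches=4)
>>> engine.zero_grad()
>>> output, label, loss = schedule.forward_backward_step(engine, iter(train_dataloader),
...                                                      forward_only=False, return_loss=True)
>>> engine.step()   # the loss is only materialized on the last pipeline stage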


@ -37,8 +37,8 @@ def get_default_parser():
"""Reads user command line and uses an argument parser to parse the input arguments. """Reads user command line and uses an argument parser to parse the input arguments.
Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed. Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
:return: Returns the parser with the default arguments, the user may add customized arguments into this parser Returns:
:rtype: Namespace Namespace: Returns the parser with the default arguments; the user may add customized arguments into this parser.
""" """
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, help='path to the config file') parser.add_argument('--config', type=str, help='path to the config file')
@ -63,26 +63,21 @@ def launch(config: Union[str, Path, Config, Dict],
"""This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input """This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input
arguments are not given. Then initialize and set distributed environment by calling global_context's functions. arguments are not given. Then initialize and set distributed environment by calling global_context's functions.
:param config: Config file or config file path are both acceptable Args:
:type config: Union[str, dict, Config] config (Union[str, dict, Config]): Config file or config file path are both acceptable
:param rank: Rank for the default process group rank (int): Rank for the default process group
:type rank: int world_size (int): World size of the default process group
:param world_size: World size of the default process group host (str): The master address for distributed training
:type world_size: int port (str): The master port for distributed training
:param host: The master address for distributed training backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
:type host: str local_rank (int, optional):
:param port: The master port for distributed training Rank for the process on the node and is used to set the default CUDA device,
:type port: str defaults to None. If local_rank = None, the default device ordinal will be calculated automatically.
:param backend: Backend for torch.distributed seed (int, optional): Specified random seed for every process. Defaults to 1024.
:type backend: str, optional verbose (bool, optional): Whether to print logs. Defaults to True.
:param local_rank: Rank for the process on the node and is used to set the default CUDA device, defaults to None.
If local_rank = None, the default device ordinal will be calculated automatically Raises:
:type local_rank: int, optional Exception: Raise exception when config type is wrong
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
:raises Exception: Raise exception when config type is wrong
""" """
gpc.verbose = verbose gpc.verbose = verbose
@ -126,18 +121,13 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables """A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
set by SLURM set by SLURM
:param config: Config file or config file path are both acceptable Args:
:type config: Union[str, dict, Config] config (Union[str, dict, Config]): Config file or config file path are both acceptable
:param host: The master address for distributed training host (str): The master address for distributed training
:type host: str port (str): The master port for distributed training
:param port: The master port for distributed training backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
:type port: str seed (int, optional): Specified random seed for every process. Defaults to 1024.
:param backend: Backend for torch.distributed verbose (bool, optional): Whether to print logs. Defaults to True.
:type backend: str, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
""" """
rank = int(os.environ['SLURM_PROCID']) rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NPROCS']) world_size = int(os.environ['SLURM_NPROCS'])
@ -160,18 +150,13 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables """A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
set by OpenMPI set by OpenMPI
:param config: Config file or config file path are both acceptable Args:
:type config: Union[str, dict, Config] config (Union[str, dict, Config]): Config file or config file path are both acceptable
:param host: The master address for distributed training host (str): The master address for distributed training
:type host: str port (str): The master port for distributed training
:param port: The master port for distributed training backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
:type port: str seed (int, optional): Specified random seed for every process. Defaults to 1024.
:param backend: Backend for torch.distributed verbose (bool, optional): Whether to print logs. Defaults to True.
:type backend: str, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
""" """
rank = int(os.environ['OMPI_COMM_WORLD_RANK']) rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
@ -194,14 +179,11 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch from the environment variables set by PyTorch
:param config: Config file or config file path are both acceptable Args:
:type config: Union[str, dict, Config] config (Union[str, dict, Config]): Config file or config file path are both acceptable
:param backend: Backend for torch.distributed backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
:type backend: str, optional seed (int, optional): Specified random seed for every process. Defaults to 1024.
:param seed: Specified random seed for every processes verbose (bool, optional): Whether to print logs. Defaults to True.
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
""" """
rank = int(os.environ['RANK']) rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK']) local_rank = int(os.environ['LOCAL_RANK'])
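For instance, a hedged minimal launch under torchrun could look like this (the config path is a placeholder):

>>> import colossalai
>>> colossalai.launch_from_torch(config='./config.py', backend='nccl', seed=1024, verbose=True)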
@ -230,22 +212,20 @@ def initialize(model: nn.Module,
"""Core function to wrap the essential training components with our functionality based on the config which is """Core function to wrap the essential training components with our functionality based on the config which is
loaded into gpc.config. loaded into gpc.config.
:param model: Your model instance or a function to build the model Args:
:type model: :class:`torch.nn.Module` or Callbale model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
:param optimizer: Your optimizer instance optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
:type optimizer: :class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]` Your optimizer instance.
:param criterion: Your criterion instance criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
:type criterion: :class:`torch.nn.modules.loss._Loss`, optional train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
:param train_dataloader: Dataloader for training test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
:type train_dataloader: :class:`torch.utils.data.DataLoader`, optional lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional.
:param test_dataloader: Dataloader for testing verbose (bool, optional): Whether to print logs.
:type test_dataloader: :class:`torch.utils.data.DataLoader`, optional
:param lr_scheduler: Your lr scheduler instance, optional lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance.
:type lr_scheduler: :class:`torch.nn.lr_scheduler._LRScheduler`, optional Tuple (engine, train_dataloader, test_dataloader, lr_scheduler):
:param verbose: Whether to print logs A tuple of ``(engine, train_dataloader, test_dataloader, lr_scheduler)``
:type verbose: bool, optional where only ``engine`` is guaranteed not to be None.
:return: (engine, train_dataloader, test_dataloader, lr_scheduler)
:rtype: Tuple
""" """
# get logger # get logger
logger = get_dist_logger() logger = get_dist_logger()
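A hedged sketch of the typical call, assuming ``model``, ``optimizer``, ``criterion`` and the dataloaders have already been built and one of the ``colossalai.launch*`` functions has been called:

>>> engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(
...     model=model, optimizer=optimizer, criterion=criterion,
...     train_dataloader=train_dataloader, test_dataloader=test_dataloader)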


@ -10,6 +10,8 @@ def get_dist_logger(name='colossalai'):
"""Get logger instance based on name. The DistributedLogger will create singleton instances, """Get logger instance based on name. The DistributedLogger will create singleton instances,
which means that only one logger instance is created per name. which means that only one logger instance is created per name.
Args:
:param name: name of the logger, name must be unique name (str): The name of the logger, which must be unique.
:type name: str


@ -23,8 +23,13 @@ except ImportError:
class DistributedLogger: class DistributedLogger:
"""This is a distributed event logger class essentially based on :class:`logging`. """This is a distributed event logger class essentially based on :class:`logging`.
:param name: The name of the logger Args:
:type name: str name (str): The name of the logger.
Note:
The parallel_mode used in ``info``, ``warning``, ``debug`` and ``error``
should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
__instances = dict() __instances = dict()
@ -33,10 +38,10 @@ class DistributedLogger:
def get_instance(name: str): def get_instance(name: str):
"""Get the unique single logger instance based on name. """Get the unique single logger instance based on name.
:param name: The name of the logger Args:
:type name: str name (str): The name of the logger.
:return: A DistributedLogger object Returns:
:rtype: DistributedLogger DistributedLogger: A DistributedLogger object
""" """
if name in DistributedLogger.__instances: if name in DistributedLogger.__instances:
return DistributedLogger.__instances[name] return DistributedLogger.__instances[name]
@ -73,8 +78,8 @@ class DistributedLogger:
def set_level(self, level: str): def set_level(self, level: str):
"""Set the logging level """Set the logging level
:param level: Can only be INFO, DEBUG, WARNING and ERROR Args:
:type level: str level (str): Can only be INFO, DEBUG, WARNING and ERROR.
""" """
self._check_valid_logging_level(level) self._check_valid_logging_level(level)
self._logger.setLevel(getattr(logging, level)) self._logger.setLevel(getattr(logging, level))
@ -82,14 +87,11 @@ class DistributedLogger:
def log_to_file(self, path: Union[str, Path], mode: str = 'a', level: str = 'INFO', suffix: str = None): def log_to_file(self, path: Union[str, Path], mode: str = 'a', level: str = 'INFO', suffix: str = None):
"""Save the logs to file """Save the logs to file
:param path: The file to save the log Args:
:type path: A string or pathlib.Path object path (str or pathlib.Path): The file to save the log.
:param mode: The mode to write log into the file mode (str): The mode to write log into the file.
:type mode: str level (str): Can only be INFO, DEBUG, WARNING and ERROR.
:param level: Can only be INFO, DEBUG, WARNING and ERROR suffix (str): The suffix string of log's name.
:type level: str
:param suffix: The suffix string of log's name
:type suffix: str
""" """
assert isinstance(path, (str, Path)), \ assert isinstance(path, (str, Path)), \
f'expected argument path to be type str or Path, but got {type(path)}' f'expected argument path to be type str or Path, but got {type(path)}'
@ -131,12 +133,11 @@ class DistributedLogger:
def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log an info message. """Log an info message.
:param message: The message to be logged Args:
:type message: str message (str): The message to be logged.
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode` The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
:param ranks: List of parallel ranks ranks (List): List of parallel ranks.
:type ranks: list
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('info', message_prefix, parallel_mode, ranks) self._log('info', message_prefix, parallel_mode, ranks)
@ -145,12 +146,11 @@ class DistributedLogger:
def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log a warning message. """Log a warning message.
:param message: The message to be logged Args:
:type message: str message (str): The message to be logged.
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode` The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
:param ranks: List of parallel ranks ranks (List): List of parallel ranks.
:type ranks: list
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('warning', message_prefix, parallel_mode, ranks) self._log('warning', message_prefix, parallel_mode, ranks)
@ -159,12 +159,11 @@ class DistributedLogger:
def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log a debug message. """Log a debug message.
:param message: The message to be logged Args:
:type message: str message (str): The message to be logged.
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode` The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
:param ranks: List of parallel ranks ranks (List): List of parallel ranks.
:type ranks: list
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('debug', message_prefix, parallel_mode, ranks) self._log('debug', message_prefix, parallel_mode, ranks)
@ -173,12 +172,11 @@ class DistributedLogger:
def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log an error message. """Log an error message.
:param message: The message to be logged Args:
:type message: str message (str): The message to be logged.
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode` The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
:param ranks: List of parallel ranks ranks (List): List of parallel ranks.
:type ranks: list
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('error', message_prefix, parallel_mode, ranks) self._log('error', message_prefix, parallel_mode, ranks)
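A hedged sketch of typical logger usage; the log path is a placeholder:

>>> from colossalai.logging import get_dist_logger
>>> logger = get_dist_logger()
>>> logger.log_to_file('./logs', mode='a', level='INFO')
>>> logger.info('training started', ranks=[0])   # only global rank 0 emits this message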


@ -6,6 +6,7 @@ import torch.nn as nn
def zeros_(): def zeros_():
"""Return the initializer filling the input Tensor with the scalar zeros"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.zeros_(tensor) return nn.init.zeros_(tensor)
@ -13,6 +14,7 @@ def zeros_():
def ones_(): def ones_():
"""Return the initializer filling the input Tensor with the scalar ones"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.ones_(tensor) return nn.init.ones_(tensor)
@ -20,6 +22,14 @@ def ones_():
def uniform_(a: float = 0., b: float = 1.): def uniform_(a: float = 0., b: float = 1.):
r"""Return the initializer filling the input Tensor with values drawn from the uniform
distribution :math:`\mathcal{U}(a, b)`.
Args:
a (float): the lower bound of the uniform distribution. Defaults to 0.0.
b (float): the upper bound of the uniform distribution. Defaults to 1.0.
"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.uniform_(tensor, a, b) return nn.init.uniform_(tensor, a, b)
@ -27,6 +37,15 @@ def uniform_(a: float = 0., b: float = 1.):
def normal_(mean: float = 0., std: float = 1.): def normal_(mean: float = 0., std: float = 1.):
r"""Return the initializer filling the input Tensor with values drawn from the normal distribution
.. math::
\mathcal{N}(\text{mean}, \text{std}^2)
Args:
mean (float): the mean of the normal distribution. Defaults to 0.0.
std (float): the standard deviation of the normal distribution. Defaults to 1.0.
"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.normal_(tensor, mean, std) return nn.init.normal_(tensor, mean, std)
@ -34,6 +53,19 @@ def normal_(mean: float = 0., std: float = 1.):
def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float = 2.): def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float = 2.):
r"""Return the initializer filling the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \leq \text{mean} \leq b`.
Args:
mean (float): the mean of the normal distribution. Defaults to 0.0.
std (float): the standard deviation of the normal distribution. Defaults to 1.0.
a (float): the minimum cutoff value. Defaults to -2.0.
b (float): the maximum cutoff value. Defaults to 2.0.
"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.trunc_normal_(tensor, mean, std, a, b) return nn.init.trunc_normal_(tensor, mean, std, a, b)
@ -41,6 +73,26 @@ def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float =
def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'): def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification` - He, K. et al. (2015), using a
uniform distribution. The resulting tensor will have values sampled from
:math:`\mathcal{U}(-\text{bound}, \text{bound})` where
.. math::
\text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan_mode}}}
Also known as 'He initialization'.
Args:
a (int): the negative slope of the rectifier used after this layer (only used with ``'leaky_relu'``).
mode (str, optional): either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
preserves the magnitude of the variance of the weights in the
forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
backwards pass.
nonlinearity (str, optional): the non-linear function (`nn.functional` name),
recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).
"""
# adapted from torch.nn.init # adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
if 0 in tensor.shape: if 0 in tensor.shape:
@ -64,6 +116,26 @@ def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'): def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification` - He, K. et al. (2015), using a
normal distribution. The resulting tensor will have values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where
.. math::
\text{std} = \frac{\text{gain}}{\sqrt{\text{fan_mode}}}
Also known as 'He initialization'.
Args:
a (int): the negative slope of the rectifier used after this layer (only used with ``'leaky_relu'``).
mode (str, optional): either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
preserves the magnitude of the variance of the weights in the
forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
backwards pass.
nonlinearity (str, optional): the non-linear function (`nn.functional` name),
recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).
"""
# adapted from torch.nn.init # adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
if 0 in tensor.shape: if 0 in tensor.shape:
@ -86,6 +158,23 @@ def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1.): def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1.):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Understanding the difficulty of training deep feedforward
neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform
distribution. The resulting tensor will have values sampled from
:math:`\mathcal{U}(-a, a)` where
.. math::
a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}}
Also known as 'Glorot initialization'.
Args:
a (float, optional): an optional scaling factor used to calculate uniform
bounds from standard deviation. Defaults to ``math.sqrt(3.)``.
scale (float, optional): an optional scaling factor used to calculate standard deviation. Defaults to 2.0.
gain (float, optional): an optional scaling factor. Defaults to 1.0.
"""
# adapted from torch.nn.init # adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
assert fan_in is not None, 'Fan_in is not provided.' assert fan_in is not None, 'Fan_in is not provided.'
@ -102,6 +191,21 @@ def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1
def xavier_normal_(scale: float = 2., gain: float = 1.): def xavier_normal_(scale: float = 2., gain: float = 1.):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Understanding the difficulty of training deep feedforward
neural networks` - Glorot, X. & Bengio, Y. (2010), using a normal
distribution. The resulting tensor will have values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where
.. math::
\text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan_in} + \text{fan_out}}}
Also known as 'Glorot initialization'.
Args:
scale (float, optional): an optional scaling factor used to calculate standard deviation. Defaults to 2.0.
gain (float, optional): an optional scaling factor. Defaults to 1.0.
"""
# adapted from torch.nn.init # adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
assert fan_in is not None, 'Fan_in is not provided.' assert fan_in is not None, 'Fan_in is not provided.'
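A hedged sketch of the closure pattern these helpers return; the import path follows this file's location and the fan values are illustrative:

>>> import torch
>>> from colossalai.nn import init as col_init
>>> weight = torch.empty(64, 32)                 # (fan_out, fan_in) for a linear-style weight
>>> init_fn = col_init.kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu')
>>> init_fn(weight, fan_in=32, fan_out=64)       # fan sizes must be supplied by the caller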


@ -6,13 +6,11 @@ from ..utils import get_tensor_parallel_mode
class Dropout(nn.Module): class Dropout(nn.Module):
""" """Dropout layer of colossalai.
Dropout layer of colossalai
:param p: dropout rate, defaults to 0.5 Args:
:type p: float, optional p (float, optional): probability of an element to be zeroed, defaults to 0.5.
:param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False`` inplace (bool, optional): whether to do dropout in-place, defaults to False.
:type inplace: bool, optional
""" """
def __init__(self, p: float = 0.5, inplace: bool = False) -> None: def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
super().__init__() super().__init__()
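A hedged usage sketch; the layer appears to dispatch to a parallel-aware dropout based on the tensor parallel mode (per the import above), so an initialized Colossal-AI context is assumed:

>>> import torch
>>> from colossalai import nn as col_nn
>>> drop = col_nn.Dropout(p=0.1)
>>> x = torch.randn(8, 256)   # placeholder activation
>>> y = drop(x)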


@ -35,21 +35,33 @@ _parallel_patchembedding = {
class Embedding(nn.Module): class Embedding(nn.Module):
""" r"""Embedding for colossalai.
Embedding for colossalai
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` can be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
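Because the extra ``args``/``kwargs`` of the layer above are forwarded to ``torch.nn.functional.embedding``, a minimal plain-PyTorch sketch (not the Colossal-AI layer itself) of what ``padding_idx`` and ``max_norm`` do:

import torch
import torch.nn.functional as F

weight = torch.randn(10, 4)                    # 10 embeddings of dimension 4
tokens = torch.tensor([[1, 2, 0], [3, 0, 0]])  # index 0 plays the role of padding here

# max_norm renormalizes any looked-up vector with norm > 1.0 (modifying weight in place);
# padding_idx keeps the gradient of row 0 at zero during training.
out = F.embedding(tokens, weight, padding_idx=0, max_norm=1.0)
print(out.shape)  # torch.Size([2, 3, 4])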
@ -97,27 +109,24 @@ class Embedding(nn.Module):
class PatchEmbedding(nn.Module): class PatchEmbedding(nn.Module):
""" """2D Image to Patch Embedding.
2D Image to Patch Embedding
:param img_size: image size Args:
:type img_size: int img_size (int): image size.
:param patch_size: patch size patch_size (int): patch size.
:type patch_size: int in_chans (int): number of channels of input image.
:param in_chans: number of channels of input image embed_size (int): size of embedding.
:type in_chans: int dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param embed_size: size of embedding flatten (bool, optional): whether to flatten output tensor, defaults to True.
:type embed_size: int weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param flatten: whether to flatten output tensor, defaults to True The initializer of bias, defaults to xavier uniform initializer.
:type flatten: bool, optional position_embed_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of position embedding, defaults to zeros initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer For more details about ``initializer``, please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
""" """
def __init__( def __init__(
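For orientation, a hedged, non-parallel sketch of what a 2D image-to-patch embedding computes (plain PyTorch; the real ``PatchEmbedding`` additionally handles the position embedding and the initializers listed above):

import torch
import torch.nn as nn

img_size, patch_size, in_chans, embed_size = 224, 16, 3, 768
proj = nn.Conv2d(in_chans, embed_size, kernel_size=patch_size, stride=patch_size)

x = torch.randn(2, in_chans, img_size, img_size)
patches = proj(x)                             # (2, embed_size, 14, 14): one vector per 16x16 tile
tokens = patches.flatten(2).transpose(1, 2)   # flatten=True -> (2, 196, embed_size)
print(tokens.shape)                           # torch.Size([2, 196, 768])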

View File

@ -31,22 +31,35 @@ _vocab_parallel_classifier = {
class Linear(nn.Module): class Linear(nn.Module):
""" """Linear layer of colossalai.
Linear layer of colossalai
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of bias, defaults to xavier uniform initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer Note: ``kwargs`` contains different parameters depending on which parallelism you use.
:type bias_initializer: typing.Callable, optional
:param kwargs: Kwargs used for particular parallelisms The ``kwargs`` should contain parameters below:
::
Linear1D:
gather_output: bool (optional, defaults to False)
skip_bias_add: bool (optional, defaults to False)
Linear2D:
skip_bias_add: bool (optional, defaults to False)
Linear2p5D:
skip_bias_add: bool (optional, defaults to False)
Linear3D:
None
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
@ -88,21 +101,21 @@ class Linear(nn.Module):
class Classifier(nn.Module): class Classifier(nn.Module):
""" """Classifier layer of colossalai.
Classifier layer of colossalai
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of total classes for the dataset num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type bias: bool, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param dtype: The dtype of parameters, defaults to None weight_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of weight, defaults to kaiming uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer bias_initializer (:class:`typing.Callable`, optional):
:type weight_initializer: typing.Callable, optional The initializer of bias, defaults to xavier uniform initializer.
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,

View File

@ -19,18 +19,15 @@ _parallel_layernorm = {
class LayerNorm(nn.Module): class LayerNorm(nn.Module):
r""" r"""Layer Normalization for colossalai.
Layer Normalization for colossalai
:param normalized_shape: input shape from an expected input Args:
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` normalized_shape (int): input shape from an expected input of size.
If a single integer is used, it is treated as a singleton list, and this module will :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
normalize over the last dimension which is expected to be of that specific size. If a single integer is used, it is treated as a singleton list, and this module will
:type normalized_shape: int normalize over the last dimension which is expected to be of that specific size.
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05 eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05
:type eps: float, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
""" """
def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None: def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None:

View File

@ -28,11 +28,10 @@ class Experts(MoeExperts):
moe model parallel group, where E is the number of experts. Every expert moe model parallel group, where E is the number of experts. Every expert
is an instance of the class 'expert' given in the initialization parameters. is an instance of the class 'expert' given in the initialization parameters.
:param expert: The class of all experts Args:
:param num_experts: The number of experts expert_cls (:class:`torch.nn.Module`): The class of all experts.
:param expert_args: Args used to initialize experts num_experts (int): The number of experts.
expert_args: Args used to initialize experts; the args can be found in the corresponding expert class.
:type num_experts: int
""" """
def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args): def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args):

View File

@ -18,19 +18,13 @@ class Top1Router(nn.Module):
for routing usage. A more detailed description can be found in the paper about Switch Transformer for routing usage. A more detailed description can be found in the paper about Switch Transformer
from Google. from Google.
:param capacity_factor_train: Capacity factor in routing during training Args:
:param capacity_factor_eval: Capacity factor in routing during evaluation capacity_factor_train (float, optional): Capacity factor used in routing during training.
:param min_capacity: The minimum number of the capacity of each expert capacity_factor_eval (float, optional): Capacity factor used in routing during evaluation.
:param select_policy: The policy about tokens selection min_capacity (int, optional): The minimum capacity of each expert.
:param noisy_func: Noisy function used in logits select_policy (str, optional): The policy for token selection.
:param drop_tks: Whether drops tokens in evaluation noisy_func (:class:`typing.Callable`, optional): Noise function applied to the logits.
drop_tks (bool, optional): Whether to drop tokens during evaluation.
:type capacity_factor_train: float, optional
:type capacity_factor_eval: float, optional
:type min_capacity: int, optional
:type select_policy: str, optional
:type noisy_func: Callable, optional
:type drop_tks: bool, optional
""" """
def __init__(self, def __init__(self,
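As a hedged illustration of how ``capacity_factor_train``/``capacity_factor_eval`` and ``min_capacity`` above typically interact in Switch-style routing (the library's exact rounding may differ; ``expert_capacity`` is a hypothetical helper):

import math

def expert_capacity(num_tokens: int, num_experts: int,
                    capacity_factor: float, min_capacity: int) -> int:
    # each expert nominally gets num_tokens / num_experts tokens; the factor adds head-room,
    # and min_capacity puts a floor under very small batches
    capacity = math.ceil(capacity_factor * num_tokens / num_experts)
    return max(capacity, min_capacity)

print(expert_capacity(num_tokens=4096, num_experts=8, capacity_factor=1.25, min_capacity=4))  # 640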
@ -119,17 +113,12 @@ class Top2Router(nn.Module):
"""Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c] """Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
for routing usage. A more detailed description can be found in the paper about ViT-MoE. for routing usage. A more detailed description can be found in the paper about ViT-MoE.
:param capacity_factor_train: Capacity factor in routing during training Args:
:param capacity_factor_eval: Capacity factor in routing during evaluation capacity_factor_train (float, optional): Capacity factor used in routing during training.
:param min_capacity: The minimum number of the capacity of each expert capacity_factor_eval (float, optional): Capacity factor used in routing during evaluation.
:param noisy_func: Noisy function used in logits min_capacity (int, optional): The minimum capacity of each expert.
:param drop_tks: Whether drops tokens in evaluation noisy_func (:class:`typing.Callable`, optional): Noise function applied to the logits.
drop_tks (bool, optional): Whether to drop tokens during evaluation.
:type capacity_factor_train: float, optional
:type capacity_factor_eval: float, optional
:type min_capacity: int, optional
:type noisy_func: Callable, optional
:type drop_tks: bool, optional
""" """
def __init__(self, def __init__(self,
@ -239,15 +228,11 @@ class MoeLayer(nn.Module):
the moe tensor group by all-to-all communication. Then it will get the output of all the moe tensor group by all-to-all communication. Then it will get the output of all
experts and exchange the output. Finally, it returns the output of the moe system. experts and exchange the output. Finally, it returns the output of the moe system.
:param dim_model: Dimension of model Args:
:param num_experts: The number of experts dim_model (int): Dimension of model.
:param router: Instance of router used in routing num_experts (int): The number of experts.
:param experts: Instance of experts generated by Expert router (:class:`torch.nn.Module`): Instance of router used in routing.
experts (:class:`torch.nn.Module`): Instance of experts generated by Expert.
:type dim_model: int
:type num_experts: int
:type router: nn.Module
:type experts: nn.Module
""" """
def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts): def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts):

View File

@ -16,8 +16,8 @@ class NormalNoiseGenerator:
All noise is generated from a normal distribution (0, 1 / E^2), where All noise is generated from a normal distribution (0, 1 / E^2), where
E = the number of experts. E = the number of experts.
:param num_experts: The number of experts Args:
:type num_experts: int num_experts (int): The number of experts.
""" """
def __init__(self, num_experts: int): def __init__(self, num_experts: int):
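A hedged sketch of the noise the docstring above describes, i.e. samples from N(0, 1/E^2) added to the routing logits (``add_normal_noise`` is a hypothetical stand-in, not the generator class itself):

import torch

def add_normal_noise(logits: torch.Tensor, num_experts: int) -> torch.Tensor:
    # N(0, 1/E^2) noise, i.e. standard deviation 1 / num_experts
    return logits + torch.randn_like(logits) / num_experts

logits = torch.zeros(4, 8)   # 4 tokens routed over 8 experts
noisy = add_normal_noise(logits, num_experts=8)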
@ -37,8 +37,8 @@ class UniformNoiseGenerator:
Makes models more resilient to rounding errors introduced by bfloat16. Makes models more resilient to rounding errors introduced by bfloat16.
This seems particularly important for logits. This seems particularly important for logits.
:param eps: Epsilon in generator Args:
:type eps: float eps (float, optional): Epsilon in generator, defaults 1e-2.
""" """
def __init__(self, eps: float = 1e-2): def __init__(self, eps: float = 1e-2):

View File

@ -7,17 +7,17 @@ except:
class FusedLayerNormAffineFunction1D(torch.autograd.Function): class FusedLayerNormAffineFunction1D(torch.autograd.Function):
r""" r"""Layernorm
Layernorm
:param input: input maxtrix Args:
:param weight: weight matrix input: input matrix.
:param bias: bias matrix weight: weight matrix.
:param normalized_shape: input shape from an expected input bias: bias matrix.
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` normalized_shape: input shape from an expected input of size.
If a single integer is used, it is treated as a singleton list, and this module will :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
normalize over the last dimension which is expected to be of that specific size. If a single integer is used, it is treated as a singleton list, and this module will
:param eps: a value added to the denominator for numerical stability normalize over the last dimension which is expected to be of that specific size.
eps: a value added to the denominator for numerical stability
""" """
@staticmethod @staticmethod

View File

@ -78,8 +78,9 @@ class _ReduceGrad(torch.autograd.Function):
""" """
Pass the input to the model parallel region. Pass the input to the model parallel region.
:param input_: input matrix Args:
:param parallel_mode: parallel mode input_: input matrix.
parallel_mode: parallel mode.
""" """
@staticmethod @staticmethod
@ -100,8 +101,9 @@ class _ReduceInput(torch.autograd.Function):
""" """
All-reduce the input from the model parallel region. All-reduce the input from the model parallel region.
:param input_: input matrix Args:
:param parallel_mode: parallel mode input_: input matrix.
parallel_mode: parallel mode.
""" """
@staticmethod @staticmethod
@ -121,9 +123,10 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
""" """
Split the input and keep only the chunk corresponding to the rank. Split the input and keep only the chunk corresponding to the rank.
:param input_: input matrix Args:
:param parallel_mode: parallel mode input_: input matrix.
:param dim: dimension parallel_mode: parallel mode.
dim: dimension
""" """
@staticmethod @staticmethod
@ -142,12 +145,12 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
class _GatherForwardSplitBackward(torch.autograd.Function): class _GatherForwardSplitBackward(torch.autograd.Function):
""" """Gather the input from model parallel region and concatenate.
Gather the input from model parallel region and concatinate.
:param input_: input matrix Args:
:param parallel_mode: parallel mode input_: input matrix.
:param dim: dimension parallel_mode: parallel mode.
dim: dimension
""" """
@staticmethod @staticmethod
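A hedged, single-process sketch of the split/gather pattern these autograd functions apply across a tensor-parallel group (real ranks each hold one shard and communicate; here the "ranks" are just slices of one tensor):

import torch

world_size, dim = 4, -1
x = torch.arange(8.0).reshape(2, 4)

shards = torch.chunk(x, world_size, dim=dim)   # "split forward": each rank keeps one chunk
restored = torch.cat(shards, dim=dim)          # "gather forward": concatenate along the same dim
assert torch.equal(x, restored)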

View File

@ -24,24 +24,23 @@ from ._utils import (gather_forward_split_backward, get_parallel_input, reduce_g
@LAYERS.register_module @LAYERS.register_module
class Linear1D(torch.nn.Module): class Linear1D(torch.nn.Module):
""" r"""Linear layer for 1D parallelism.
Linear layer for 1D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional gather_output (bool, optional): Whether to call all-gather on output, defaults to False.
:param dtype: The dtype of parameters, defaults to None skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
:type dtype: torch.dtype, optional which is preserved for kernel fusion, defaults to False
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, weight_initializer (:class:`typing.Callable`, optional):
which is preserved for kernel fusion, defaults to False The initializer of weight, defaults to kaiming uniform initializer.
:type skip_bias_add: bool, optional bias_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of bias, defaults to xavier uniform initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer For more details about ``initializer``, please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
@ -88,23 +87,21 @@ class Linear1D(torch.nn.Module):
@LAYERS.register_module @LAYERS.register_module
class Classifier1D(ParallelLayer): class Classifier1D(ParallelLayer):
"""RowLinear with given weight r"""RowLinear with given weight. Classifier of 1D parallelism.
Classifier of 1D parallelism
:param in_features: size of input features Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes in the dataset num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -171,23 +168,21 @@ class Classifier1D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelClassifier1D(ParallelLayer): class VocabParallelClassifier1D(ParallelLayer):
"""ColLinear with given weight r"""ColLinear with given weight. Classifier of 1D parallelism.
Classifier of 1D parallelism
:param in_features: size of input features Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes in the dataset num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional For more details about ``initializer``, please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -249,30 +244,28 @@ class VocabParallelClassifier1D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Linear1D_Col(ParallelLayer): class Linear1D_Col(ParallelLayer):
"""Linear layer with column parallelism. r"""Linear layer with column parallelism.
The linear layer is defined as :math:`Y = XA + b`. A is parallelized along The linear layer is defined as :math:`Y = XA + b`. A is parallelized along
its second dimension as :math:`A = [A_1, ..., A_p]`. its second dimension as :math:`A = [A_1, ..., A_p]`.
:param in_features: first dimension of matrix A. Args:
:type in_features: int in_features (int): size of each input sample.
:param output_size: second dimension of matrix A. out_features (int): size of each output sample.
:type output_size: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional gather_output (bool, optional): If true, call all-gather on output and make Y available
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param gather_output: If true, call all-gether on output and make Y avaiable
to all GPUs, otherwise, every GPU will have its output to all GPUs, otherwise, every GPU will have its output
which is :math:`Y_i = XA_i`, defaults to False which is :math:`Y_i = XA_i`, defaults to False
:type gather_output: bool, optional skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False.
which is preserved for kernel fusion, defaults to False weight_initializer (:class:`typing.Callable`, optional):
:type skip_bias_add: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer bias_initializer (:class:`typing.Callable`, optional):
:type weight_initializer: typing.Callable, optional The initializer of bias, defaults to xavier uniform initializer.
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
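A hedged, single-process sketch of the column parallelism described above (no distributed setup; the chunks of A stand in for the per-rank shards A_1, ..., A_p):

import torch

X = torch.randn(2, 8)      # (batch, in_features)
A = torch.randn(8, 16)     # (in_features, out_features), split along its second dimension
world_size = 4

partials = [X @ A_i for A_i in torch.chunk(A, world_size, dim=1)]   # each rank computes Y_i = X @ A_i
Y = torch.cat(partials, dim=-1)                                      # what gather_output=True would do
assert torch.allclose(Y, X @ A, atol=1e-5)

With gather_output=False, each rank simply keeps its own partial output Y_i.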
@ -343,25 +336,23 @@ class Linear1D_Col(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Linear1D_Row(ParallelLayer): class Linear1D_Row(ParallelLayer):
""" Linear layer with row parallelism r""" Linear layer with row parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
:param dtype: The dtype of parameters, defaults to None skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
:type dtype: torch.dtype, optional which is preserved for kernel fusion, defaults to False.
:param parallel_input: If set to ``True``, it's assumed that the input is splitted, defaults to False weight_initializer (:class:`typing.Callable`, optional):
:type parallel_input: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, bias_initializer (:class:`typing.Callable`, optional):
which is preserved for kernel fusion, defaults to False The initializer of bias, defaults to xavier uniform initializer.
:type skip_bias_add: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer For more details about ``initializer``, please refer to
:type weight_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
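The row-parallel case is the mirror image: a hedged, single-process sketch in which A is split along its rows, the input along its last dimension, and the partial products are summed (an all-reduce in the real distributed layer):

import torch

X = torch.randn(2, 8)
A = torch.randn(8, 16)
world_size = 4

X_shards = torch.chunk(X, world_size, dim=-1)   # parallel_input: the input arrives already split
A_shards = torch.chunk(A, world_size, dim=0)
Y = sum(x_i @ a_i for x_i, a_i in zip(X_shards, A_shards))   # all-reduce(sum) of the partial products
assert torch.allclose(Y, X @ A, atol=1e-5)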
@ -432,21 +423,33 @@ class Linear1D_Row(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Embedding1D(ParallelLayer): class Embedding1D(ParallelLayer):
""" r"""Embedding for 1D parallelism.
Embedding for 1D parallelism
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` can be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
:param args: Args used in F.embedding For more details about ``initializer``, please refer to
:param kwargs: Kwargs used in F.embedding `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
@ -499,20 +502,33 @@ class Embedding1D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelEmbedding1D(torch.nn.Module): class VocabParallelEmbedding1D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension. r"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` can be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
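A hedged, single-process sketch of the vocabulary-parallel idea behind the class above: each rank owns a contiguous slice of the vocabulary, masks ids outside its slice, and the partial lookups are summed (an all-reduce in the real layer):

import torch
import torch.nn.functional as F

vocab_size, dim, world_size = 16, 4, 2
weight = torch.randn(vocab_size, dim)
tokens = torch.tensor([[1, 9, 3]])

out = torch.zeros(1, 3, dim)
per_rank = vocab_size // world_size
for rank in range(world_size):
    start, end = rank * per_rank, (rank + 1) * per_rank
    mask = (tokens < start) | (tokens >= end)          # ids this rank does not own
    local_ids = (tokens - start).masked_fill(mask, 0)
    partial = F.embedding(local_ids, weight[start:end])
    partial = partial.masked_fill(mask.unsqueeze(-1), 0.0)
    out = out + partial                                # all-reduce(sum) across ranks

assert torch.allclose(out, F.embedding(tokens, weight))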
@ -578,13 +594,11 @@ class VocabParallelEmbedding1D(torch.nn.Module):
@LAYERS.register_module @LAYERS.register_module
class Dropout1D(ParallelLayer): class Dropout1D(ParallelLayer):
""" """Dropout layer of 1D parallelism.
Dropout layer of 1D parallelism
:param p: dropout rate, defaults to 0.5 Args:
:type p: float, optional p (float, optional): probability of an element to be zeroed, defaults 0.5.
:param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False`` inplace (bool, optional): whether to do dropout in-place, default to be False.
:type inplace: bool, optional
""" """
def __init__(self, p: float = 0.5, inplace: bool = False): def __init__(self, p: float = 0.5, inplace: bool = False):

View File

@ -21,27 +21,26 @@ def matmul_2d(
row_parallel_mode=ParallelMode.PARALLEL_2D_ROW, row_parallel_mode=ParallelMode.PARALLEL_2D_ROW,
col_parallel_mode=ParallelMode.PARALLEL_2D_COL, col_parallel_mode=ParallelMode.PARALLEL_2D_COL,
): ):
""" r"""Matrix multiplication for 2D parallelism.
Matrix multiplication for 2D parallelism
:param a: matrix :math:`A` Args:
:type a: torch.tensor a (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` b (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor summa_dim (int): dimension of SUMMA for 2D parallelism.
:param summa_dim: dimension of SUMMA fo 2D parallelism out_shape (:class:`torch.Size`): shape of output tensor.
:type summa_dim: int row_rank (int, optional): the rank of row, defaults to None.
:param out_shape: shape of output tensor col_rank (int, optional): the rank of column, defaults to None.
:type out_shape: tuple row_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
:param row_rank: the rank of row, defaults to None row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW.
:type row_rank: int, optional col_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
:param col_rank: the rank of column, defaults to None column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL.
:type col_rank: int, optional
:param row_parallel_mode: row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW Returns:
:type row_parallel_mode: str, optional :class:`torch.tensor`: :math:`C = AB`.
:param col_parallel_mode: column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL
:type col_parallel_mode: str, optional Note:
:return: :math:`C = AB` The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:rtype: torch.tensor in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
if row_rank is None: if row_rank is None:
row_rank = gpc.get_local_rank(col_parallel_mode) row_rank = gpc.get_local_rank(col_parallel_mode)
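The docstring above describes the SUMMA-style 2D layout; a hedged, single-process sketch of the block decomposition it relies on (the q x q blocks stand in for what each rank of the 2D grid holds):

import torch

q = 2                                       # summa_dim
A, B = torch.randn(4, 6), torch.randn(6, 8)

A_blk = [list(row.chunk(q, dim=1)) for row in A.chunk(q, dim=0)]
B_blk = [list(row.chunk(q, dim=1)) for row in B.chunk(q, dim=0)]

# C[i][j] = sum_k A[i][k] @ B[k][j], accumulated over q broadcast steps in the real kernel
C_blk = [[sum(A_blk[i][k] @ B_blk[k][j] for k in range(q)) for j in range(q)] for i in range(q)]
C = torch.cat([torch.cat(row, dim=1) for row in C_blk], dim=0)
assert torch.allclose(C, A @ B, atol=1e-5)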
@ -135,35 +134,26 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
row_rank: int, col_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, row_rank: int, col_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode,
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int, data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
tensor_parallel_size: int) -> Tensor: tensor_parallel_size: int) -> Tensor:
""" r"""2D parallel classifier.
2D parallel classifier
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor bias (:class:`torch.tensor`, optional): matrix of bias.
:param bias: matrix of bias summa_dim (int): dimension of SUMMA fo 2D parallelism.
:type bias: torch.tensor, optional out_shape (:class:`torch.size`): shape of output tensor.
:param summa_dim: dimension of SUMMA fo 2D parallelism row_rank (int, optional): the rank of row, defaults to None.
:type summa_dim: int col_rank (int, optional): the rank of column, defaults to None.
:param out_shape: shape of output tensor row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type out_shape: tuple col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param row_rank: the rank of row data_parallel_rank (int): data parallel rank.
:type row_rank: int pipeline_parallel_rank (int): pipeline parallel rank
:param col_rank: the rank of column pipeline_parallel_size (int): pipeline parallel size.
:type col_rank: int tensor_parallel_size (int): tensor parallel size.
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
return _Classifier2D.apply(A, B, bias, summa_dim, out_shape, row_rank, col_rank, row_parallel_mode, return _Classifier2D.apply(A, B, bias, summa_dim, out_shape, row_rank, col_rank, row_parallel_mode,
col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size, col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
@ -171,33 +161,25 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
class Matmul_AB_2D(torch.autograd.Function): class Matmul_AB_2D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = AB`.
Matrix multiplication for :math:`C = AB`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor summa_dim (int): dimension of SUMMA for 2D parallelism.
:param summa_dim: dimension of SUMMA fo 2D parallelism out_shape (:class:`torch.Size`): shape of output tensor.
:type summa_dim: int row_rank (int, optional): the rank of row, defaults to None.
:param out_shape: shape of output tensor col_rank (int, optional): the rank of column, defaults to None.
:type out_shape: tuple row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param row_rank: the rank of row col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type row_rank: int data_parallel_rank (int): data parallel rank.
:param col_rank: the rank of column pipeline_parallel_rank (int): pipeline parallel rank
:type col_rank: int pipeline_parallel_size (int): pipeline parallel size.
:param row_parallel_mode: row parallel mode tensor_parallel_size (int): tensor parallel size.
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode Note:
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:param data_parallel_rank: data parallel rank in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
@custom_fwd(cast_inputs=torch.float16) @custom_fwd(cast_inputs=torch.float16)
@ -305,33 +287,26 @@ class Matmul_AB_2D(torch.autograd.Function):
class Matmul_ABT_2D(torch.autograd.Function): class Matmul_ABT_2D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = AB^T`
Matrix multiplication for :math:`C = AB^T`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor summa_dim (int): dimension of SUMMA for 2D parallelism.
:param summa_dim: dimension of SUMMA fo 2D parallelism out_shape (:class:`torch.Size`): shape of output tensor.
:type summa_dim: int row_rank (int, optional): the rank of row, defaults to None.
:param out_shape: shape of output tensor col_rank (int, optional): the rank of column, defaults to None.
:type out_shape: tuple row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param row_rank: the rank of row col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type row_rank: int
:param col_rank: the rank of column data_parallel_rank (int): data parallel rank.
:type col_rank: int pipeline_parallel_rank (int): pipeline parallel rank
:param row_parallel_mode: row parallel mode pipeline_parallel_size (int): pipeline parallel size.
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode tensor_parallel_size (int): tensor parallel size.
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param data_parallel_rank: data parallel rank The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:type data_parallel_rank: int in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
@custom_fwd(cast_inputs=torch.float16) @custom_fwd(cast_inputs=torch.float16)
@ -445,33 +420,25 @@ class Matmul_ABT_2D(torch.autograd.Function):
class Matmul_ATB_2D(torch.autograd.Function): class Matmul_ATB_2D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = A^TB`.
Matrix multiplication for :math:`C = A^TB`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor summa_dim (int): dimension of SUMMA for 2D parallelism.
:param summa_dim: dimension of SUMMA fo 2D parallelism out_shape (:class:`torch.Size`): shape of output tensor.
:type summa_dim: int row_rank (int, optional): the rank of row, defaults to None.
:param out_shape: shape of output tensor col_rank (int, optional): the rank of column, defaults to None.
:type out_shape: tuple row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param row_rank: the rank of row col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type row_rank: int data_parallel_rank (int): data parallel rank.
:param col_rank: the rank of column pipeline_parallel_rank (int): pipeline parallel rank
:type col_rank: int pipeline_parallel_size (int): pipeline parallel size.
:param row_parallel_mode: row parallel mode tensor_parallel_size (int): tensor parallel size.
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode Note:
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:param data_parallel_rank: data parallel rank in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
@custom_fwd(cast_inputs=torch.float16) @custom_fwd(cast_inputs=torch.float16)
@ -639,33 +606,26 @@ def add_bias_2d(input_: Tensor, bias: Tensor, output_size_per_partition: int, ro
row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, skip_bias_add: bool, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, skip_bias_add: bool,
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int, data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
tensor_parallel_size: int) -> Tensor: tensor_parallel_size: int) -> Tensor:
""" r"""Matrix add bias: :math:`C = A + b`.
Matrix add bias: :math:`C = A + b`
:param input_: matrix :math:`A` Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): matrix :math:`A`.
:param bias: matrix :math:`b` bias (:class:`torch.tensor`): matrix of bias :math:`b`.
:type bias: torch.tensor output_size_per_partition (int): size of output per partition.
:param output_size_per_partition: size of ouput per partition row_rank (int, optional): the rank of row, defaults to None.
:type output_size_per_partition: int col_rank (int, optional): the rank of column, defaults to None.
:param row_rank: the rank of row row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type row_rank: int col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param col_rank: the rank of column skip_bias_add (bool):
:type col_rank: int If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion.
:param row_parallel_mode: row parallel mode data_parallel_rank (int): data parallel rank.
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode pipeline_parallel_rank (int): pipeline parallel rank
:param col_parallel_mode: column parallel mode pipeline_parallel_size (int): pipeline parallel size.
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode tensor_parallel_size (int): tensor parallel size.
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion
:type skip_bias_add: bool Note:
:param data_parallel_rank: data parallel rank The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:type data_parallel_rank: int in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
return _Add_Bias_2D.apply(input_, bias, output_size_per_partition, row_rank, col_rank, row_parallel_mode, return _Add_Bias_2D.apply(input_, bias, output_size_per_partition, row_rank, col_rank, row_parallel_mode,
col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank, col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
@ -711,21 +671,19 @@ class _Layernorm_2D(torch.autograd.Function):
def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode, def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode,
col_parallel_mode: ParallelMode) -> Tensor: col_parallel_mode: ParallelMode) -> Tensor:
""" r"""Layernorm.
Layernorm
:param input_: input maxtrix Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param E_x: mean E_x (:class:`torch.tensor`): mean.
:type E_x: torch.tensor Var_x (:class:`torch.tensor`): variance.
:param Var_x: variance hidden_size (int): hidden size.
:type Var_x: torch.tensor row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param hidden_size: hidden size col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type hidden_size: int
:param row_parallel_mode: row parallel mode Note:
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
:param col_parallel_mode: column parallel mode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
""" """
return _Layernorm_2D.apply(input_, E_x, Var_x, hidden_size, row_parallel_mode, col_parallel_mode) return _Layernorm_2D.apply(input_, E_x, Var_x, hidden_size, row_parallel_mode, col_parallel_mode)
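For orientation, a hedged sketch of the plain normalization that the mean/variance arguments above refer to (the 2D kernel distributes this computation across the row/column groups and may fold eps into Var_x differently):

import torch
import torch.nn.functional as F

x = torch.randn(2, 8)
E_x = x.mean(dim=-1, keepdim=True)
Var_x = x.var(dim=-1, unbiased=False, keepdim=True)
y = (x - E_x) / torch.sqrt(Var_x + 1e-5)

assert torch.allclose(y, F.layer_norm(x, (8,), eps=1e-5), atol=1e-6)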
@ -748,27 +706,29 @@ class _AllGatherTensor2D(torch.autograd.Function):
def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
""" r"""All gather the tensor of 2D parallelism.
All gather the tensor of 2D parallelism
:param inputs: input maxtrix Args:
:type inputs: torch.tensor tensor (:class:`torch.tensor`): Input tensor.
:param dim: dimension to gather dim (int): Dimension to gather.
:type dim: int parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode in which the tensor is used.
:param parallel_mode: parallel mode
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _AllGatherTensor2D.apply(tensor, dim, parallel_mode) return _AllGatherTensor2D.apply(tensor, dim, parallel_mode)
def split_tensor_2d(input_: Tensor, dim: int = 0) -> Tensor: def split_tensor_2d(input_: Tensor, dim: int = 0) -> Tensor:
"""Splits 2D tensor in specified dimension across cols """Splits 2D tensor in specified dimension across cols.
:param input_: Input tensor
:param dim: Specified dimension in which to split Args:
:type input_: torch.Tensor input_ (:class:`torch.tensor`): Input tensor.
:type dim: int, optional dim (int): Specified dimension in which to split.
:return output: Splitted tensor
:rtype output: torch.Tensor Returns:
:class:`torch.tensor`: The split tensor.
""" """
if input_.size(dim) <= 1: if input_.size(dim) <= 1:
return input_ return input_
@ -787,11 +747,15 @@ class _ReduceTensor2D(torch.autograd.Function):
def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor: def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
""" r"""All-reduce the input.
All-reduce the input.
:param input_: input tensor Args:
:param parallel_mode: parallel mode input_ (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode in which the tensor is used.
Note:
The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _ReduceTensor2D.apply(input_, parallel_mode) return _ReduceTensor2D.apply(input_, parallel_mode)
@ -809,12 +773,16 @@ class _ReduceScatterTensor2D(torch.autograd.Function):
def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
""" r"""Reduce-scatter the input.
Reduce-scatter the input.
:param tensor: Input tensor Args:
:param dim: Dimension to scatter tensor (:class:`torch.tensor`): Input tensor.
:param parallel_mode: Parallel mode dim (int): Dimension to reduce.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode in which the tensor is used.
Note:
The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _ReduceScatterTensor2D.apply(tensor, dim, parallel_mode) return _ReduceScatterTensor2D.apply(tensor, dim, parallel_mode)
@ -849,11 +817,11 @@ class _ReduceByBatch2D(torch.autograd.Function):
def reduce_by_batch_2d(input_, reduce_mean: bool = False) -> Tensor: def reduce_by_batch_2d(input_, reduce_mean: bool = False) -> Tensor:
"""All-reduce the input from the model parallel region. r"""All-reduce the input from the model parallel region.
:param input_: input maxtrix Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param reduce_mean: If set to ``True``, it will divide the output by column parallel size, default to False reduce_mean (bool, optional):
:type reduce_mean: bool, optional If set to ``True``, it will divide the output by the column parallel size, defaults to False.
""" """
return _ReduceByBatch2D.apply(input_, reduce_mean) return _ReduceByBatch2D.apply(input_, reduce_mean)

View File

@ -22,23 +22,22 @@ from ._utils import assert_summa_initialization, get_summa_dim_from_env
@LAYERS.register_module @LAYERS.register_module
class Linear2D(ParallelLayer): class Linear2D(ParallelLayer):
""" r"""Linear layer for 2D parallelism
Linear layer for 2D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
:param dtype: The dtype of parameters, defaults to None which is preserved for kernel fusion, defaults to False.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False The initializer of weight, defaults to kaiming uniform initializer.
:type skip_bias_add: bool, optional bias_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of bias, defaults to xavier uniform initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer For more details about ``initializer``, please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,
@ -119,18 +118,16 @@ class Linear2D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class LayerNorm2D(ParallelLayer): class LayerNorm2D(ParallelLayer):
r""" r"""Layer Normalization for 2D parallelism.
Layer Normalization for 2D parallelism
:param normalized_shape: input shape from an expected input Args:
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` normalized_shape (int): input shape from an expected input of size.
If a single integer is used, it is treated as a singleton list, and this module will :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
normalize over the last dimension which is expected to be of that specific size. \times \ldots \times \text{normalized_shape}[-1]]`
:type normalized_shape: int If a single integer is used, it is treated as a singleton list, and this module will
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05 normalize over the last dimension which is expected to be of that specific size.
:type eps: float, optional eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional
""" """
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None): def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
super().__init__() super().__init__()
@ -189,27 +186,24 @@ class LayerNorm2D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class PatchEmbedding2D(ParallelLayer): class PatchEmbedding2D(ParallelLayer):
""" r"""2D Image to Patch Embedding.
2D Image to Patch Embedding
:param img_size: image size Args:
:type img_size: int img_size (int): image size.
:param patch_size: patch size patch_size (int): patch size.
:type patch_size: int in_chans (int): number of channels of input image.
:param in_chans: number of channels of input image embed_size (int): size of embedding.
:type in_chans: int dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param embed_size: size of embedding flatten (bool, optional): whether to flatten output tensor, defaults to True.
:type embed_size: int weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param flatten: whether to flatten output tensor, defaults to True The initializer of bias, defaults to xavier uniform initializer.
:type flatten: bool, optional position_embed_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of position embedding, defaults to zeros initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer More details about ``initializer`` please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
img_size: int, img_size: int,
@ -291,21 +285,33 @@ class PatchEmbedding2D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Embedding2D(ParallelLayer): class Embedding2D(ParallelLayer):
""" r"""Embedding for 2D parallelism.
Embedding for 2D parallelism
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
""" """
def __init__(self, def __init__(self,
num_embeddings: int, num_embeddings: int,
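The extra ``args``/``kwargs`` are forwarded to ``F.embedding``, so options such as ``max_norm`` or ``scale_grad_by_freq`` can be passed straight through; a hypothetical construction (import path assumed) could look like::

    from colossalai.nn.layer.parallel_2d import Embedding2D  # assumed export path

    # padding_idx freezes the pad vector; max_norm and scale_grad_by_freq are
    # forwarded unchanged to torch.nn.functional.embedding.
    embed = Embedding2D(num_embeddings=30522,
                        embedding_dim=768,
                        padding_idx=0,
                        max_norm=1.0,
                        scale_grad_by_freq=True)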
@ -358,20 +364,33 @@ class Embedding2D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelEmbedding2D(torch.nn.Module): class VocabParallelEmbedding2D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension. r"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
num_embeddings: int, num_embeddings: int,
@ -435,23 +454,21 @@ class VocabParallelEmbedding2D(torch.nn.Module):
@LAYERS.register_module @LAYERS.register_module
class Classifier2D(ParallelLayer): class Classifier2D(ParallelLayer):
""" r"""Classifier for 2D parallelism.
Classifier for 2D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,
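As a hypothetical usage sketch (import path and sizes are assumptions), the classifier head takes partitioned features and produces class logits; ``weight=None`` lets it allocate its own parameter, while an existing ``torch.nn.Parameter`` (e.g. a shared embedding weight) could be handed in instead::

    import torch
    from colossalai.nn.layer.parallel_2d import Classifier2D  # assumed export path

    head = Classifier2D(in_features=768, num_classes=1000, weight=None, bias=True)
    features = torch.randn(8, 768).cuda()   # this rank's partition of the features
    logits = head(features)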
@ -515,23 +532,21 @@ class Classifier2D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelClassifier2D(ParallelLayer): class VocabParallelClassifier2D(ParallelLayer):
""" r"""Vocab parallel classifier layer for 2D parallelism.
Vocab parallel classifier layer for 2D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,

View File

@ -100,35 +100,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
...], row_rank: int, col_rank: int, ...], row_rank: int, col_rank: int,
row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, data_parallel_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, data_parallel_rank: int,
pipeline_parallel_rank: int, pipeline_parallel_size: int, tensor_parallel_size: int) -> Tensor: pipeline_parallel_rank: int, pipeline_parallel_size: int, tensor_parallel_size: int) -> Tensor:
""" r"""Classifier.
Classifier
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor bias (:class:`torch.tensor`): matrix of bias.
:param bias: matrix of bias tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
:type bias: torch.tensor, optional out_shape (:class:`torch.size`): shape of output tensor.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism row_rank (int): the rank of row.
:type tesseract_dim: int col_rank (int): the rank of column.
:param out_shape: shape of output tensor row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type out_shape: tuple col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param row_rank: the rank of row data_parallel_rank (int): data parallel rank.
:type row_rank: int pipeline_parallel_rank (int): pipeline parallel rank.
:param col_rank: the rank of column pipeline_parallel_size (int): pipeline parallel size.
:type col_rank: int tensor_parallel_size (int): tensor parallel size.
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
return _Classifier2p5D.apply(A, B, bias, tesseract_dim, out_shape, row_rank, col_rank, row_parallel_mode, return _Classifier2p5D.apply(A, B, bias, tesseract_dim, out_shape, row_rank, col_rank, row_parallel_mode,
col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size, col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
@ -136,35 +127,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
class Matmul_AB_2p5D(torch.autograd.Function): class Matmul_AB_2p5D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = AB`.
Matrix multiplication for :math:`C = AB`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism out_shape (:class:`torch.size`): shape of output tensor.
:type tesseract_dim: int row_rank (int): the rank of row.
:param out_shape: shape of output tensor col_rank (int): the rank of column.
:type out_shape: tuple dep_rank (int): the rank of depth.
:param row_rank: the rank of row row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type row_rank: int col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param col_rank: the rank of column data_parallel_rank (int): data parallel rank.
:type col_rank: int pipeline_parallel_rank (int): pipeline parallel rank.
:param dep_rank: the rank of depth pipeline_parallel_size (int): pipeline parallel size.
:type dep_rank: int tensor_parallel_size (int): tensor parallel size.
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
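For intuition only, the single-process snippet below reproduces the block decomposition that ``Matmul_AB_2p5D`` distributes over the tesseract: every output block :math:`C_{ij}` accumulates :math:`\sum_k A_{ik} B_{kj}`. The distributed version performs the same accumulation with broadcasts inside the row and column process groups; nothing here is part of the commit itself::

    import torch

    q = 2                                            # tesseract dimension (blocks per side)
    A, B = torch.randn(8, 8), torch.randn(8, 8)
    A_blk = [list(r.chunk(q, dim=1)) for r in A.chunk(q, dim=0)]   # A_ik blocks
    B_blk = [list(r.chunk(q, dim=1)) for r in B.chunk(q, dim=0)]   # B_kj blocks

    # C_ij = sum_k A_ik @ B_kj -- the per-rank accumulation of the parallel kernel.
    C_blk = [[sum(A_blk[i][k] @ B_blk[k][j] for k in range(q)) for j in range(q)]
             for i in range(q)]
    C = torch.cat([torch.cat(row, dim=1) for row in C_blk], dim=0)
    assert torch.allclose(C, A @ B, atol=1e-5)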
@ -270,35 +252,26 @@ class Matmul_AB_2p5D(torch.autograd.Function):
class Matmul_ABT_2p5D(torch.autograd.Function): class Matmul_ABT_2p5D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = AB^T`.
Matrix multiplication for :math:`C = AB^T`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism out_shape (:class:`torch.size`): shape of output tensor.
:type tesseract_dim: int row_rank (int): the rank of row.
:param out_shape: shape of output tensor col_rank (int): the rank of column.
:type out_shape: tuple dep_rank (int): the rank of depth.
:param row_rank: the rank of row row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type row_rank: int col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param col_rank: the rank of column data_parallel_rank (int): data parallel rank.
:type col_rank: int pipeline_parallel_rank (int): pipeline parallel rank.
:param dep_rank: the rank of depth pipeline_parallel_size (int): pipeline parallel size.
:type dep_rank: int tensor_parallel_size (int): tensor parallel size.
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
@ -409,35 +382,26 @@ class Matmul_ABT_2p5D(torch.autograd.Function):
class Matmul_ATB_2p5D(torch.autograd.Function): class Matmul_ATB_2p5D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = A^TB`
Matrix multiplication for :math:`C = A^TB`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism out_shape (:class:`torch.size`): shape of output tensor.
:type tesseract_dim: int row_rank (int): the rank of row.
:param out_shape: shape of output tensor col_rank (int): the rank of column.
:type out_shape: tuple dep_rank (int): the rank of depth.
:param row_rank: the rank of row row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type row_rank: int col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param col_rank: the rank of column data_parallel_rank (int): data parallel rank.
:type col_rank: int pipeline_parallel_rank (int): pipeline parallel rank.
:param dep_rank: the rank of depth pipeline_parallel_size (int): pipeline parallel size.
:type dep_rank: int tensor_parallel_size (int): tensor parallel size.
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
@ -629,36 +593,27 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
col_rank: int, dep_rank: int, col_parallel_mode: ParallelMode, skip_bias_add: bool, col_rank: int, dep_rank: int, col_parallel_mode: ParallelMode, skip_bias_add: bool,
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int, data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
tensor_parallel_size: int) -> Tensor: tensor_parallel_size: int) -> Tensor:
""" r"""Matrix add bias: :math:`C = A + b`.
Matrix add bias: :math:`C = A + b`
:param input: matrix :math:`A` Args:
:type input: torch.tensor input (:class:`torch.tensor`): matrix :math:`A`.
:param bias: matrix :math:`b` bias (:class:`torch.tensor`): matrix :math:`b`.
:type bias: torch.tensor tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
:param output_size_per_partition: output size in each partition output_size_per_partition (int): output size in each partition.
:type output_size_per_partition: int row_rank (int): the rank of row.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism col_rank (int): the rank of column.
:type tesseract_dim: int dep_rank (int): the rank of depth.
:param row_rank: the rank of row col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type row_rank: int skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
:param col_rank: the rank of column which is preserved for kernel fusion.
:type col_rank: int data_parallel_rank (int): data parallel rank.
:param row_parallel_mode: row parallel mode pipeline_parallel_rank (int): pipeline parallel rank.
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode pipeline_parallel_size (int): pipeline parallel size.
:param col_parallel_mode: column parallel mode tensor_parallel_size (int): tensor parallel size.
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, Note:
which is preserved for kernel fusion The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type skip_bias_add: bool in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
return _Add_Bias_2p5D.apply(input, bias, output_size_per_partition, tesseract_dim, row_rank, col_rank, dep_rank, return _Add_Bias_2p5D.apply(input, bias, output_size_per_partition, tesseract_dim, row_rank, col_rank, dep_rank,
col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank, col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
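Ignoring the partitioning and rank bookkeeping, the ``skip_bias_add`` contract can be pictured with a plain single-process stand-in (illustrative only, not the parallel implementation)::

    import torch

    def add_bias_reference(x: torch.Tensor, bias: torch.Tensor, skip_bias_add: bool):
        # When skip_bias_add is True the bias is handed back separately so that a
        # later fused kernel can apply it; otherwise it is added here.
        if skip_bias_add:
            return x, bias
        return x + bias

    out = add_bias_reference(torch.randn(4, 16), torch.zeros(16), skip_bias_add=False)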
@ -666,19 +621,18 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
class _Layernorm2p5D(torch.autograd.Function): class _Layernorm2p5D(torch.autograd.Function):
""" r"""Layernorm.
Layernorm
:param input: input maxtrix Args:
:type input: torch.tensor input (:class:`torch.tensor`): input matrix.
:param E_x: mean E_x (:class:`torch.tensor`): mean.
:type E_x: torch.tensor Var_x (:class:`torch.tensor`): variance.
:param Var_x: variance hidden_size (int): hidden size.
:type Var_x: torch.tensor row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param hidden_size: hidden size
:type hidden_size: int Note:
:param row_parallel_mode: row parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
@staticmethod @staticmethod
@ -718,19 +672,18 @@ class _Layernorm2p5D(torch.autograd.Function):
def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int,
row_parallel_mode: ParallelMode) -> Tensor: row_parallel_mode: ParallelMode) -> Tensor:
""" r"""Layernorm.
Layernorm
:param input: input maxtrix Args:
:type input: torch.tensor input (:class:`torch.tensor`): input matrix.
:param E_x: mean E_x (:class:`torch.tensor`): mean.
:type E_x: torch.tensor Var_x (:class:`torch.tensor`): variance.
:param Var_x: variance hidden_size (int): hidden size.
:type Var_x: torch.tensor row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param hidden_size: hidden size
:type hidden_size: int Note:
:param row_parallel_mode: row parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _Layernorm2p5D.apply(input, E_x, Var_x, hidden_size, row_parallel_mode) return _Layernorm2p5D.apply(input, E_x, Var_x, hidden_size, row_parallel_mode)
@ -753,29 +706,31 @@ class _AllGatherTensor2p5D(torch.autograd.Function):
def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: ParallelMode) -> Tensor: def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: ParallelMode) -> Tensor:
""" r"""all gather the weight of 2.5D parallelism.
all gather the weight of 2.5D parallelism
:param inputs: input maxtrix Args:
:type inputs: torch.tensor inputs (:class:`torch.tensor`): input tensor.
:param dim: dimension of all gather dim (int): dimension of all-gather.
:type dim: int col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _AllGatherTensor2p5D.apply(inputs, dim, col_parallel_mode) return _AllGatherTensor2p5D.apply(inputs, dim, col_parallel_mode)
class SplitFirst(torch.autograd.Function): class SplitFirst(torch.autograd.Function):
""" r"""
:param inputs: input maxtrix
:type inputs: torch.tensor Args:
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism inputs (:class:`torch.tensor`): input tensor.
:type tesseract_dim: int tesseract_dim (int): dimension of TESSERACT fo 2.5D parallelism
:param col_parallel_mode: column parallel mode col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
@staticmethod @staticmethod
@ -801,16 +756,14 @@ class SplitFirst(torch.autograd.Function):
def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor: def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor:
"""Splits 2P5D tensor in specified dimension across cols """Splits 2P5D tensor in specified dimension across cols.
:param input_: Input tensor Args:
:param dim: Specified dimension in which to split input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
:type input_: torch.Tensor Returns:
:type dim: int, optional :class:`torch.tensor`: The split tensor.
:return output: Splitted tensor
:rtype output: torch.Tensor
""" """
if input_.size(dim) <= 1: if input_.size(dim) <= 1:
return input_ return input_
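Conceptually, each rank keeps only its own chunk along ``dim``; a single-process approximation with ``torch.chunk`` (the real function indexes by the column-parallel rank of the tesseract) is sketched below under those assumptions::

    import torch

    def split_reference(x: torch.Tensor, dim: int, world_size: int, rank: int) -> torch.Tensor:
        # Mirrors the guard above: nothing to split when the dimension has size <= 1.
        if x.size(dim) <= 1:
            return x
        return torch.chunk(x, world_size, dim=dim)[rank].contiguous()

    local = split_reference(torch.randn(8, 16), dim=0, world_size=2, rank=1)  # shape (4, 16)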
@ -829,11 +782,15 @@ class _ReduceTensor2p5D(torch.autograd.Function):
def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor: def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
""" r"""All-reduce the input.
All-reduce the input.
:param input_: input tensor Args:
:param parallel_mode: parallel mode input_ (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
return _ReduceTensor2p5D.apply(input_, parallel_mode) return _ReduceTensor2p5D.apply(input_, parallel_mode)
@ -851,11 +808,16 @@ class _ReduceScatterTensor2p5D(torch.autograd.Function):
def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
""" r"""Reduce-scatter the input.
Reduce-scatter the input.
:param input_: input tensor Args:
:param parallel_mode: parallel mode input_ (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to reduce.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode used for the reduce-scatter.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
return _ReduceScatterTensor2p5D.apply(input_, dim, parallel_mode) return _ReduceScatterTensor2p5D.apply(input_, dim, parallel_mode)
@ -890,12 +852,11 @@ class _RreduceByBatch2p5D(torch.autograd.Function):
def reduce_by_batch_2p5d(input_, reduce_mean: bool = False) -> Tensor: def reduce_by_batch_2p5d(input_, reduce_mean: bool = False) -> Tensor:
""" r"""All-reduce the input from the model parallel region.
All-reduce the input from the model parallel region.
:param input_: input maxtrix Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param reduce_mean: If set to ``True``, it will divide the output by column parallel size, default to False reduce_mean (bool, optional):
:type reduce_mean: bool, optional If set to ``True``, it will divide the output by column parallel size, defaults to ``False``.
""" """
return _RreduceByBatch2p5D.apply(input_, reduce_mean) return _RreduceByBatch2p5D.apply(input_, reduce_mean)

View File

@ -23,21 +23,22 @@ from ._utils import assert_tesseract_initialization, get_tesseract_dim_dep_from_
@LAYERS.register_module @LAYERS.register_module
class Linear2p5D(ParallelLayer): class Linear2p5D(ParallelLayer):
""" r"""Linear layer for 2.5D parallelism.
Linear layer for 2.5D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
:param dtype: The dtype of parameters, defaults to None which is preserved for kernel fusion, defaults to False.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of weight, defaults to kaiming uniform initializer.
:type weight_initializer: typing.Callable, optional bias_initializer (:class:`typing.Callable`, optional):
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer The initializer of bias, defaults to xavier uniform initializer.
:type bias_initializer: typing.Callable, optional
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,
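A hypothetical construction (import path and the 2.5D launch configuration are assumptions) mirrors the 2D case; setting ``skip_bias_add=True`` would defer the bias so it can be fused into a later kernel::

    import torch
    from colossalai.nn.layer.parallel_2p5d import Linear2p5D  # assumed export path

    # Assumes colossalai.launch(...) with a 2.5D setting, e.g.
    # tensor=dict(size=8, mode='2.5d', depth=2).
    proj = Linear2p5D(in_features=1024, out_features=1024, bias=True, skip_bias_add=False)
    x = torch.randn(4, 1024).cuda()   # this rank's partition of the input
    y = proj(x)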
@ -131,19 +132,16 @@ class Linear2p5D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class LayerNorm2p5D(ParallelLayer): class LayerNorm2p5D(ParallelLayer):
r""" r"""Layer Normalization for 2.5D parallelism.
Layer Normalization for 2.5D parallelism
:param normalized_shape: input shape from an expected input of size. Args:
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] normalized_shape (int): input shape from an expected input of size.
\times \ldots \times \text{normalized_shape}[-1]]` :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
If a single integer is used, it is treated as a singleton list, and this module will \times \ldots \times \text{normalized_shape}[-1]]`
normalize over the last dimension which is expected to be of that specific size. If a single integer is used, it is treated as a singleton list, and this module will
:type normalized_shape: int normalize over the last dimension which is expected to be of that specific size.
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05 eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
:type eps: float, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
""" """
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None): def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
super().__init__() super().__init__()
@ -204,27 +202,24 @@ class LayerNorm2p5D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class PatchEmbedding2p5D(ParallelLayer): class PatchEmbedding2p5D(ParallelLayer):
""" r"""2D Image to Patch Embedding.
2D Image to Patch Embedding
:param img_size: image size Args:
:type img_size: int img_size (int): image size.
:param patch_size: patch size patch_size (int): patch size.
:type patch_size: int in_chans (int): number of channels of input image.
:param in_chans: number of channels of input image embed_size (int): size of embedding.
:type in_chans: int dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param embed_size: size of embedding flatten (bool, optional): whether to flatten output tensor, defaults to True.
:type embed_size: int weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param flatten: whether to flatten output tensor, defaults to True The initializer of bias, defaults to xavier uniform initializer.
:type flatten: bool, optional position_embed_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of position embedding, defaults to zeros initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer More details about ``initializer`` please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
img_size: int, img_size: int,
@ -306,21 +301,33 @@ class PatchEmbedding2p5D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Embedding2p5D(ParallelLayer): class Embedding2p5D(ParallelLayer):
""" r"""Embedding for 2.5D parallelism.
Embedding for 2.5D parallelism
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
""" """
def __init__(self, def __init__(self,
num_embeddings: int, num_embeddings: int,
@ -376,18 +383,31 @@ class Embedding2p5D(ParallelLayer):
class VocabParallelEmbedding2p5D(torch.nn.Module): class VocabParallelEmbedding2p5D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension. """Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
num_embeddings: int, num_embeddings: int,
@ -455,23 +475,21 @@ class VocabParallelEmbedding2p5D(torch.nn.Module):
@LAYERS.register_module @LAYERS.register_module
class Classifier2p5D(ParallelLayer): class Classifier2p5D(ParallelLayer):
""" r"""Classifier for 2.5D parallelism.
Classifier for 2.5D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,
@ -537,23 +555,21 @@ class Classifier2p5D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelClassifier2p5D(ParallelLayer): class VocabParallelClassifier2p5D(ParallelLayer):
""" r"""Vocab parallel classifier layer for 2.5D parallelism.
Vocab parallel classifier layer for 2.5D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,

View File

@ -88,27 +88,22 @@ def linear_3d(input_: Tensor,
input_dim: int = 0, input_dim: int = 0,
weight_dim: int = -1, weight_dim: int = -1,
output_dim: int = 0) -> Tensor: output_dim: int = 0) -> Tensor:
""" r"""Linear layer for 3D parallelism.
Linear layer for 3D parallelism
:param input_: matrix of input Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param weight: matrix of weight weight (:class:`torch.tensor`): matrix of weight.
:type weight: torch.tensor bias (:class:`torch.tensor`): matrix of bias.
:param bias: matrix of bias input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
:type bias: torch.tensor, optional weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
:param input_parallel_mode: input parallel mode output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode input_dim (int, optional): dimension of input, defaults to 0.
:param weight_parallel_mode: weight parallel mode weight_dim (int, optional): dimension of weight, defaults to -1.
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode output_dim (int, optional): dimension of output, defaults to 0.
:param output_parallel_mode: output parallel mode
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param input_dim: dimension of input, defaults to 0 The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type input_dim: int, optional in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param weight_dim: dimension of weight, defaults to -1
:type weight_dim: int, optional
:param output_dim: dimension of output, defaults to 0
:type output_dim: int, optional
""" """
return _Linear3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode, return _Linear3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode,
input_dim, weight_dim, output_dim) input_dim, weight_dim, output_dim)
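A hedged sketch of calling the functional form (the import path is an assumption; ``PARALLEL_3D_INPUT`` and ``PARALLEL_3D_WEIGHT`` follow the 3D defaults used later in this file, and ``PARALLEL_3D_OUTPUT`` is assumed to exist alongside them; all tensors are per-rank partitions)::

    import torch
    from colossalai.context import ParallelMode
    from colossalai.nn.layer.parallel_3d import linear_3d  # assumed export path

    x = torch.randn(4, 256).cuda()         # partition of the input
    weight = torch.randn(256, 512).cuda()  # partition of the weight
    bias = torch.zeros(512).cuda()         # partition of the bias
    out = linear_3d(x, weight, bias,
                    ParallelMode.PARALLEL_3D_INPUT,
                    ParallelMode.PARALLEL_3D_WEIGHT,
                    ParallelMode.PARALLEL_3D_OUTPUT)  # assumed member name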
@ -174,21 +169,19 @@ class _Classifier3D(torch.autograd.Function):
def classifier_3d(input_: Tensor, weight: Tensor, bias: Optional[Tensor], input_parallel_mode: ParallelMode, def classifier_3d(input_: Tensor, weight: Tensor, bias: Optional[Tensor], input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor: weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor:
""" r"""3D parallel classifier.
3D parallel classifier
:param input_: matrix of input Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param weight: matrix of weight weight (:class:`torch.tensor`): matrix of weight.
:type weight: torch.tensor bias (:class:`torch.tensor`): matrix of bias.
:param bias: matrix of bias input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
:type bias: torch.tensor, optional weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
:param input_parallel_mode: input parallel mode output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: weight parallel mode Note:
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:param output_parallel_mode: output parallel mode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
""" """
return _Classifier3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode) return _Classifier3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode)
@ -244,48 +237,44 @@ class _Layernorm3D(torch.autograd.Function):
def layernorm_3d(input_: Tensor, weight: Tensor, bias: Tensor, normalized_shape: int, eps: float, def layernorm_3d(input_: Tensor, weight: Tensor, bias: Tensor, normalized_shape: int, eps: float,
input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode, input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode,
output_parallel_mode: ParallelMode) -> Tensor: output_parallel_mode: ParallelMode) -> Tensor:
r""" r"""3D parallel Layernorm.
3D parallel Layernorm
:param input_: input maxtrix Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param weight: matrix of weight weight (:class:`torch.tensor`): matrix of weight.
:type weight: torch.tensor bias (:class:`torch.tensor`): matrix of bias.
:param bias: matrix of bias normalized_shape (int): input shape from an expected input of size.
:type bias: torch.tensor :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
:param normalized_shape: input shape from an expected input of size. \times \ldots \times \text{normalized_shape}[-1]]`
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] If a single integer is used, it is treated as a singleton list, and this module will
\times \ldots \times \text{normalized_shape}[-1]]` normalize over the last dimension which is expected to be of that specific size.
If a single integer is used, it is treated as a singleton list, and this module will eps (float): a value added to the denominator for numerical stability
normalize over the last dimension which is expected to be of that specific size. input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
:type normalized_shape: int weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
:param eps: a value added to the denominator for numerical stability eps (float): a value added to the denominator for numerical stability.
:type eps: float
:param input_parallel_mode: input parallel mode Note:
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:param weight_parallel_mode: weight parallel mode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param output_parallel_mode: output parallel mode
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
""" """
return _Layernorm3D.apply(input_, weight, bias, normalized_shape, eps, input_parallel_mode, weight_parallel_mode, return _Layernorm3D.apply(input_, weight, bias, normalized_shape, eps, input_parallel_mode, weight_parallel_mode,
output_parallel_mode) output_parallel_mode)
def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""Splits 3D parallel tensor in specified dimension r"""Splits 3D parallel tensor in specified dimension.
:param tensor: Input tensor Args:
:param dim: Specified dimension in which to split tensor (:class:`torch.tensor`): Input tensor.
:param parallel_mode: Parallel mode dim (int): Specified dimension in which to split.
:param weight_parallel_mode: Weight parallel mode parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode.
:type tensor: torch.Tensor Returns:
:type dim: int :class:`torch.tensor`: The split tensor.
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode
:return output: Splitted tensor Note:
:rtype output: torch.Tensor The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
if tensor.size(dim) <= 1: if tensor.size(dim) <= 1:
return tensor return tensor
@ -298,17 +287,20 @@ def split_batch_3d(input_: Tensor,
dim: int = 0, dim: int = 0,
input_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_INPUT, input_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_INPUT,
weight_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_WEIGHT) -> Tensor: weight_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_WEIGHT) -> Tensor:
"""Splits 3D tensor in batch r"""Splits 3D tensor in batch.
:param input_: Input tensor
:param dim: Specified dimension in which to split Args:
:param input_parallel_mode: Input parallel mode input_ (:class:`torch.tensor`): Input tensor.
:param weight_parallel_mode: Weight parallel mode dim (int): Specified dimension in which to split.
:type input_: torch.Tensor input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): input parallel mode.
:type dim: int, optional weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): weight parallel mode.
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional Returns:
:return output: Splitted tensor :class:`torch.tensor`: The tensor has been split.
:rtype output: torch.Tensor
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
if input_.size(dim) <= 1: if input_.size(dim) <= 1:
return input_ return input_
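As an illustrative sketch (import path assumed), the batch split is typically applied to the globally loaded batch before it enters the model, using the default input/weight parallel modes from the signature above::

    import torch
    from colossalai.nn.layer.parallel_3d import split_batch_3d  # assumed export path

    global_images = torch.randn(64, 3, 224, 224).cuda()
    # Each rank keeps only its share of the batch along dim 0.
    local_images = split_batch_3d(global_images, dim=0)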
@ -333,11 +325,15 @@ class _ReduceTensor3D(torch.autograd.Function):
def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor: def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor:
""" r"""All-reduce the input
All-reduce the input
:param tensor: Input tensor Args:
:param parallel_mode: Parallel mode tensor (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _ReduceTensor3D.apply(tensor, parallel_mode) return _ReduceTensor3D.apply(tensor, parallel_mode)
@ -358,11 +354,16 @@ class _AllGatherTensor3D(torch.autograd.Function):
def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
""" r"""All-reduce the gradient in backward pass.
All-reduce the gradient in backward pass.
:param tensor: Input tensor Args:
:param parallel_mode: Parallel mode tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to gather.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _AllGatherTensor3D.apply(tensor, dim, parallel_mode) return _AllGatherTensor3D.apply(tensor, dim, parallel_mode)
@ -382,12 +383,16 @@ class _ReduceScatterTensor3D(torch.autograd.Function):
def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
""" r"""Reduce-scatter the input.
Reduce-scatter the input.
:param tensor: Input tensor Args:
:param dim: Dimension to scatter tensor (:class:`torch.tensor`): Input tensor.
:param parallel_mode: Parallel mode dim (int): Dimension to scatter.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
return _ReduceScatterTensor3D.apply(tensor, dim, parallel_mode) return _ReduceScatterTensor3D.apply(tensor, dim, parallel_mode)
@ -423,34 +428,33 @@ def reduce_by_batch_3d(tensor: Tensor,
input_parallel_mode: ParallelMode, input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode,
reduce_mean: bool = False) -> Tensor: reduce_mean: bool = False) -> Tensor:
""" r"""All-reduce the input from the model parallel region.
All-reduce the input from the model parallel region.
:param input_: input maxtrix Args:
:type input_: torch.tensor input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
:param input_parallel_mode: input parallel mode weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode reduce_mean (bool, optional): If set to ``True``, it will divide the output by
:param weight_parallel_mode: weight parallel mode (input parallel size * weight parallel size), default to False.
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param reduce_mean: If set to ``True``, it will divide the output by (input parallel size * weight parallel size), Note:
default to False The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type reduce_mean: int, optional in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
return _ReduceByBatch3D.apply(tensor, input_parallel_mode, weight_parallel_mode, reduce_mean) return _ReduceByBatch3D.apply(tensor, input_parallel_mode, weight_parallel_mode, reduce_mean)
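The collective helpers above mirror the usual all-reduce / all-gather / reduce-scatter primitives, restricted to one of the 3D process groups. A hedged sketch, assuming an initialized 3D context and the same approximate import path as ``split_batch_3d``:

    import torch
    from colossalai.context import ParallelMode
    # approximate import path for the 3D collective helpers
    from colossalai.nn.layer.parallel_3d._operation import (
        all_gather_tensor_3d, reduce_by_batch_3d, reduce_scatter_tensor_3d, reduce_tensor_3d)

    shard = torch.randn(4, 256).cuda()                                          # per-rank shard
    full = all_gather_tensor_3d(shard, -1, ParallelMode.PARALLEL_3D_WEIGHT)     # concat shards on the last dim
    back = reduce_scatter_tensor_3d(full, -1, ParallelMode.PARALLEL_3D_WEIGHT)  # sum, then re-shard
    summed = reduce_tensor_3d(shard, ParallelMode.PARALLEL_3D_OUTPUT)           # plain all-reduce

    correct = torch.tensor([32.0]).cuda()          # e.g. per-rank count of correct predictions
    mean_correct = reduce_by_batch_3d(correct, ParallelMode.PARALLEL_3D_INPUT,
                                      ParallelMode.PARALLEL_3D_WEIGHT, reduce_mean=True)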
class _BroadcastWeight3D_FromDiagonal(torch.autograd.Function): class _BroadcastWeight3D_FromDiagonal(torch.autograd.Function):
""" r"""broadcast weight from diagonal.
broadcast weight from diagonal
:param input_: input maxtrix Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param input_parallel_mode: input parallel mode input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
:param weight_parallel_mode: weight parallel mode output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: output parallel mode Note:
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
@staticmethod @staticmethod

View File

@ -24,19 +24,16 @@ from ._utils import get_depth_from_env, get_last_group, get_parallel_mode_from_e
@LAYERS.register_module @LAYERS.register_module
class LayerNorm3D(ParallelLayer): class LayerNorm3D(ParallelLayer):
r""" r"""Layer Normalization for 3D parallelism.
Layer Normalization for 3D parallelism
:param normalized_shape: input shape from an expected input of size. Args:
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] normalized_shape (int): input shape from an expected input of size.
\times \ldots \times \text{normalized_shape}[-1]]` :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
If a single integer is used, it is treated as a singleton list, and this module will \times \ldots \times \text{normalized_shape}[-1]]`
normalize over the last dimension which is expected to be of that specific size. If a single integer is used, it is treated as a singleton list, and this module will
:type normalized_shape: int normalize over the last dimension which is expected to be of that specific size.
:param eps: a value added to the denominator for numerical stability, defaults to 1e-12 eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-12.
:type eps: float, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
""" """
def __init__(self, normalized_shape: int, eps: float = 1e-12, dtype=None): def __init__(self, normalized_shape: int, eps: float = 1e-12, dtype=None):
@ -71,21 +68,20 @@ class LayerNorm3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Linear3D(ParallelLayer): class Linear3D(ParallelLayer):
""" r"""Linear layer for 3D parallelism.
Linear layer for 3D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of bias, defaults to xavier uniform initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer More details about ``initializer`` please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
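Constructing the 3D layers looks just like their ``torch.nn`` counterparts; a sketch, assuming an initialized 3D parallel context and that the classes are re-exported from ``colossalai.nn``:

    import torch
    import colossalai.nn as col_nn   # assumed re-export location of the 3D layers

    norm = col_nn.LayerNorm3D(1024, eps=1e-12, dtype=torch.float16)
    fc = col_nn.Linear3D(1024, 4096, bias=True, dtype=torch.float16)
    # the weight and bias of both layers are sharded over the input/weight/output
    # process groups, so each rank stores only a fraction of the 1024 x 4096 matrix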
@ -146,23 +142,21 @@ class Linear3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Classifier3D(ParallelLayer): class Classifier3D(ParallelLayer):
""" r"""Classifier for 3D parallelism.
Classifier for 3D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -225,23 +219,21 @@ class Classifier3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelClassifier3D(ParallelLayer): class VocabParallelClassifier3D(ParallelLayer):
""" r"""Vocab parallel classifier layer for 3D parallelism.
Vocab parallel classifier layer for 2D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -311,27 +303,24 @@ class VocabParallelClassifier3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class PatchEmbedding3D(ParallelLayer): class PatchEmbedding3D(ParallelLayer):
""" r"""2D Image to Patch Embedding.
2D Image to Patch Embedding
:param img_size: image size Args:
:type img_size: int img_size (int): image size.
:param patch_size: patch size patch_size (int): patch size.
:type patch_size: int in_chans (int): number of channels of input image.
:param in_chans: number of channels of input image embed_size (int): size of embedding.
:type in_chans: int dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param embed_size: size of embedding flatten (bool, optional): whether to flatten output tensor, defaults to True.
:type embed_size: int weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param flatten: whether to flatten output tensor, defaults to True The initializer of bias, defaults to xavier uniform initializer.
:type flatten: bool, optional position_embed_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of position embedding, defaults to zeros initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer More details about ``initializer`` please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -419,21 +408,33 @@ class PatchEmbedding3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Embedding3D(ParallelLayer): class Embedding3D(ParallelLayer):
""" r"""Embedding for 3D parallelism.
Embedding for 3D parallelism
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
""" """
def __init__(self, def __init__(self,
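The extra keyword arguments listed above are forwarded to ``torch.nn.functional.embedding``, so they can be passed straight to the constructor; a hedged sketch (re-export from ``colossalai.nn`` is assumed):

    import torch
    import colossalai.nn as col_nn   # assumed re-export location

    emb = col_nn.Embedding3D(num_embeddings=50304,
                             embedding_dim=1024,
                             padding_idx=0,
                             dtype=torch.float16,
                             max_norm=1.0,             # forwarded to F.embedding
                             scale_grad_by_freq=True)  # forwarded to F.embedding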
@ -491,20 +492,33 @@ class Embedding3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelEmbedding3D(torch.nn.Module): class VocabParallelEmbedding3D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension. r"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,

View File

@ -24,14 +24,13 @@ class TransformerSelfAttentionRing(nn.Module):
Self-attention layer takes input with size [b, s, h] Self-attention layer takes input with size [b, s, h]
and returns output of the same size. and returns output of the same size.
:param hidden_size: hidden size Args:
:type hidden_size: int hidden_size (int): hidden size.
:param kv_channels: channels of key/value tensor num_attention_heads (int): number of attention heads.
:type kv_channels: int attention_dropout (float): dropout probability for attention layer.
:param num_attention_heads: number of attention heads attention_mask_func (:class:`typing.Callable`): Mask function to be applied.
:type num_attention_heads: int layer_number (int): number of layers.
:param attention_dropout: dropout probability for attention layer
:type attention_dropout: float
""" """
def __init__(self, def __init__(self,

View File

@ -38,11 +38,16 @@ class CheckpointModule(nn.Module):
def divide(numerator, denominator): def divide(numerator, denominator):
"""Only allow exact division """Only allow exact division.
:param numerator: Numerator of the division Args:
:param denominator: Denominator of the division numerator (int): Numerator of the division.
denominator (int): Denominator of the division.
Returns:
int: the result of exact division.
""" """
assert denominator != 0, 'denominator can not be zero'
assert numerator % denominator == 0, \ assert numerator % denominator == 0, \
'{} is not divisible by {}'.format(numerator, denominator) '{} is not divisible by {}'.format(numerator, denominator)
return numerator // denominator return numerator // denominator
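A quick sketch of the contract ``divide`` enforces (import path approximate):

    # approximate import path -- divide is defined alongside CheckpointModule above
    from colossalai.nn.layer.utils import divide

    assert divide(12, 4) == 3
    # divide(10, 4) raises AssertionError: "10 is not divisible by 4"
    # divide(10, 0) raises AssertionError: "denominator can not be zero"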

View File

@ -15,11 +15,16 @@ from ..utils import to_2tuple
def drop_path(x, drop_prob: float = 0., training: bool = False): def drop_path(x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument. 'survival rate' as the argument.
Args:
        drop_prob (float, optional): probability of dropping path, defaults to 0.0.
        training (bool, optional): whether the model is in training mode, defaults to False.
""" """
if drop_prob == 0. or not training: if drop_prob == 0. or not training:
return x return x
@ -35,6 +40,9 @@ class DropPath(nn.Module):
""" """
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
Args:
        drop_prob (float, optional): probability of dropping path, defaults to None.
""" """
def __init__(self, drop_prob=None): def __init__(self, drop_prob=None):
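Both the functional and module forms drop whole samples; in the standard stochastic-depth formulation the surviving samples are rescaled by ``1 / (1 - drop_prob)``. A sketch with an approximate import path:

    import torch
    # approximate import path for the helpers defined in this file
    from colossalai.nn.layer.vanilla.layers import DropPath, drop_path

    x = torch.ones(4, 3)                              # 4 samples in the batch
    y = drop_path(x, drop_prob=0.5, training=True)    # ~half the rows zeroed, survivors scaled by 2
    layer = DropPath(drop_prob=0.2)
    layer.train()
    z = layer(x)                                      # same behaviour, wrapped as a module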
@ -46,7 +54,19 @@ class DropPath(nn.Module):
class WrappedDropout(nn.Module): class WrappedDropout(nn.Module):
"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager. r"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager. During training, randomly zeroes
some elements of the input tensor with probability p using samples from a Bernoulli distribution. Each
channel will be zeroed out independently on every forward call. Furthermore, the outputs are scaled by a factor of
1/(1-p) during training. This means that during evaluation the module simply computes an identity function.
Args:
        p (float, optional): probability of an element to be zeroed, defaults to 0.5.
        inplace (bool, optional): whether to do dropout in-place, defaults to False.
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
def __init__(self, p: float = 0.5, inplace: bool = False, mode=None): def __init__(self, p: float = 0.5, inplace: bool = False, mode=None):
@ -74,8 +94,16 @@ class WrappedDropout(nn.Module):
class WrappedDropPath(nn.Module): class WrappedDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). r"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Here, it is wrapped with the context of seed manager. Here, it is wrapped with the context of seed manager.
Args:
        p (float, optional): probability of dropping path, defaults to 0.0.
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
def __init__(self, p: float = 0., mode=None): def __init__(self, p: float = 0., mode=None):
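A sketch of plugging the seed-managed wrappers into a block; the ``TENSOR`` mode and the import path are assumptions, and an initialized parallel context is required so the corresponding seeds exist:

    import torch.nn as nn
    from colossalai.context import ParallelMode
    # approximate import path for the wrapped layers defined in this file
    from colossalai.nn.layer.vanilla.layers import WrappedDropout, WrappedDropPath

    block = nn.Sequential(
        nn.Linear(1024, 4096),
        WrappedDropout(p=0.1, mode=ParallelMode.TENSOR),   # mask drawn under the TENSOR-mode seed
        WrappedDropPath(p=0.1, mode=ParallelMode.TENSOR),  # stochastic depth under the same seed context
    )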
@ -101,27 +129,25 @@ class WrappedDropPath(nn.Module):
@LAYERS.register_module @LAYERS.register_module
class VanillaPatchEmbedding(nn.Module): class VanillaPatchEmbedding(nn.Module):
""" r"""
2D Image to Patch Embedding 2D Image to Patch Embedding
:param img_size: image size Args:
:type img_size: int img_size (int): image size.
:param patch_size: patch size patch_size (int): patch size.
:type patch_size: int in_chans (int): number of channels of input image.
:param in_chans: number of channels of input image embed_size (int): size of embedding.
:type in_chans: int dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param embed_size: size of embedding flatten (bool, optional): whether to flatten output tensor, defaults to True.
:type embed_size: int weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param flatten: whether to flatten output tensor, defaults to True The initializer of bias, defaults to xavier uniform initializer.
:type flatten: bool, optional position_embed_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of position embedding, defaults to zeros initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer More details about initializer please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -174,23 +200,21 @@ class VanillaPatchEmbedding(nn.Module):
@LAYERS.register_module @LAYERS.register_module
class VanillaClassifier(nn.Module): class VanillaClassifier(nn.Module):
""" r"""Dense linear classifier.
Dense linear classifier
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type weight: torch.nn.Parameter, optional flatten (bool, optional): whether to flatten output tensor, defaults to True.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about initializer please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
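A shape sketch for ``VanillaPatchEmbedding`` (import path approximate):

    import torch
    from colossalai.nn.layer.vanilla.layers import VanillaPatchEmbedding  # approximate path

    patch_embed = VanillaPatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_size=768)
    imgs = torch.randn(2, 3, 224, 224)
    tokens = patch_embed(imgs)
    # 224 / 16 = 14 patches per side -> 196 patch tokens of dimension 768 per image
    # (plus a class token if the layer prepends one; it also owns the position embedding)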

View File

@ -9,12 +9,11 @@ from colossalai.registry import LAYERS
@LAYERS.register_module @LAYERS.register_module
class LambdaWrapper(nn.Module): class LambdaWrapper(nn.Module):
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them """Wrap a function to nn.Module, which takes a config of layers and can fully access them.
:param func: User customed function Args:
:type func: Callable func (``Callable``): User customed function.
:param layers_cfg: Config of layers, defaults to None layers_cfg (dict, optional): Config of layers, defaults to None.
:type layers_cfg: dict, optional
""" """
def __init__(self, func, layers_cfg: dict = None): def __init__(self, func, layers_cfg: dict = None):

View File

@ -86,12 +86,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
@LOSSES.register_module @LOSSES.register_module
class VocabParallelCrossEntropyLoss1D(_Loss): class VocabParallelCrossEntropyLoss1D(_Loss):
""" """Vocab parallel cross entropy loss for 1D parallelism.
Vocab parallel cross entropy loss for 1D parallelism
:param reduction: whether to average the loss, defaults to True Args:
reduction (bool, optional): whether to average the loss, defaults to True.
:type reduction: bool, optional
""" """
def __init__(self, reduction=True): def __init__(self, reduction=True):
@ -99,10 +97,11 @@ class VocabParallelCrossEntropyLoss1D(_Loss):
self.reduction_mean = reduction self.reduction_mean = reduction
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
loss = _VocabParallelCrossEntropy1D.apply(logits, targets) loss = _VocabParallelCrossEntropy1D.apply(logits, targets)
if self.reduction_mean: if self.reduction_mean:

View File

@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@LOSSES.register_module @LOSSES.register_module
class CrossEntropyLoss2D(_Loss): class CrossEntropyLoss2D(_Loss):
""" r"""Cross entropy loss for 2D parallelism
Cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True Args:
:param args: Args for loss function reduction (bool, optional): whether to average the loss, defaults to True.
:param kwargs: Kwargs for loss function
:type reduction: bool, optional The ``args`` and ``kwargs`` should include parameters below:
::
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
""" """
def __init__(self, reduction=True, *args, **kwargs): def __init__(self, reduction=True, *args, **kwargs):
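Anything beyond ``reduction`` is forwarded to ``torch.nn.functional.cross_entropy``, so the kwargs listed above are passed at construction time; a hedged sketch (export location assumed, and the forward call needs an initialized 2D tensor-parallel mesh):

    from colossalai.nn.loss import CrossEntropyLoss2D   # assumed export location

    criterion = CrossEntropyLoss2D(reduction=True,
                                   ignore_index=-100,
                                   label_smoothing=0.1)
    # loss = criterion(logits, targets)   # targets are split across the 2D mesh internally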
@ -31,10 +39,14 @@ class CrossEntropyLoss2D(_Loss):
self.loss_kwargs = kwargs self.loss_kwargs = kwargs
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
Returns:
float: the loss between logits and targets.
""" """
targets = split_tensor_2d(targets) targets = split_tensor_2d(targets)
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs) loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
@ -116,12 +128,10 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
@LOSSES.register_module @LOSSES.register_module
class VocabParallelCrossEntropyLoss2D(_Loss): class VocabParallelCrossEntropyLoss2D(_Loss):
""" """Vocab parallel cross entropy loss for 2D parallelism.
Vocab parallel cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True Args:
reduction (bool, optional): whether to average the loss, defaults to True.
:type reduction: bool, optional
""" """
def __init__(self, reduction=True): def __init__(self, reduction=True):
@ -129,10 +139,11 @@ class VocabParallelCrossEntropyLoss2D(_Loss):
self.reduction_mean = reduction self.reduction_mean = reduction
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
targets = split_tensor_2d(targets) targets = split_tensor_2d(targets)
loss = _VocabParallelCrossEntropy2D.apply( loss = _VocabParallelCrossEntropy2D.apply(

View File

@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@LOSSES.register_module @LOSSES.register_module
class CrossEntropyLoss2p5D(_Loss): class CrossEntropyLoss2p5D(_Loss):
""" r"""Cross entropy loss for 2.5D parallelism
Cross entropy loss for 2.5D parallelism
:param reduction: whether to average the loss, defaults to True Args:
:param args: Args for loss function reduction (bool, optional): whether to average the loss, defaults to True.
:param kwargs: Kwargs for loss function
:type reduction: bool, optional The ``args`` and ``kwargs`` should include parameters below:
::
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
""" """
def __init__(self, reduction=True, *args, **kwargs): def __init__(self, reduction=True, *args, **kwargs):
super().__init__() super().__init__()
@ -30,10 +38,11 @@ class CrossEntropyLoss2p5D(_Loss):
self.loss_kwargs = kwargs self.loss_kwargs = kwargs
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
targets = split_tensor_2p5d(targets) targets = split_tensor_2p5d(targets)
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs) loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
@ -115,19 +124,19 @@ class VocabParallelCrossEntropyLoss2p5D(_Loss):
""" """
Vocab parallel cross entropy loss for 2.5D parallelism Vocab parallel cross entropy loss for 2.5D parallelism
:param reduction: whether to average the loss, defaults to True Args:
reduction (bool, optional): whether to average the loss, defaults to True.
:type reduction: bool, optional
""" """
def __init__(self, reduction=True): def __init__(self, reduction=True):
super().__init__() super().__init__()
self.reduction_mean = reduction self.reduction_mean = reduction
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
targets = split_tensor_2p5d(targets) targets = split_tensor_2p5d(targets)
loss = _VocabParallelCrossEntropy2p5D.apply(logits, targets) loss = _VocabParallelCrossEntropy2p5D.apply(logits, targets)

View File

@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@LOSSES.register_module @LOSSES.register_module
class CrossEntropyLoss3D(_Loss): class CrossEntropyLoss3D(_Loss):
""" r"""Cross entropy loss for 3D parallelism.
Cross entropy loss for 3D parallelism
:param reduction: whether to average the loss, defaults to True Args:
:param args: Args for loss function reduction (bool, optional): whether to average the loss, defaults to True.
:param kwargs: Kwargs for loss function
:type reduction: bool, optional The ``args`` and ``kwargs`` should include parameters below:
::
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
""" """
def __init__(self, reduction=True, *args, **kwargs): def __init__(self, reduction=True, *args, **kwargs):
@ -32,10 +40,11 @@ class CrossEntropyLoss3D(_Loss):
self.loss_kwargs = kwargs self.loss_kwargs = kwargs
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
targets = split_tensor_3d(targets, 0, self.input_parallel_mode) targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
@ -109,12 +118,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
@LOSSES.register_module @LOSSES.register_module
class VocabParallelCrossEntropyLoss3D(_Loss): class VocabParallelCrossEntropyLoss3D(_Loss):
""" """Vocab parallel cross entropy loss for 2D parallelism.
Vocab parallel cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True Args:
reduction (bool, optional): whether to average the loss, defaults to True.
:type reduction: bool, optional
""" """
def __init__(self, reduction=True): def __init__(self, reduction=True):
@ -125,10 +132,11 @@ class VocabParallelCrossEntropyLoss3D(_Loss):
self.reduction_mean = reduction self.reduction_mean = reduction
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
targets = split_tensor_3d(targets, 0, self.input_parallel_mode) targets = split_tensor_3d(targets, 0, self.input_parallel_mode)

View File

@ -6,13 +6,25 @@ from colossalai.context.moe_context import MOE_CONTEXT
@LOSSES.register_module @LOSSES.register_module
class MoeCrossEntropyLoss(_Loss): class MoeCrossEntropyLoss(_Loss):
"""torch.nn.CrossEntropyLoss added with auxiliary loss. r"""torch.nn.CrossEntropyLoss added with auxiliary loss.
:param aux_weight: Weight of auxiliary loss in total loss Args:
:param args: Args in CrossEntropyLoss input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
:param kwargs: Kwargs in CrossEntropyLoss target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
        aux_weight (float, optional): Weight of auxiliary loss in total loss, defaults to 0.01.
:type aux_weight: float, optional The ``args`` and ``kwargs`` should include parameters below:
::
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
reduction (str, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
""" """
def __init__(self, aux_weight: float = 0.01, *args, **kwargs): def __init__(self, aux_weight: float = 0.01, *args, **kwargs):
@ -21,6 +33,16 @@ class MoeCrossEntropyLoss(_Loss):
self.aux_weight = aux_weight self.aux_weight = aux_weight
def forward(self, *args): def forward(self, *args):
"""
The ``args`` should at least include parameters below:
::
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
main_loss = self.loss(*args) main_loss = self.loss(*args)
aux_loss = MOE_CONTEXT.get_loss() aux_loss = MOE_CONTEXT.get_loss()
return main_loss + self.aux_weight * aux_loss return main_loss + self.aux_weight * aux_loss
@ -30,13 +52,11 @@ class MoeCrossEntropyLoss(_Loss):
class MoeLoss(_Loss): class MoeLoss(_Loss):
"""A wrapper class for any loss module to add with auxiliary loss. """A wrapper class for any loss module to add with auxiliary loss.
:param aux_weight: Weight of auxiliary loss in total loss Args:
:param loss_fn: Loss function aux_weight (float): Weight of auxiliary loss in total loss.
:param args: Args in loss function loss_fn (``Callable``): Loss function.
:param kwargs: Kwargs in loss function args (list): Args in loss function.
kwargs (dict): Kwargs in loss function
:type aux_weight: float
:type loss_fn: Callable
""" """
def __init__(self, aux_weight: float, loss_fn, *args, **kwargs): def __init__(self, aux_weight: float, loss_fn, *args, **kwargs):
@ -45,6 +65,16 @@ class MoeLoss(_Loss):
self.aux_weight = aux_weight self.aux_weight = aux_weight
def forward(self, *args, **kwargs): def forward(self, *args, **kwargs):
"""
The ``args`` and ``kwargs`` should at least include parameters below:
::
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
Note:
The ``args`` and ``kwargs`` may include different parameters varying with different loss function.
"""
main_loss = self.loss_fn(*args, **kwargs) main_loss = self.loss_fn(*args, **kwargs)
aux_loss = MOE_CONTEXT.get_loss() aux_loss = MOE_CONTEXT.get_loss()
return main_loss + self.aux_weight * aux_loss return main_loss + self.aux_weight * aux_loss
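A sketch of wrapping an ordinary loss with the MoE auxiliary term; it assumes ``MoeLoss`` instantiates ``loss_fn`` with the extra args/kwargs, as the docstring above suggests, and that both classes are exported from ``colossalai.nn.loss``:

    import torch.nn as nn
    from colossalai.nn.loss import MoeCrossEntropyLoss, MoeLoss   # assumed export location

    criterion = MoeLoss(aux_weight=0.01, loss_fn=nn.CrossEntropyLoss, label_smoothing=0.1)
    # total = criterion(logits, targets)
    # internally: loss_fn(logits, targets) + aux_weight * MOE_CONTEXT.get_loss()

    ce_with_aux = MoeCrossEntropyLoss(aux_weight=0.01, ignore_index=-100)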

View File

@ -36,14 +36,12 @@ class CosineAnnealingLR(_CosineAnnealingLR):
.. _SGDR\: Stochastic Gradient Descent with Warm Restarts: .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
https://arxiv.org/abs/1608.03983 https://arxiv.org/abs/1608.03983
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int eta_min (int, optional): Minimum learning rate, defaults to 0.
:param eta_min: Minimum learning rate, defaults to 0 last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type eta_min: int, optional the schedule is started from the beginning and the initial lr is set to lr.
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, eta_min: int = 0, last_epoch: int = -1, **kwargs): def __init__(self, optimizer, total_steps: int, eta_min: int = 0, last_epoch: int = -1, **kwargs):
@ -54,16 +52,13 @@ class CosineAnnealingLR(_CosineAnnealingLR):
class CosineAnnealingWarmupLR(WarmupScheduler): class CosineAnnealingWarmupLR(WarmupScheduler):
"""Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied. """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int warmup_steps (int, optional): Number of warmup steps, defaults to 0.
:param warmup_steps: Number of warmup steps, defaults to 0 eta_min (int, optional): Minimum learning rate, defaults to 0.
:type warmup_steps: int, optional last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:param eta_min: Minimum learning rate, defaults to 0 the schedule is started from the beginning and the initial lr is set to lr.
:type eta_min: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: float = 0., last_epoch: int = -1): def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: float = 0., last_epoch: int = -1):
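A runnable-style sketch of the warmup-plus-cosine schedule (export location assumed):

    import torch
    from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR   # assumed export location

    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    total_steps = 1000
    scheduler = CosineAnnealingWarmupLR(optimizer, total_steps=total_steps,
                                        warmup_steps=100, eta_min=1e-5)
    for step in range(total_steps):
        optimizer.step()        # ... forward/backward elided ...
        scheduler.step()        # linear ramp for 100 steps, then cosine decay down to eta_min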
@ -76,14 +71,12 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
class FlatAnnealingLR(DelayerScheduler): class FlatAnnealingLR(DelayerScheduler):
"""Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay. """Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
:param pct_start: Percent of steps before starting learning rate decay last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type pct_start: float the schedule is started from the beginning and the initial lr is set to lr.
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, pct_start: float = 0.72, last_epoch: int = -1, **kwargs): def __init__(self, optimizer, total_steps: int, pct_start: float = 0.72, last_epoch: int = -1, **kwargs):
@ -102,18 +95,14 @@ class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
applied, and then the learning rate will be a fixed value before starting decay. applied, and then the learning rate will be a fixed value before starting decay.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int warmup_steps (int, optional): Number of warmup steps, defaults to 0.
:param warmup_steps: Number of warmup steps, defaults to 0 pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
:type warmup_steps: int, optional eta_min (int, optional): Minimum learning rate, defaults to 0.
:param pct_start: Percent of steps before starting learning rate decay last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type pct_start: float the schedule is started from the beginning and the initial lr is set to lr.
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, pct_start: float = 0.72, eta_min: int = 0, def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, pct_start: float = 0.72, eta_min: int = 0,

View File

@ -14,16 +14,15 @@ class _enable_get_lr_call:
class DelayerScheduler(_LRScheduler): class DelayerScheduler(_LRScheduler):
""" Starts with a flat lr schedule until it reaches N epochs the applies a scheduler """Starts with a flat lr schedule until it reaches N epochs then applies
    the specific scheduler (For example: ReduceLROnPlateau).
:param optimizer: Wrapped optimizer. Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
:type delay_epochs: int after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau) last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type after_scheduler: torch.optim.lr_scheduler the schedule is started from the beginning and the initial lr is set to lr.
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, delay_epochs, after_scheduler, last_epoch=-1): def __init__(self, optimizer, delay_epochs, after_scheduler, last_epoch=-1):
@ -57,16 +56,15 @@ class DelayerScheduler(_LRScheduler):
class WarmupScheduler(_LRScheduler): class WarmupScheduler(_LRScheduler):
""" Starts with a linear warmup lr schedule until it reaches N epochs the applies a scheduler """Starts with a linear warmup lr schedule until it reaches N epochs then applies
the specific scheduler (For example: ReduceLROnPlateau).
:param optimizer: Wrapped optimizer. Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
:type warmup_epochs: int after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau) last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type after_scheduler: torch.optim.lr_scheduler the schedule is started from the beginning and the initial lr is set to lr.
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1): def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1):
@ -97,18 +95,16 @@ class WarmupScheduler(_LRScheduler):
class WarmupDelayerScheduler(_LRScheduler): class WarmupDelayerScheduler(_LRScheduler):
""" Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule until it reaches M epochs the applies a scheduler """Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule
until it reaches M epochs then applies the specific scheduler (For example: ReduceLROnPlateau).
:param optimizer: Wrapped optimizer. Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
:type warmup_epochs: int delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
:type delay_epochs: int last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau) the schedule is started from the beginning and the initial lr is set to lr.
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, warmup_epochs, delay_epochs, after_scheduler, last_epoch=-1): def __init__(self, optimizer, warmup_epochs, delay_epochs, after_scheduler, last_epoch=-1):
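The wrapper schedulers compose with a plain ``torch.optim.lr_scheduler`` instance built on the same optimizer; a sketch (module path assumed):

    import torch
    from torch.optim.lr_scheduler import CosineAnnealingLR
    from colossalai.nn.lr_scheduler.delayed import WarmupScheduler   # assumed module path

    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    after = CosineAnnealingLR(optimizer, T_max=95)                   # takes over once warmup ends
    scheduler = WarmupScheduler(optimizer, warmup_epochs=5, after_scheduler=after)
    for epoch in range(100):
        optimizer.step()        # ... one epoch of training elided ...
        scheduler.step()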

View File

@ -5,16 +5,14 @@ from colossalai.registry import LR_SCHEDULERS
@LR_SCHEDULERS.register_module @LR_SCHEDULERS.register_module
class LinearWarmupLR(_LRScheduler): class LinearWarmupLR(_LRScheduler):
"""Linearly warmup learning rate and then linearly decay """Linearly warmup learning rate and then linearly decay.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int warmup_steps (int, optional): Number of warmup steps, defaults to 0
:param warmup_steps: Number of warmup steps, defaults to 0 last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type warmup_steps: int, optional the schedule is started from the beginning and the initial lr is set to lr.
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, last_epoch: int = -1, **kwargs): def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, last_epoch: int = -1, **kwargs):

View File

@ -13,18 +13,13 @@ class MultiStepLR(_MultiStepLR):
happen simultaneously with other changes to the learning rate from outside happen simultaneously with other changes to the learning rate from outside
this scheduler. When last_epoch=-1, sets initial lr as lr. this scheduler. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
:param milestones: List of epoch indices. Must be increasing, defaults to None gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
:type milestones: List[int], optional last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1 the schedule is started from the beginning and the initial lr is set to lr.
:type gamma: float, optional
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1, last_epoch: int = -1, **kwargs): def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1, last_epoch: int = -1, **kwargs):
@ -33,22 +28,17 @@ class MultiStepLR(_MultiStepLR):
@LR_SCHEDULERS.register_module @LR_SCHEDULERS.register_module
class MultiStepWarmupLR(WarmupScheduler): class MultiStepWarmupLR(WarmupScheduler):
"""Multi-step laerning rate scheduler with warmup. """Multistep learning rate scheduler with warmup.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int warmup_steps (int, optional): Number of warmup steps, defaults to 0.
:param warmup_steps: Number of warmup steps, defaults to 0 milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
:type warmup_steps: int, optional gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
:param milestones: List of epoch indices. Must be increasing, defaults to None num_steps_per_epoch (int, optional): Number of steps per epoch, defaults to -1.
:type milestones: List[int], optional last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1 the schedule is started from the beginning and the initial lr is set to lr.
:type gamma: float, optional
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, milestones: List[int] = None, def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, milestones: List[int] = None,

View File

@ -28,43 +28,41 @@ class OneCycleLR(_OneCycleLR):
claims that "unpublished work has shown even better results by using only two phases". To claims that "unpublished work has shown even better results by using only two phases". To
mimic the behaviour of the original paper instead, set ``three_phase=True``. mimic the behaviour of the original paper instead, set ``three_phase=True``.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int pct_start (float, optional):
:param pct_start: The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3 The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3.
:type pct_start: float, optional anneal_strategy (str, optional): {'cos', 'linear'}, Specifies the annealing strategy:
:param anneal_strategy: {'cos', 'linear'} "cos" for cosine annealing, "linear" for linear annealing, defaults to 'cos'.
Specifies the annealing strategy: "cos" for cosine annealing, "linear" for cycle_momentum (bool, optional): If ``True``, momentum is cycled inversely
linear annealing, defaults to 'cos' to learning rate between 'base_momentum' and 'max_momentum', defaults to True.
:type anneal_strategy: str, optional base_momentum (float, optional): Lower momentum boundaries in the cycle for each parameter group.
:param cycle_momentum: If ``True``, momentum is cycled inversely Note that momentum is cycled inversely to learning rate; at the peak of a cycle, momentum is
to learning rate between 'base_momentum' and 'max_momentum', defaults to True 'base_momentum' and learning rate is 'max_lr', defaults to 0.85.
:type cycle_momentum: bool, optional max_momentum (float, optional): Upper momentum boundaries in the cycle for each parameter group.
:param base_momentum: Lower momentum boundaries in the cycle Functionally, it defines the cycle amplitude (max_momentum - base_momentum).
for each parameter group. Note that momentum is cycled inversely Note that momentum is cycled inversely to learning rate; at the start of a cycle, momentum is 'max_momentum'
to learning rate; at the peak of a cycle, momentum is and learning rate is 'base_lr', defaults to 0.95.
'base_momentum' and learning rate is 'max_lr', defaults to 0.85 div_factor (float, optional): Determines the initial learning rate via
:type base_momentum: float, optional initial_lr = max_lr/div_factor, defaults to 25.0.
:param max_momentum: Upper momentum boundaries in the cycle final_div_factor (float, optional): Determines the minimum learning rate via
for each parameter group. Functionally, min_lr = initial_lr/final_div_factor, defaults to 10000.0.
it defines the cycle amplitude (max_momentum - base_momentum). last_epoch (int, optional): The index of the last batch. This parameter is used when resuming a training job.
Note that momentum is cycled inversely Since `step()` should be invoked after each batch instead of after each epoch, this number represents
to learning rate; at the start of a cycle, momentum is 'max_momentum' the total number of *batches* computed, not the total number of epochs computed.
and learning rate is 'base_lr', defaults to 0.95 When last_epoch=-1, the schedule is started from the beginning, defaults to -1
:type max_momentum: float, optional
:param div_factor: Determines the initial learning rate via The ``kwargs`` for initializing torch.optim.lr_scheduler.OneCycleLR should include parameters below:
initial_lr = max_lr/div_factor, defaults to 25.0 ::
:type div_factor: float, optional
:param final_div_factor: Determines the minimum learning rate via epochs (int, optional, default=None)
min_lr = initial_lr/final_div_factor, defaults to 10000.0 steps_per_epoch (int, optional, default=None)
:type final_div_factor: float, optional three_phase (bool, optional, default=False)
:param last_epoch: The index of the last batch. This parameter is used when verbose (bool, optional, default=False)
resuming a training job. Since `step()` should be invoked after each
batch instead of after each epoch, this number represents the total More details about kwargs could be found in
number of *batches* computed, not the total number of epochs computed. `OneCycleLR <https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.OneCycleLR.html#torch.optim.lr_scheduler.OneCycleLR>`_.
When last_epoch=-1, the schedule is started from the beginning, defaults to -1
:type last_epoch: int, optional
.. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates: .. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
https://arxiv.org/abs/1708.07120 https://arxiv.org/abs/1708.07120
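A minimal sketch of driving this scheduler per batch, using only the parameters documented above; the import path and the optimizer are assumptions, and the extra ``kwargs`` (epochs, steps_per_epoch, three_phase, verbose) are left at their defaults.
>>> import torch
>>> from colossalai.nn.lr_scheduler import OneCycleLR   # assumed import path
>>> model = torch.nn.Linear(16, 16)
>>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
>>> scheduler = OneCycleLR(optimizer, total_steps=1000, pct_start=0.3,
...                        anneal_strategy='cos', div_factor=25.0, final_div_factor=10000.0)
>>> for _ in range(1000):
...     optimizer.step()
...     scheduler.step()      # called once per batch, not once per epoch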
@ -8,16 +8,13 @@ from .delayed import WarmupScheduler
class PolynomialLR(_LRScheduler): class PolynomialLR(_LRScheduler):
"""Polynomial learning rate scheduler. """Polynomial learning rate scheduler.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
:param end_lr: Minimum learning rate, defaults to 0.0001 power (float, optional): The power of polynomial, defaults to 1.0.
:type end_lr: float, optional last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:param power: The power of polynomial, defaults to 1.0 the schedule is started from the beginning and the initial lr is set to lr.
:type power: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, end_lr: float = 0.0001, power: float = 1.0, last_epoch: int = -1, def __init__(self, optimizer, total_steps: int, end_lr: float = 0.0001, power: float = 1.0, last_epoch: int = -1,
@ -44,18 +41,14 @@ class PolynomialLR(_LRScheduler):
class PolynomialWarmupLR(WarmupScheduler): class PolynomialWarmupLR(WarmupScheduler):
"""Polynomial learning rate scheduler with warmup. """Polynomial learning rate scheduler with warmup.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int warmup_steps (int, optional): Number of warmup steps, defaults to 0.
:param warmup_steps: Number of warmup steps, defaults to 0 end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
:type warmup_steps: int, optional power (float, optional): The power of polynomial, defaults to 1.0.
:param end_lr: Minimum learning rate, defaults to 0.0001 last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type end_lr: float, optional the schedule is started from the beginning and the initial lr is set to lr.
:param power: The power of polynomial, defaults to 1.0
:type power: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, end_lr: float = 0.0001, power: float = 1.0, def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, end_lr: float = 0.0001, power: float = 1.0,
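A brief construction sketch reusing the optimizer from the scheduler examples above (the class is assumed to be importable from ``colossalai.nn.lr_scheduler``):
>>> scheduler = PolynomialWarmupLR(optimizer, total_steps=1000, warmup_steps=100,
...                                end_lr=1e-5, power=2.0)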
@ -11,16 +11,13 @@ class LambdaLR(_LambdaLR):
"""Sets the learning rate of each parameter group to the initial lr """Sets the learning rate of each parameter group to the initial lr
times a given function. When last_epoch=-1, sets initial lr as lr. times a given function. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
:param lr_lambda: A function which computes a multiplicative factor given an integer parameter epoch, or a list of such functions,
factor given an integer parameter epoch, or a list of such one for each group in optimizer.param_groups, defaults to None.
functions, one for each group in optimizer.param_groups, defaults to None last_epoch (int, optional): The index of last epoch, defaults to -1.
:type lr_lambda: function or list, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None: def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
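For instance, a hypothetical sketch reusing the optimizer from the earlier scheduler examples; the lambda below simply halves the factor at each step.
>>> scheduler = LambdaLR(optimizer, total_steps=1000,
...                      lr_lambda=lambda epoch: 0.5 ** epoch)
>>> scheduler.step()   # lr becomes base_lr * 0.5 ** 1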
@ -30,18 +27,15 @@ class LambdaLR(_LambdaLR):
@LR_SCHEDULERS.register_module @LR_SCHEDULERS.register_module
class MultiplicativeLR(_MultiplicativeLR): class MultiplicativeLR(_MultiplicativeLR):
"""Multiply the learning rate of each parameter group by the factor given """Multiply the learning rate of each parameter group by the factor given
in the specified function. When last_epoch=-1, sets initial lr as lr in the specified function. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
:param lr_lambda: A function which computes a multiplicative factor given an integer parameter epoch, or a list of such functions,
factor given an integer parameter epoch, or a list of such one for each group in optimizer.param_groups, defaults to None.
functions, one for each group in optimizer.param_groups, defaults to None last_epoch (int, optional): The index of last epoch, defaults to -1.
:type lr_lambda: function or list, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None: def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
@ -53,18 +47,14 @@ class StepLR(_StepLR):
"""Decays the learning rate of each parameter group by gamma every """Decays the learning rate of each parameter group by gamma every
step_size epochs. Notice that such decay can happen simultaneously with step_size epochs. Notice that such decay can happen simultaneously with
other changes to the learning rate from outside this scheduler. When other changes to the learning rate from outside this scheduler. When
last_epoch=-1, sets initial lr as lr last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int step_size (int, optional): Period of learning rate decay, defaults to 1.
:param step_size: Period of learning rate decay, defaults to 1 gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
:type step_size: int, optional last_epoch (int, optional): The index of last epoch, defaults to -1.
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, last_epoch: int = -1) -> None: def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, last_epoch: int = -1) -> None:
@ -77,14 +67,11 @@ class ExponentialLR(_ExponentialLR):
"""Decays the learning rate of each parameter group by gamma every epoch. """Decays the learning rate of each parameter group by gamma every epoch.
When last_epoch=-1, sets initial lr as lr When last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 1.0.
:param gamma: Multiplicative factor of learning rate decay, defaults to 1.0 last_epoch (int, optional): The index of last epoch, defaults to -1.
:type gamma: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps, gamma: float = 1.0, def __init__(self, optimizer, total_steps, gamma: float = 1.0,
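As a quick sketch (reusing the optimizer from the earlier scheduler examples; a comparable call works for ``ExponentialLR`` with just ``gamma``):
>>> scheduler = StepLR(optimizer, total_steps=1000, step_size=30, gamma=0.1)
>>> scheduler.step()   # after 30 such steps the lr is multiplied by 0.1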
@ -14,8 +14,12 @@ class Accuracy2D(nn.Module):
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels. """Calculate the accuracy of predicted labels.
:param logits: Predicted labels Args:
:param targets: True labels from data logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
""" """
with torch.no_grad(): with torch.no_grad():
targets = split_tensor_2d(targets) targets = split_tensor_2d(targets)
@ -14,8 +14,12 @@ class Accuracy2p5D(nn.Module):
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels. """Calculate the accuracy of predicted labels.
:param logits: Predicted labels Args:
:param targets: True labels from data logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
""" """
with torch.no_grad(): with torch.no_grad():
targets = split_tensor_2p5d(targets) targets = split_tensor_2p5d(targets)
@ -18,8 +18,12 @@ class Accuracy3D(nn.Module):
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels. """Calculate the accuracy of predicted labels.
:param logits: Predicted labels Args:
:param targets: True labels from data logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
""" """
with torch.no_grad(): with torch.no_grad():
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
@ -9,11 +9,10 @@ class Registry:
"""This is a registry class used to register classes and modules so that a universal """This is a registry class used to register classes and modules so that a universal
object builder can be enabled. object builder can be enabled.
:param name: The name of the registry Args:
:type name: str name (str): The name of the registry.
:param third_party_library: List of third party libraries which are used in the third_party_library (list, optional):
initialization of the register module List of third party libraries which are used in the initialization of the register module.
:type third_party_library: list, optional
""" """
def __init__(self, name: str, third_party_library: List[ModuleType] = None): def __init__(self, name: str, third_party_library: List[ModuleType] = None):
@ -28,12 +27,12 @@ class Registry:
def register_module(self, module_class): def register_module(self, module_class):
"""Registers a module represented in `module_class`. """Registers a module represented in `module_class`.
:param module_class: The module to be registered Args:
:type module_class: class module_class (class): The module to be registered.
:raises AssertionError: Raises an AssertionError if the module has already been Returns:
registered before class: The module to be registered, so that it can be used normally via importing.
:return: The module to be registered, so as to use it normally if via importing Raises:
:rtype: class AssertionError: Raises an AssertionError if the module has already been registered before.
""" """
module_name = module_class.__name__ module_name = module_class.__name__
assert module_name not in self._registry assert module_name not in self._registry
@ -46,12 +45,13 @@ class Registry:
"""Retrieves a module with name `module_name` and returns the module if it has """Retrieves a module with name `module_name` and returns the module if it has
already been registered before. already been registered before.
:param module_name: The name of the module to be retrieved Args:
:type module_name: str module_name (str): The name of the module to be retrieved.
:raises NameError: Raises a NameError if the module to be retrieved has neither been Returns:
registered directly nor as third party modules before :class:`object`: The retrieved module or None.
:return: The retrieved module or None Raises:
:rtype: :class:`object` NameError: Raises a NameError if the module to be retrieved has neither been
registered directly nor as third party modules before.
""" """
if module_name in self._registry: if module_name in self._registry:
return self._registry[module_name] return self._registry[module_name]
@ -65,11 +65,11 @@ class Registry:
"""Searches for a module with name `module_name` and returns a boolean value indicating """Searches for a module with name `module_name` and returns a boolean value indicating
whether the module has been registered directly or as third party modules before. whether the module has been registered directly or as third party modules before.
:param module_name: The name of the module to be searched for Args:
:type module_name: str module_name (str): The name of the module to be searched for.
:return: A boolean value indicating whether the module has been registered directly or Returns:
as third party modules before bool: A boolean value indicating whether the module has been registered directly or
:rtype: bool as third party modules before.
""" """
found_flag = module_name in self._registry found_flag = module_name in self._registry
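A hedged illustration of the register/retrieve cycle described above. The registry name and the ``TinyNet`` class are made up, and the retrieval method is assumed to be ``get_module`` (only ``register_module`` is named explicitly in this diff).
>>> from colossalai.registry import Registry    # assumed import path
>>> MY_MODELS = Registry('my_models')
>>> @MY_MODELS.register_module
... class TinyNet:
...     pass
>>> MY_MODELS.get_module('TinyNet') is TinyNet  # assumed retrieval method name
True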
@ -17,18 +17,46 @@ from colossalai.trainer.hooks import BaseHook
class Trainer: class Trainer:
"""This a class tending for easy deployments of users' training and evaluation instead of r"""This is a class tending for easy deployments of users' training and evaluation instead of
writing their own scripts. It is similar with ``ignite.engine`` and ``keras.engine``, but is writing their own scripts. It is similar with ``ignite.engine`` and ``keras.engine``, but is
called `Trainer`. called `Trainer`.
:param engine: Engine responsible for the process function Args:
:type engine: :class:`Engine` engine (:class:`Engine`): Engine responsible for the process function.
:param schedule: Schedule responsible for forward and backward steps schedule (:class:`BaseSchedule`, optional): Schedule responsible for forward and backward steps.
:type schedule: :class:`BaseSchedule`, optional timer (:class:`MultiTimer`, optional): Timer used to monitor the whole training.
:param timer: Timer used to monitor the whole training logger (:class:`colossalai.logging.DistributedLogger`, optional): Logger used to record the whole training log.
:type timer: :class:`MultiTimer`, optional
:param logger: Logger used to record the whole training Note:
:type logger: :class:`colossalai.logging.DistributedLogger`, optional when `schedule` is None, the ``NonPipelineSchedule`` will be used. If you would like to use pipeline,
you should choose ``PipelineSchedule`` or ``InterleavedPipelineSchedule`` for the `schedule`.
Examples:
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
>>> model = ...
>>> criterion = ...
>>> optimizer = ...
>>> train_dataloader = ...
>>> # Initialize your engine, train_dataloader, test_dataloader, lr_scheduler
>>> engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> # Beginning training progress
>>> timer = ...
>>> logger = ...
>>> trainer = Trainer(engine=engine, logger=logger, schedule=schedule, timer=timer)
>>> # add hooks you would like to use here.
>>> hook_list = []
>>> trainer.fit(
>>> train_dataloader=train_dataloader,
>>> epochs=gpc.config.NUM_EPOCHS,
>>> test_interval=1,
>>> hooks=hook_list,
>>> display_progress=True,
>>> return_output_label=False
>>> )
More examples and details could be found in
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_
and `ColossalAI-Examples <https://github.com/hpcaitech/ColossalAI-Examples/tree/main>`_.
""" """
def __init__( def __init__(
self, self,
@ -108,20 +136,19 @@ class Trainer:
def _set_current_step(self, epoch: int): def _set_current_step(self, epoch: int):
"""Sets current step number. """Sets current step number.
:param epoch: Step number to be set Args:
:type epoch: int epoch (int): Step number to be set.
""" """
self._cur_step = epoch * self._steps_per_epoch self._cur_step = epoch * self._steps_per_epoch
def _call_timer(self, action: str, item: str, *args, **kwargs) -> None: def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
"""Call timer funciton with a given timer name. """Call timer funciton with a given timer name.
:param action: Function to be called on timer Args:
:type action: str action (str): Function to be called on timer.
:param item: Name of the timer item (str): Name of the timer.
:type item: str args (list): args used for action function.
:param args: args used for action function kwargs (dict): kwargs used for action function.
:param kwargs: kwargs used for action function
""" """
if self._timer is not None: if self._timer is not None:
@ -134,10 +161,9 @@ class Trainer:
def _call_hooks(self, func, output=None): def _call_hooks(self, func, output=None):
"""Calls specific hooks in the current time point. """Calls specific hooks in the current time point.
:param func: A string represents the time point Args:
:param output: Output of the model after running a iteration or None in any other time points func (str): A string representing the time point.
:type func: str output (Any, optional): Output of the model after running an iteration or None at any other time point.
:type output: optional
""" """
# Only after iter hook will receive output # Only after iter hook will receive output
for hook in self.hooks: for hook in self.hooks:
@ -273,25 +299,17 @@ class Trainer:
display_progress: bool = False, display_progress: bool = False,
return_output_label: bool = True, return_output_label: bool = True,
): ):
"""Trains the model to fit training data. r"""Trains the model to fit training data.
:param train_dataloader: DataLoader in training Args:
:param epochs: Maximum number of epoches train_dataloader (:class:`torch.utils.data.DataLoader`): DataLoader for training.
:param max_steps: Maximum number of running iterations epochs (int): Maximum number of epochs.
:param test_dataloader: DataLoader in testing max_steps (int, optional): Maximum number of running iterations.
:param test_interval: Interval of testing test_dataloader (:class:`torch.utils.data.DataLoader`, optional): DataLoader for validation.
:param hooks: A list of hooks used in training test_interval (int, optional): Interval of validation.
:param display_progress: If True, the training progress will be printed hooks (list[`BaseHook <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/trainer/hooks>`_],
:param return_output_label: If True, the output of model and the label will be returned optional): A list of hooks used in training.
display_progress (bool, optional): If True, a progress bar will be displayed.
return_output_label (bool, optional): If True, the output of model and the label will be returned, defaults to True.
:type train_dataloader: DataLoader
:type epochs: int
:type max_steps: int, optional
:type test_dataloader: DataLoader, optional
:type test_interval: int, optional
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool, optional
""" """
# set epochs and steps, consider gradient accumulation # set epochs and steps, consider gradient accumulation
@ -374,15 +392,12 @@ class Trainer:
): ):
"""Evaluates the model with testing data. """Evaluates the model with testing data.
:param test_dataloader: DataLoader in testing Args:
:param hooks: A list of hooks used in evaluation test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
:param display_progress: If True, the evaluation progress will be printed hooks (list, optional): A list of hooks used in evaluation. Defaults to None.
:param return_output_label: If True, the output of model and the label will be returned display_progress (bool, optional): If True, the evaluation progress will be printed. Defaults to False.
return_output_label (bool, optional): If True, the output of model and the label
:type test_dataloader: DataLoader will be returned. Defaults to True.
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool
""" """
# set display # set display
display_progress = self._should_display_progress(display_progress) display_progress = self._should_display_progress(display_progress)
@ -418,10 +433,11 @@ class Trainer:
def predict(self, data: Union[Tensor, List[Tensor]]): def predict(self, data: Union[Tensor, List[Tensor]]):
"""Uses trained model to make a prediction for a tensor or a tensor list. """Uses trained model to make a prediction for a tensor or a tensor list.
:param data: Data as the input Args:
:type data: Union[Tensor, List[Tensor] data (Union[:class:`torch.tensor`, List[:class:`torch.tensor`]]): Data as the input.
:return: The output of model as the prediction
:rtype: Tensor Returns:
:class:`torch.tensor`: The output of model as the prediction
""" """
# predict without labels # predict without labels
if isinstance(data, (list, tuple)): if isinstance(data, (list, tuple)):
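A hypothetical continuation of the ``Examples`` block in the docstring above, showing evaluation and single-sample prediction (the tensor shape is made up):
>>> trainer.evaluate(test_dataloader=test_dataloader,
...                  hooks=hook_list,
...                  display_progress=True)
>>> sample = torch.rand(1, 3, 224, 224)
>>> prediction = trainer.predict(sample)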
@ -40,14 +40,11 @@ class BaseHook(ABC):
def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor): def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
"""Actions after running a training iteration. """Actions after running a training iteration.
:param trainer: Trainer which is using this hook Args:
:type trainer: :class:`Trainer` trainer (:class:`Trainer`): Trainer which is using this hook.
:param output: Output of the model output (:class:`torch.Tensor`): Output of the model.
:type output: torch.Tensor label (:class:`torch.Tensor`): Labels of the input data.
:param label: Labels of the input data loss (:class:`torch.Tensor`): Loss between the output and input data.
:type label: torch.Tensor
:param loss: Loss between the output and input data
:type loss: torch.Tensor
""" """
pass pass
@ -89,24 +86,21 @@ class BaseHook(ABC):
def after_test_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor): def after_test_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
"""Actions after running a testing iteration. """Actions after running a testing iteration.
:param trainer: Trainer which is using this hook Args:
:type trainer: :class:`Trainer` trainer (:class:`Trainer`): Trainer which is using this hook.
:param output: Output of the model output (:class:`torch.Tensor`): Output of the model.
:type output: Tensor label (:class:`torch.Tensor`): Labels of the input data.
:param label: Labels of the input data loss (:class:`torch.Tensor`): Loss between the output and input data.
:type label: Tensor
:param loss: Loss between the output and input data
:type loss: Tensor
""" """
pass pass
def init_runner_states(self, trainer, key, val): def init_runner_states(self, trainer, key, val):
"""Initializes trainer's state. """Initializes trainer's state.
:param trainer: Trainer which is using this hook Args:
:type trainer: :class:`Trainer` trainer (:class:`Trainer`): Trainer which is using this hook
:param key: Key of reseting state key: Key of state to be reset
:param val: Value of reseting state val: Value of state to be reset
""" """
if key not in trainer.states: if key not in trainer.states:
trainer.states[key] = val trainer.states[key] = val
@ -16,14 +16,13 @@ from ._lr_scheduler_hook import LRSchedulerHook
class SaveCheckpointHook(BaseHook): class SaveCheckpointHook(BaseHook):
"""Saves the model by interval in training process. """Saves the model by interval in training process.
:param interval: Saving interval, defaults to 1 Args:
:type interval: int, optional interval (int, optional): Saving interval, defaults to 1.
:param checkpoint_dir: Directory of saving checkpoint, defaults to None checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
:type checkpoint_dir: str, optional suffix (str, optional): Saving suffix of the file, defaults to ''.
:param suffix: Saving suffix of the file, defaults to '' priority (int, optional): Priority in the printing, hooks with small priority will be printed in front
:type suffix: str, optional defaults to 10. If different hooks share same priority, the order of printing would
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 depend on the hooks order in the hook list.
:type priority: int, optional
""" """
def __init__(self, def __init__(self,
@ -71,18 +70,17 @@ class SaveCheckpointHook(BaseHook):
class LoadCheckpointHook(BaseHook): class LoadCheckpointHook(BaseHook):
"""Loads the model before training process. """Loads the model before training process.
:param checkpoint_dir: Directory of saving checkpoint, defaults to None Args:
:type checkpoint_dir: str, optional checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
:param epoch: Epoch number to be set, defaults to -1 epoch (str, optional): Loading checkpoint of setting epoch numbers, defaults to -1.
:type epoch: str, optional Epoch equals to -1 means choosing the latest checkpoint.
:param finetune: Whether allows to load a part of the model, defaults to False finetune (bool, optional): Whether allows to load a part of the model, defaults to False.
:type finetune: bool, optional strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint
:param strict: Whether loads a model that has the same shape of parameters, defaults to False match the names of parameters and buffers in model, defaults to False.
:type strict: bool, optional suffix (str, optional): Suffix of checkpoint file path, defaults to ''.
:param suffix: Suffic, defaults to '' priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:type suffix: str, optional defaults to 0. If different hooks share same priority, the order of printing would
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0 depend on the hooks order in the hook list.
:type priority: int, optional
""" """
def __init__(self, def __init__(self,
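A sketch of how these two hooks might be combined in a hook list; the ``colossalai.trainer.hooks`` namespace import, the directory, and the interval values are assumptions.
>>> from colossalai.trainer import hooks   # assumed namespace import
>>> hook_list = [
...     hooks.LoadCheckpointHook(checkpoint_dir='./ckpt', epoch=-1, strict=False),
...     hooks.SaveCheckpointHook(interval=1, checkpoint_dir='./ckpt'),
... ]
>>> trainer.fit(train_dataloader=train_dataloader, epochs=10, hooks=hook_list)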
@ -25,13 +25,14 @@ def _format_number(val, prec=5):
class LogByEpochHook(BaseHook): class LogByEpochHook(BaseHook):
"""Hook to log by epoch """Hook to log by epoch.
:param logger: Logger for the log Args:
:param interval: Recording interval, defaults to 1 logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
:type interval: int, optional interval (int, optional): Interval of printing log information, defaults to 1.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1 priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:type priority: int, optional defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, def __init__(self,
@ -48,10 +49,12 @@ class LogByEpochHook(BaseHook):
@HOOKS.register_module @HOOKS.register_module
class LogMetricByStepHook(BaseHook): class LogMetricByStepHook(BaseHook):
"""Hook to log metric by step """Hook to log metric by step.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 Args:
:type priority: int, optional priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, priority: int = 10): def __init__(self, priority: int = 10):
@ -74,11 +77,12 @@ class LogMetricByStepHook(BaseHook):
class LogMetricByEpochHook(LogByEpochHook): class LogMetricByEpochHook(LogByEpochHook):
"""Specialized hook to record the metric to log. """Specialized hook to record the metric to log.
:param logger: Logger for the log Args:
:param interval: Recording interval, defaults to 1 logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
:type interval: int, optional interval (int, optional): Interval of printing log information, defaults to 1.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:type priority: int, optional defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, def __init__(self,
@ -116,14 +120,14 @@ class LogMetricByEpochHook(LogByEpochHook):
class TensorboardHook(BaseHook): class TensorboardHook(BaseHook):
"""Specialized hook to record the metric to Tensorboard. """Specialized hook to record the metric to Tensorboard.
:param log_dir: Directory of log Args:
:type log_dir: str log_dir (str): Directory of log.
:param ranks: Ranks of processors ranks (list): Ranks of processors.
:type ranks: typing.List parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer,
:param parallel_mode: Parallel mode, defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL.
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`, optional priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 defaults to 10. If different hooks share same priority, the order of printing would
:type priority: int, optional depend on the hooks order in the hook list.
""" """
def __init__(self, def __init__(self,
@ -200,18 +204,15 @@ class TensorboardHook(BaseHook):
class LogTimingByEpochHook(LogByEpochHook): class LogTimingByEpochHook(LogByEpochHook):
"""Specialized hook to write timing record to log. """Specialized hook to write timing record to log.
:param timer: Timer for the hook Args:
:type timer: :class:`colossalai.utils.MultiTimer` timer (:class:`colossalai.utils.MultiTimer`): Timer for the hook.
:param logger: Logger for the log logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
:type logger: :class:`colossalai.logging.DistributedLogger` interval (int, optional): Interval of printing log information, defaults to 1.
:param interval: Recording interval, defaults to 1 priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:type interval: int, optional defaults to 10. If different hooks share same priority, the order of printing would
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 depend on the hooks order in the hook list.
:type priority: int, optional log_eval (bool, optional): Whether writes in evaluation, defaults to True.
:param log_eval: Whether writes in evaluation, defaults to True ignore_num_train_steps (int, optional): Number of training steps to ignore, defaults to 0.
:type log_eval: bool, optional
:param ignore_num_train_steps: Number of training steps to ignore, defaults to 0
:type ignore_num_train_steps: int, optional
""" """
def __init__(self, def __init__(self,
@ -270,14 +271,13 @@ class LogTimingByEpochHook(LogByEpochHook):
class LogMemoryByEpochHook(LogByEpochHook): class LogMemoryByEpochHook(LogByEpochHook):
"""Specialized Hook to write memory usage record to log. """Specialized Hook to write memory usage record to log.
:param logger: Logger for the log Args:
:type logger: colossalai.logging.DistributedLogger logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
:param interval: Recording interval, defaults to 1 interval (int, optional): Interval of printing log information, defaults to 1.
:type interval: int, optional priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 defaults to 1. If different hooks share same priority, the order of printing would
:type priority: int, optional depend on the hooks order in the hook list.
:param log_eval: Whether writes in evaluation, defaults to True log_eval (bool, optional): Whether writes in evaluation, defaults to True.
:type log_eval: bool, optional
""" """
def __init__(self, def __init__(self,
@ -6,15 +6,17 @@ from ._metric_hook import LearningRateMetric, MetricHook
@HOOKS.register_module @HOOKS.register_module
class LRSchedulerHook(MetricHook): class LRSchedulerHook(MetricHook):
"""Build LR scheduler r"""Build LR scheduler for trainer.
:param lr_scheduler: LR scheduler Args:
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch lr_scheduler (:class:`colossalai.nn.lr_scheduler`): The specific LR scheduler
:type by_epoch: bool in range of ``colossalai.nn.lr_scheduler``, more details about ``lr_scheduler`` could be found in
:param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True` `lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_.
:type store_lr_in_state: bool, optional by_epoch (bool): If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1 store_lr_in_state (bool, optional): If `True`, store the learning rate in each state, defaults to `True`.
:type priority: int, optional priority (int, optional): Priority in the printing, hooks with small priority will be printed in front
defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__( def __init__(
self, self,
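For instance, a hedged sketch pairing this hook with one of the schedulers documented earlier in this commit; the import paths and the optimizer/hook_list names carried over from the Trainer example are assumptions.
>>> from colossalai.nn.lr_scheduler import MultiStepLR   # assumed import path
>>> from colossalai.trainer import hooks                 # assumed namespace import
>>> lr_scheduler = MultiStepLR(optimizer, total_steps=1000, milestones=[400, 800])
>>> hook_list.append(hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False))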
@ -17,13 +17,13 @@ from ._base_hook import BaseHook
class Metric(ABC): class Metric(ABC):
"""A basic class of metric collectors. It collects a specific """A basic class of metric collectors. It collects a specific
metric during training or evaluation and it's always used with metric during training or evaluation and would always be used with
:class:`MetricHook` to help it update its states and show the :class:`MetricHook` to help it update its states and show the
metric. So please use corresponding hook class to make the metric metric. So please use the corresponding hook class to make the metric
collector works. collector work.
:param epoch_only: Whether the metric only read for the full epoch Args:
:type epoch_only: bool epoch_only (bool): Whether the metric is only read for the full epoch.
""" """
def __init__(self, epoch_only: bool): def __init__(self, epoch_only: bool):
@ -80,8 +80,8 @@ class Metric(ABC):
class LossMetric(Metric): class LossMetric(Metric):
"""A metric collector for loss. """A metric collector for loss.
:param epoch_only: Whether the metric only read for the full epoch Args:
:type epoch_only: bool epoch_only (bool): Whether the metric is only read for the full epoch.
""" """
def __init__(self, epoch_only): def __init__(self, epoch_only):
@ -101,7 +101,8 @@ class LossMetric(Metric):
"""Updates :attr:`last_step_loss` and :attr:`accum_loss` with current loss. """Updates :attr:`last_step_loss` and :attr:`accum_loss` with current loss.
It expects the output has loss. It expects the output has loss.
:param loss: Current loss of the output Args:
loss (:class:`torch.tensor`): Current loss of the output.
""" """
# expect output to be logits, label and loss # expect output to be logits, label and loss
loss_ = loss.detach() loss_ = loss.detach()
@ -132,10 +133,9 @@ class LossMetric(Metric):
class LearningRateMetric(Metric): class LearningRateMetric(Metric):
"""A metric collector for learning rate. """A metric collector for learning rate.
:param epoch_only: Whether the metric only read for the full epoch Args:
:type epoch_only: bool epoch_only (bool): Whether the metric is only read for the full epoch.
:param initial_lr: Initial learning rate, defaults to 0.0 initial_lr (float, optional): Initial learning rate, defaults to 0.0.
:type initial_lr: float, optional
""" """
def __init__(self, epoch_only: bool, initial_lr: float = 0.): def __init__(self, epoch_only: bool, initial_lr: float = 0.):
@ -163,10 +163,9 @@ class AccuracyMetric(Metric):
"""A metric collector for accuracy. It only works for classification """A metric collector for accuracy. It only works for classification
tasks. tasks.
:param epoch_only: Whether the metric only read for the full epoch Args:
:type epoch_only: bool epoch_only (bool): Whether the metric is only read for the full epoch.
:param accuracy_func: Accuracy function for the classification task accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
:type accuracy_func: :class:`typing.Callable`
""" """
def __init__(self, epoch_only: bool, accuracy_func: Callable): def __init__(self, epoch_only: bool, accuracy_func: Callable):
@ -187,9 +186,10 @@ class AccuracyMetric(Metric):
"""Updates last step accuracy and accumulated accuracy with current logits """Updates last step accuracy and accumulated accuracy with current logits
and labels. It expects the output has logits and labels. and labels. It expects the output has logits and labels.
:param logits: The logits output of the model Args:
:param targets: Real labels of the dataset logits (:class:`torch.tensor`): The logits output of the model.
:param batch_size: Batch size of the task targets (:class:`torch.tensor`): Real labels of the dataset.
batch_size (int): Batch size of the task.
""" """
if isinstance(logits, (list, tuple)): if isinstance(logits, (list, tuple)):
logits = logits[0] logits = logits[0]
@ -224,8 +224,10 @@ class MetricHook(BaseHook):
update their states. Others are used to display and update their states. Others are used to display and
record the metric. record the metric.
:param priority: Priority in the printing, hooks with small priority will be printed in front Args:
:type priority: int priority (int): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__( def __init__(
@ -244,8 +246,10 @@ class MetricHook(BaseHook):
class LossHook(MetricHook): class LossHook(MetricHook):
"""Specialized hook class for :class:`Loss`. """Specialized hook class for :class:`Loss`.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0 Args:
:type priority: int, optional priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, priority: int = 0): def __init__(self, priority: int = 0):
@ -283,10 +287,11 @@ class LossHook(MetricHook):
class AccuracyHook(MetricHook): class AccuracyHook(MetricHook):
"""Specialized hook class for :class:`Accuracy`. """Specialized hook class for :class:`Accuracy`.
:param accuracy_func: Priority in the printing, hooks with small priority will be printed in front Args:
:type accuracy_func: typing.Callable accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0 priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:type priority: int, optional defaults to 0. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, accuracy_func: Callable, priority: int = 0): def __init__(self, accuracy_func: Callable, priority: int = 0):
@ -314,8 +319,8 @@ class AccuracyHook(MetricHook):
class ThroughputMetric(Metric): class ThroughputMetric(Metric):
"""Metric for :class:`Throughput`. """Metric for :class:`Throughput`.
:param epoch_only: epoch only Args:
:type epoch_only: bool epoch_only (bool): Whether the metric is only read for the full epoch.
""" """
def __init__(self, epoch_only: bool, ignored_steps: int = 0): def __init__(self, epoch_only: bool, ignored_steps: int = 0):
super().__init__(epoch_only=epoch_only) super().__init__(epoch_only=epoch_only)
@ -360,10 +365,13 @@ class ThroughputMetric(Metric):
@HOOKS.register_module @HOOKS.register_module
class ThroughputHook(MetricHook): class ThroughputHook(MetricHook):
"""Specialized hook class for :class:`Throughput`. """Specialized hook class for :class:`Throughput`. Hook to measure execution throughput (samples/sec).
:param priority: priority of throughput hook, defaults to 10 Args:
:type priority: int, optional ignored_steps (int, optional): the number of initial training steps to ignore.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, ignored_steps: int = 0, priority: int = 10): def __init__(self, ignored_steps: int = 0, priority: int = 10):
super().__init__(priority) super().__init__(priority)
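A rough sketch of a metric-oriented hook list built from the hooks above; ``my_accuracy`` stands for any user-provided accuracy callable and ``logger`` for a previously created DistributedLogger, and both are placeholders.
>>> hook_list = [
...     hooks.LossHook(),
...     hooks.AccuracyHook(accuracy_func=my_accuracy),   # my_accuracy is a placeholder callable
...     hooks.ThroughputHook(ignored_steps=5),
...     hooks.LogMetricByEpochHook(logger=logger),
... ]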
@ -114,10 +114,13 @@ class CheckpointFunction(torch.autograd.Function):
def checkpoint(function, activation_offload ,*args): def checkpoint(function, activation_offload ,*args):
"""Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint """Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint.
:param function: Describe the forward pass function. It should know how to handle the input tuples. Args:
:param args: Tuple containing the parameters of the function function: Describe the forward pass function. It should know how to handle the input tuples.
:return: Output of running function with provided args args (list): Tuple containing the parameters of the function
Returns:
Output of running function with provided args.
""" """
return CheckpointFunction.apply(function, activation_offload, *args) return CheckpointFunction.apply(function, activation_offload, *args)
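A minimal sketch of wrapping a sub-module with this helper, assuming it is importable from ``colossalai.utils``; ``activation_offload=False`` keeps activations on the device, and they are recomputed during the backward pass.
>>> import torch
>>> from colossalai.utils import checkpoint        # assumed import path
>>> block = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU())
>>> x = torch.rand(8, 64, requires_grad=True)
>>> out = checkpoint(block, False, x)              # forward without storing intermediate activations
>>> out.sum().backward()                           # block is re-run here to rebuild activations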
@ -50,17 +50,17 @@ def _get_standard_checkpoint_filename(epoch: int, suffix: str = ''):
def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''): def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''):
"""This is a function to generate the checkpoint path from the (checkpoint_dir, epoch, suffix, gpu_parallel_rank) tuple. """This is a function to generate the checkpoint path from the tuple
(checkpoint_dir, epoch, suffix, gpu_parallel_rank).
This is useful during generation and recuperation of the checkpoint. This is useful during generation and recuperation of the checkpoint.
:param checkpoint_dir: Set up a directory for saving checkpoints Args:
:type checkpoint_dir: str checkpoint_dir (str): Set up a directory for saving checkpoints.
:param epoch: Epoch number (indicate how many epochs have you trained this model) epoch (int): Epoch number (indicate how many epochs have you trained this model).
:type epoch: int suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
:type suffix: str, optional Returns:
:return: Checkpoint path to be generated str: The checkpoint path to be generated.
:rtype: path
""" """
ckpt_filename = _get_standard_checkpoint_filename(epoch, suffix) ckpt_filename = _get_standard_checkpoint_filename(epoch, suffix)
return os.path.join(checkpoint_dir, ckpt_filename) return os.path.join(checkpoint_dir, ckpt_filename)
@ -74,12 +74,13 @@ def _ensure_directory_exists(filename: str):
def get_latest_checkpoint_pattern(suffix: str = ''): def get_latest_checkpoint_pattern(suffix: str = ''):
"""Generate Regular expression of latest checkpoint's pattern """Generate Regular expression of the latest checkpoint's pattern.
:param suffix: Additional notation to specify the model or checkpoint, defaults to '' Args:
:type suffix: str, optional suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
:return: Checkpoint pattern
:rtype: regular expression Returns:
str: The regular expression of checkpoint pattern.
""" """
ranks_name = _get_ranks_name() ranks_name = _get_ranks_name()
pattern = r'epoch(\d+)-{}{}\.pt'.format(ranks_name, suffix) pattern = r'epoch(\d+)-{}{}\.pt'.format(ranks_name, suffix)
@ -88,16 +89,19 @@ def get_latest_checkpoint_pattern(suffix: str = ''):
def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''): def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''):
"""This is a function to retrieve the latest checkpoint path from the (checkpoint_dir, suffix, gpu_parallel_rank) tuple. """This is a function to retrieve the latest checkpoint path from the tuple
(checkpoint_dir, suffix, gpu_parallel_rank).
This is useful during recuperation of the checkpoint, especially when you do not know the epoch number. This is useful during recuperation of the checkpoint, especially when you do not know the epoch number.
:param checkpoint_dir: Directory for saving checkpoints Args:
:type checkpoint_dir: str checkpoint_dir (str): Directory for saving checkpoints
:param suffix: Additional notation to specify the model or checkpoint, defaults to '' suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''
:type suffix: str, optional
:raises FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given Returns:
:return: The latest checkpoint path to be retrieved str: The latest retrieved checkpoint path.
:rtype: path
Raises:
FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given.
""" """
CKPT_NAME_PAT = get_latest_checkpoint_pattern(suffix=suffix) CKPT_NAME_PAT = get_latest_checkpoint_pattern(suffix=suffix)
@ -126,22 +130,19 @@ def save_checkpoint(checkpoint_path: str,
optimizer: torch.optim.Optimizer, optimizer: torch.optim.Optimizer,
lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None, lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
**kwargs): **kwargs):
"""Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as model, """Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as
optimizer, lr_scheduler and etc. into a checkpoint dictionary. model, optimizer, lr_scheduler etc. into a checkpoint dictionary.
This method can be used for both colosalai nn.BaseModel and normal pytorch nn.Module. This method can be used for both :class:`colossalai.nn.BaseModel` and normal :class:`torch.nn.Module`.
Args:
:param checkpoint_path: Set up a directory for saving checkpoints checkpoint_path (str): Set up a directory for saving checkpoints.
:type checkpoint_path: str epoch (int): Epoch number (indicate how many epochs have you trained this model).
:param epoch: Epoch number (indicate how many epochs have you trained this model) model (:class:`torch.nn.Module`): Model to be registered.
:type epoch: int optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to be registered.
:param model: Model to be registered lr_scheduler (Union[:class:`torch.optim.lr_scheduler`,
:type model: torch.nn.Module :class:`colossalai.nn.lr_scheduler`], optional): lr_scheduler to be registered, defaults to None.
:param optimizer: Optimizer to be registered kwargs (dict): additional parameters to be saved.
:type optimizer: torch.optim.Optimizer
:param lr_scheduler: lr_scheduler to be registered, defaults to None
:type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
""" """
# for compatibility with normal pytorch nn.Module # for compatibility with normal pytorch nn.Module
if hasattr(model, 'state_dict_for_save_checkpoint'): if hasattr(model, 'state_dict_for_save_checkpoint'):
@ -165,31 +166,31 @@ def load_checkpoint(checkpoint_path: str,
finetune: bool = False, finetune: bool = False,
strict: bool = True) -> Tuple: strict: bool = True) -> Tuple:
"""Loads the checkpoint file. """Loads the checkpoint file.
If finetune is False, then we intend to continue/resume the training process from the checkpoint given. If finetune is False, then we intend to continue/resume the training process from the checkpoint given.
So we copy parameters and buffers from state_dict into these modules(model, optimizer,lr_scheduler) So we copy parameters and buffers from state_dict into these modules (model, optimizer, lr_scheduler)
and its descendants. and its descendants.
If finetune is True, then only the weights and buffers of model should be reload.
If strict is True, then the keys of state_dict must exactly match the keys returned by this modules
state_dict() function.
:param checkpoint_path: The exact and matched checkpoint_path directory to retrieve appropriate state_dict If finetune is True, then only the weights and buffers of model should be reloaded.
:type checkpoint_path: str If strict is True, then the keys of state_dict must exactly match the keys returned
:param model: Model to reload parameters and buffers by this modules state_dict() function.
:type model: torch.nn.Module
:param optimizer: Optimizer to recuperate
:type optimizer: torch.optim.Optimizer
:param lr_scheduler: lr_scheduler to recuperate, defaults to None
:type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
:param finetune: Whether to finetune the model with new dataset or continue the pre-training, defaults to False
:type finetune: bool, optional
:param strict: Whether to strictly enforce that the keys in
:attr:`state_dict` of the checkpoint match the names of
parameters and buffers in model., defaults to True
:type strict: bool, optional
:raises ValueError: Raise error if the model/optimizer cannot successfully be recuperated
:return: (the epoch number of the checkpoint retrieved, the checkpoint retrieved)
:rtype: Tuple
Args:
checkpoint_path (str): The exact and matched checkpoint_path directory to retrieve appropriate state_dict.
model (:class:`torch.nn.Module`): Model to reload parameters and buffers.
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional):
lr_scheduler to recuperate, defaults to None.
finetune (bool, optional): Whether to finetune the model with new dataset or
continue the pre-training, defaults to False.
strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict`
of the checkpoint match the names of parameters and buffers in model, defaults to True.
Returns:
Tuple(int, ``checkpoint``): The tuple (the epoch number of the checkpoint retrieved, the checkpoint retrieved).
Raises:
ValueError: Raise error if the model/optimizer cannot successfully be recuperated
""" """
# Load the checkpoint. # Load the checkpoint.
checkpoint = torch.load(checkpoint_path, map_location='cpu') checkpoint = torch.load(checkpoint_path, map_location='cpu')
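A hedged round-trip sketch of the two helpers above; the import paths are assumptions, and ``get_checkpoint_path`` presumes the distributed context is already initialized so the rank name can be resolved.
>>> import torch
>>> from colossalai.utils import save_checkpoint, load_checkpoint, get_checkpoint_path  # assumed import path
>>> model = torch.nn.Linear(16, 4)
>>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
>>> path = get_checkpoint_path('./checkpoints', epoch=10)
>>> save_checkpoint(path, 10, model, optimizer)
>>> last_epoch, ckpt = load_checkpoint(path, model, optimizer, strict=True)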
@ -27,10 +27,10 @@ from .multi_tensor_apply import multi_tensor_applier
def print_rank_0(msg: str, logger=None): def print_rank_0(msg: str, logger=None):
"""Print messages and save logs(optional). This is executed only if you are the rank-0 gpu. """Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.
:param msg: A string message to output Args:
:type msg: str msg (str): A string message to output.
:param logger: Python logger object, defaults to None logger (:class:`colossalai.logging.DistributedLogger`, optional):
:type logger: optional The logger to record the message, defaults to None.
""" """
if gpc.get_global_rank() == 0: if gpc.get_global_rank() == 0:
if logger is None: if logger is None:
@ -53,12 +53,15 @@ def free_port():
def sync_model_param(model, parallel_mode): def sync_model_param(model, parallel_mode):
"""Make sure data parameters are consistent during Data Parallel Mode r"""Make sure data parameters are consistent during Data Parallel Mode.
:param model: A pyTorch nn.model on whose parameters you check the consistency Args:
:param parallel_mode: Parallel mode to be checked model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
:type model: torch.nn.Module parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked.
:type parallel_mode: colossalai.context.ParallelMode
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
for param in model.parameters(): for param in model.parameters():
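A short call-pattern sketch for sync_model_param, under the stated precondition that a distributed context has already been launched; the colossalai.utils import and the toy model are assumptions.

# Hypothetical sketch: make all data-parallel ranks start from identical weights.
import torch.nn as nn
from colossalai.context import ParallelMode       # ParallelMode is referenced in the docstring above
from colossalai.utils import sync_model_param     # assumed re-export

model = nn.Linear(16, 4)
# Effectively a no-op unless ParallelMode.DATA is initialized with world size > 1.
sync_model_param(model, ParallelMode.DATA)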
@ -146,18 +149,19 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
"""Clips gradient norm of an iterable of parameters whose gradients are in fp32. """Clips gradient norm of an iterable of parameters whose gradients are in fp32.
This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and
added functionality to handle model parallel parameters. Note that added functionality to handle model parallel parameters.
the gradients are modified in place.
:param parameters: An iterable of Tensors or a single Tensor that will have gradients normalized Note:
:type parameters: (Iterable[Tensor] or Tensor) the gradients are modified in place.
:param max_norm: Max norm of the gradients
:type max_norm: float or int
:param norm_type: Type of the used p-norm. Can be ``'inf'`` for infinity norm.
:type norm_type: float or int
:return: Total norm of the parameters (viewed as a single vector). Args:
:rtype: float parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`):
An iterable of Tensors or a single Tensor that will have gradients normalized.
max_norm (Union[float, int]): Max norm of the gradients.
norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm.
Returns:
float: Total norm of the parameters.
""" """
if isinstance(parameters, torch.Tensor): if isinstance(parameters, torch.Tensor):
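A sketch of where the call sits in a training step; the colossalai.utils import is an assumption, and a launched parallel context is assumed since the function also handles model-parallel parameters.

# Hypothetical sketch: clip fp32 gradients right before the optimizer step.
import torch
import torch.nn as nn
from colossalai.utils import clip_grad_norm_fp32  # assumed re-export

model = nn.Linear(32, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

loss = model(torch.randn(8, 32)).sum()
loss.backward()

# Returns the total norm of the parameters viewed as a single vector.
total_norm = clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=2)
optimizer.step()
optimizer.zero_grad()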

View File

@ -19,18 +19,15 @@ T_co = TypeVar('T_co', covariant=True)
@DATA_SAMPLERS.register_module @DATA_SAMPLERS.register_module
class DataParallelSampler(Sampler): class DataParallelSampler(Sampler):
"""A data sampler for distributed data parallelism """A data sampler for distributed data parallelism.
:param dataset: A Dataset instance Args:
:type dataset: torch.utils.data.Dataset dataset (:class:`torch.utils.data.Dataset`): The Dataset for sampling.
:param shuffle: Whether to shuffle data, defaults to False shuffle (bool, optional): Whether to shuffle data, defaults to False.
:type shuffle: bool, optional seed (int, optional): The random seed used for sampling, defaults to 0.
:param seed: The random seed, defaults to 0 drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
:type seed: int, optional is not divisible by the batch size. If False and the size of dataset is not divisible by
:param drop_last: Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch the batch size, then the last batch will be smaller, defaults to False.
size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller,
defaults to False
:type drop_last: bool, optional
""" """
def __init__(self, def __init__(self,
@ -104,8 +101,8 @@ class DataParallelSampler(Sampler):
use a different random ordering for each epoch. Otherwise, the next iteration of this use a different random ordering for each epoch. Otherwise, the next iteration of this
sampler will yield the same ordering. sampler will yield the same ordering.
:param epoch: Epoch number. Args:
:type epoch: int epoch (int): Epoch number.
""" """
self.epoch = epoch self.epoch = epoch
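A sketch of the per-epoch call pattern implied by the note above; the sampler import path, the launched data-parallel context it needs, and the toy dataset are assumptions.

# Hypothetical sketch: call set_epoch() so each epoch gets a fresh shuffle order.
import torch
from torch.utils.data import DataLoader, TensorDataset
from colossalai.utils import DataParallelSampler  # assumed re-export; needs an initialized data-parallel group

dataset = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))
sampler = DataParallelSampler(dataset, shuffle=True, seed=0, drop_last=False)
loader = DataLoader(dataset, batch_size=16, sampler=sampler)

for epoch in range(3):
    sampler.set_epoch(epoch)    # otherwise every epoch yields the same ordering
    for data, label in loader:
        pass                    # training step goes here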
@ -118,29 +115,27 @@ def get_dataloader(dataset,
pin_memory=False, pin_memory=False,
num_workers=0, num_workers=0,
**kwargs): **kwargs):
"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not) r"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
.. note:: When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data Note:
on the 1st stage and label on the last stage When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
on the 1st stage and label on the last stage.
:param dataset: A :class:`torch.utils.data.Dataset` object Args:
:param shuffle: Whether to shuffle the dataset dataset (:class:`torch.utils.data.Dataset`): The dataset to be loaded.
:param seed: Random worker seed, defaults to 1024 shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
:param add_sampler: Add DistributedDataParallelSampelr to the dataset seed (int, optional): Random worker seed for sampling, defaults to 1024.
:param drop_last: Drop the last incomplete batch of data add_sampler (bool, optional): Whether to add ``DataParallelSampler`` to the dataset. Defaults to True.
:param pin_memory: Whether to pin memory address in CPU memory drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
:param num_workers: Number of worker threads for this dataloader is not divisible by the batch size. If False and the size of dataset is not divisible by
the batch size, then the last batch will be smaller, defaults to False.
pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
`DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
:type dataset: :class:`torch.utils.data.Dataset` Returns:
:type shuffle: bool, optional. Default is False :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
:type seed: int, optional. Default is 1024
:type add_sampler: bool, optional. Default is True
:type drop_last: bool, optional. Default is False
:type pin_memory: bool, optional. Default is False
:type num_workers: int, optional. Default is 0
:return: A object of :class:`torch.utils.data.DataLoader`
:rtype: :class:`torch.utils.data.DataLoader`
""" """
_kwargs = kwargs.copy() _kwargs = kwargs.copy()
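A sketch of the documented parameters in one call; the colossalai.utils import is an assumption, batch_size is forwarded through **kwargs, and add_sampler=True presumes a launched data-parallel context.

# Hypothetical sketch of get_dataloader with the arguments documented above.
import torch
from torch.utils.data import TensorDataset
from colossalai.utils import get_dataloader   # assumed re-export

train_set = TensorDataset(torch.randn(256, 32), torch.randint(0, 10, (256,)))

train_loader = get_dataloader(train_set,
                              shuffle=True,       # keep False when pipeline parallel is enabled
                              seed=1024,
                              add_sampler=True,   # attaches the data-parallel sampler
                              drop_last=True,
                              pin_memory=True,
                              num_workers=2,
                              batch_size=64)      # forwarded to torch.utils.data.DataLoader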

View File

@ -13,19 +13,25 @@ def accumulate_gradient(model: nn.Module,
accumulate_size: int, accumulate_size: int,
gradient_handlers: List[BaseGradientHandler] = None, gradient_handlers: List[BaseGradientHandler] = None,
lr_scheduler: _LRScheduler = None): lr_scheduler: _LRScheduler = None):
""" r"""Turning model, optimizer, dataloader into corresponding object for gradient accumulation.
:param model: your model object
:type model: :class:`torch.nn.Module` Args:
:param optimizer: your optimizer object model (:class:`torch.nn.Module`): your model object for gradient accumulation.
:type optimizer: :class:`torch.optim.Optimizer` optimizer (:class:`torch.optim.Optimizer`): your optimizer object for gradient accumulation.
:param dataloader: your dataloader object dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
:type dataloader: Iterable your dataloader object, which will be consumed via iter(dataloader).
:param accumulate_size: the number of steps to accumulate gradients accumulate_size (int): the number of steps to accumulate gradients
:type accumulate_size: int gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
:param gradient_handlers: list of gradient handler objects. Default is None list of gradient handler objects. Default is None.
:type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`] lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
:param lr_scheduler: your lr scheduler object. Default is None your ``lr_scheduler`` object for gradient accumulation. Defaults to None.
:type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
More details about `gradient_handlers` could be found in
`Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
More details about `lr_scheduler` could be found in
`lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_ and
`how to adjust learning rate <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
""" """
optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model) optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size) dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
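A sketch that mirrors the two wrapping calls shown above in a toy loop; the import path and the exact deferral behaviour of step()/zero_grad() are taken from the docstrings here and should be read as assumptions, not a verified trace.

# Hypothetical sketch: wrap optimizer and dataloader so real updates happen every 4 steps.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from colossalai.utils.gradient_accumulation import GradAccumOptimizer, GradAccumDataloader  # assumed path

model = nn.Linear(32, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
dataset = TensorDataset(torch.randn(40, 32), torch.randn(40, 4))
dataloader = DataLoader(dataset, batch_size=4)

optimizer = GradAccumOptimizer(optimizer, accumulate_size=4, model=model)
dataloader = GradAccumDataloader(dataloader, accumulate_size=4)

for data, label in dataloader:
    loss = nn.functional.mse_loss(model(data), label)
    loss.backward()
    optimizer.step()        # intended to be a real update only at every 4th call
    optimizer.zero_grad()   # likewise deferred until the accumulation boundary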

View File

@ -15,15 +15,13 @@ from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer): class GradAccumOptimizer(ColossalaiOptimizer):
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps """A wrapper for the optimizer to enable gradient accumulation by skipping the steps
before accumulation size is reached before accumulation size is reached.
:param optim: Your optimizer object
:type optim: :class:`torch.optim.Optimizer`
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
:param model: Your model object to check if it is DDP for special handling of no_sync() context
:type model: :class:`torch.nn.Module`
Args:
optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
model (:class:`torch.nn.Module`):
Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
""" """
def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None): def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
@ -76,18 +74,18 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumDataloader: class GradAccumDataloader:
"""A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps. """A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.
For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will Note:
be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle. The dataloader would drop the last incomplete steps for gradient accumulation.
Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader, For example, if a dataloader has 10 batches of data and accumulate size is 4, the model parameters will
(e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches. be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Thus, they will be automatically skipped by this class. If the dataloader is not a standard PyTorch dataloader,
:param dataloader: Your dataloader object (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
:type dataloader: Iterable
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
Args:
dataloader (``Iterable``): Your dataloader object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
""" """
def __init__(self, dataloader: Iterable, accumulate_size: int) -> None: def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
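The arithmetic in the note above, spelled out as a plain-Python check (no ColossalAI objects involved): with 10 batches and an accumulate size of 4, only 8 batches contribute and the parameters are updated twice.

num_batches = 10
accumulate_size = 4

batches_used = num_batches // accumulate_size * accumulate_size        # 8
update_steps = [step for step in range(1, batches_used + 1)
                if step % accumulate_size == 0]                        # [4, 8]

print(batches_used)   # 8  -> the last 2 batches are dropped or consumed without an update
print(update_steps)   # [4, 8]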
@ -125,13 +123,12 @@ class GradAccumDataloader:
class GradAccumLrSchedulerByStep(_LRScheduler): class GradAccumLrSchedulerByStep(_LRScheduler):
"""A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
before accumulation size is reached before accumulation size is reached.
:param lr_scheduler: Your lr scheduler object
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
Args:
lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
Your ``lr_scheduler`` object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
""" """
def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None: def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
@ -171,13 +168,16 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class GradAccumGradientHandler: class GradAccumGradientHandler:
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps r"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached before accumulation size is reached.
:param grad_handler: Your gradient handler object Args:
:type grad_handler: :class:`colossalai.engine.BaseGradientHandler` grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
:param accumulate_size: The number of steps to accumulate gradients Your ``gradient_handler`` object for gradient accumulation, which will be called once `accumulate_size` is reached.
:type accumulate_size: int accumulate_size (int): The number of steps to accumulate gradients.
More details about ``gradient_handlers`` could be found in
`Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
""" """

View File

@ -14,12 +14,13 @@ from typing import Optional
def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int: def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
""" """Get the free memory info of device.
Get the free memory info of device.
:param device: a torch device instance or None Args:
:type device: Optional[torch.device] device (Optional[``torch.device``]): a torch device instance or None. Defaults to None.
:return: current memory usage, sized by Byte
:rtype: int Returns:
int: current memory usage, sized by Byte.
""" """
if device: if device:
assert device.type == 'cuda' assert device.type == 'cuda'
@ -34,7 +35,7 @@ def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
def bytes_to_GB(val, decimal=2): def bytes_to_GB(val, decimal=2):
"""A byte-to-Gigabyte converter, defaultly using binary notation. """A byte-to-Gigabyte converter, default using binary notation.
:param val: X bytes to convert :param val: X bytes to convert
:return: X' GB :return: X' GB
@ -43,7 +44,7 @@ def bytes_to_GB(val, decimal=2):
def bytes_to_MB(val, decimal=2): def bytes_to_MB(val, decimal=2):
"""A byte-to-Megabyte converter, defaultly using binary notation. """A byte-to-Megabyte converter, default using binary notation.
:param val: X bytes to convert :param val: X bytes to convert
:return: X' MB :return: X' MB
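The binary notation these converters rely on, as a small worked example in plain Python (mirroring, not calling, the helpers above): 1 GB = 2**30 bytes and 1 MB = 2**20 bytes.

val = 3 * 2 ** 30 + 512 * 2 ** 20      # 3.5 GiB expressed in bytes
print(round(val / 2 ** 30, 2))         # 3.5    -> the GB figure bytes_to_GB(val) should report under binary notation
print(round(val / 2 ** 20, 2))         # 3584.0 -> the MB figure bytes_to_MB(val) should report under binary notation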
@ -54,13 +55,13 @@ def bytes_to_MB(val, decimal=2):
def report_memory_usage(message, logger=None, report_cpu=False): def report_memory_usage(message, logger=None, report_cpu=False):
"""Calculate and print RAM usage (in GB) """Calculate and print RAM usage (in GB)
:param message: A prefix message to add in the log Args:
:type message: str message (str): A prefix message to add in the log.
:param logger: An instance of :class:`colossalai.logging.DistributedLogger` logger (:class:`colossalai.logging.DistributedLogger`): The logger used to record memory information.
:type logger: :class:`colossalai.logging.DistributedLogger`, optional report_cpu (bool, optional): Whether to report CPU memory.
:param report_cpu: Whether to report CPU memory
:type report_cpu: bool, optional Raises:
:raises EnvironmentError: Raise error if no distributed environment has been initialized EnvironmentError: Raised if no distributed environment has been initialized.
""" """
if not gpc.is_initialized(ParallelMode.GLOBAL): if not gpc.is_initialized(ParallelMode.GLOBAL):
raise EnvironmentError("No distributed environment is initialized") raise EnvironmentError("No distributed environment is initialized")

View File

@ -12,8 +12,8 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
size of every parameter. Since the parameters in data parallelism are replicated size of every parameter. Since the parameters in data parallelism are replicated
in each GPU, we set their ep_size to 1. in each GPU, we set their ep_size to 1.
:param model: A pyTorch nn.model from which we get dict Args:
:type model: torch.nn.Module model (:class:`torch.nn.Module`): A pyTorch `nn.Module` from which we get dict.
""" """
epsize_param_dict = dict() epsize_param_dict = dict()
for param in model.parameters(): for param in model.parameters():
@ -29,10 +29,10 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
def sync_moe_model_param(model: nn.Module): def sync_moe_model_param(model: nn.Module):
"""Make sure model parameters are consistent in MoE parallel context """Make sure model parameters are consistent in MoE parallel context.
:param model: A pyTorch nn.model on whose parameters you check the consistency Args:
:type model: torch.nn.Module model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
""" """
if is_using_ddp(): if is_using_ddp():
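A brief sketch of how the two MoE helpers are meant to be used together; the import path, the need for a launched MoE/DDP context, and the placeholder model are all assumptions.

# Hypothetical sketch: inspect parameters grouped by expert-parallel size, then sync them.
import torch.nn as nn
from colossalai.utils.moe import get_moe_epsize_param_dict, sync_moe_model_param  # assumed path

model = nn.Linear(8, 8)   # placeholder; a real MoE model carries expert-parallel info on its params
for ep_size, params in get_moe_epsize_param_dict(model).items():
    print(ep_size, len(params))   # plain data-parallel params are expected under ep_size == 1

sync_moe_model_param(model)       # no-op unless DDP / MoE parallel groups are initialized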

View File

@ -3,10 +3,10 @@
class MultiTensorApply(object): class MultiTensorApply(object):
""" """
Apply an operation to a list of tensors efficiently Apply an operation to a list of tensors efficiently.
:param chunk_size: Size of a chunk Args:
:type chunk_size: int chunk_size (int): Size of a chunk.
""" """
available = False available = False

View File

@ -9,6 +9,7 @@ from collections import defaultdict
LINE_WIDTH = 108 LINE_WIDTH = 108
LINE = '-' * LINE_WIDTH + '\n' LINE = '-' * LINE_WIDTH + '\n'
class TensorDetector(): class TensorDetector():
def __init__(self, def __init__(self,
show_info: bool = True, show_info: bool = True,
@ -16,17 +17,14 @@ class TensorDetector():
include_cpu: bool = False, include_cpu: bool = False,
module: Optional[nn.Module] = None module: Optional[nn.Module] = None
): ):
"""This class is an detector to detect tensor on different devices. """This class is a detector to detect tensor on different devices.
:param show_info: whether to print the info on screen, default True
:type show_info: bool
:param log: the file name to save the log
:type log: str
:param include_cpu: whether to detect tensor on cpu, default False
:type include_cpu: bool
:param module: when sending an `nn.Module` it, the detector can name the tensors detected better
:type module: Optional[nn.Module]
Args:
show_info (bool, optional): whether to print the info on screen, default True.
log (str, optional): the file name to save the log. Defaults to None.
include_cpu (bool, optional): whether to detect tensor on cpu, default False.
module (Optional[:class:`nn.Module`]): when sending an ``nn.Module`` object,
the detector can name the tensors detected better.
""" """
self.show_info = show_info self.show_info = show_info
self.log = log self.log = log
@ -49,7 +47,6 @@ class TensorDetector():
self.tensor_info[id(param)].append(param.dtype) self.tensor_info[id(param)].append(param.dtype)
self.tensor_info[id(param)].append(self.get_tensor_mem(param)) self.tensor_info[id(param)].append(self.get_tensor_mem(param))
def get_tensor_mem(self, tensor): def get_tensor_mem(self, tensor):
# calculate the memory occupied by a tensor # calculate the memory occupied by a tensor
memory_size = tensor.element_size() * tensor.storage().size() memory_size = tensor.element_size() * tensor.storage().size()
@ -58,7 +55,6 @@ class TensorDetector():
memory_size += grad_memory_size memory_size += grad_memory_size
return self.mem_format(memory_size) return self.mem_format(memory_size)
def mem_format(self, real_memory_size): def mem_format(self, real_memory_size):
# format the tensor memory into a reasonable magnitude # format the tensor memory into a reasonable magnitude
if real_memory_size >= 2 ** 30: if real_memory_size >= 2 ** 30:
@ -69,7 +65,6 @@ class TensorDetector():
return str(real_memory_size / (2 ** 10)) + ' KB' return str(real_memory_size / (2 ** 10)) + ' KB'
return str(real_memory_size) + ' B' return str(real_memory_size) + ' B'
def collect_tensors_state(self): def collect_tensors_state(self):
for obj in gc.get_objects(): for obj in gc.get_objects():
if torch.is_tensor(obj): if torch.is_tensor(obj):
@ -116,7 +111,6 @@ class TensorDetector():
if obj.device not in self.devices: if obj.device not in self.devices:
self.devices.append(obj.device) self.devices.append(obj.device)
def print_tensors_state(self): def print_tensors_state(self):
template_format = '{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}' template_format = '{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}'
self.info += LINE self.info += LINE
@ -174,7 +168,6 @@ class TensorDetector():
with open(self.log + '.log', 'a') as f: with open(self.log + '.log', 'a') as f:
f.write(self.info) f.write(self.info)
def detect(self, include_cpu = False): def detect(self, include_cpu = False):
self.include_cpu = include_cpu self.include_cpu = include_cpu
self.collect_tensors_state() self.collect_tensors_state()
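A usage sketch based on the constructor and detect() shown in this file; the colossalai.utils import is an assumption, and the device guard is only there so the snippet also runs on a CPU-only box.

# Hypothetical sketch: track which tensors are alive before and after a forward pass.
import torch
import torch.nn as nn
from colossalai.utils import TensorDetector   # assumed re-export

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = nn.Linear(64, 64).to(device)

detector = TensorDetector(show_info=True, log='tensor_log', include_cpu=True, module=model)
detector.detect()                              # snapshot before the forward pass

activation = model(torch.randn(4, 64, device=device))
detector.detect(include_cpu=True)              # newly created tensors show up in this report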

View File

@ -25,7 +25,7 @@ class Timer:
return time.time() return time.time()
def start(self): def start(self):
"""Fisrtly synchronize cuda, reset the clock and then start the timer. """Firstly synchronize cuda, reset the clock and then start the timer.
""" """
self._elapsed = 0 self._elapsed = 0
synchronize() synchronize()
@ -40,10 +40,11 @@ class Timer:
def stop(self, keep_in_history: bool = False): def stop(self, keep_in_history: bool = False):
"""Stop the timer and record the start-stop time interval. """Stop the timer and record the start-stop time interval.
:param keep_in_history: Whether does it record into history each start-stop interval, defaults to False Args:
:type keep_in_history: bool, optional keep_in_history (bool, optional): Whether to record this start-stop interval
:return: Start-stop interval in the history, defaults to False.
:rtype: int Returns:
int: Start-stop interval.
""" """
synchronize() synchronize()
end_time = time.time() end_time = time.time()
@ -57,26 +58,27 @@ class Timer:
def get_history_mean(self): def get_history_mean(self):
"""Mean of all history start-stop time intervals. """Mean of all history start-stop time intervals.
:return: Mean of time intervals Returns:
:rtype: int int: Mean of time intervals
""" """
return sum(self._history) / len(self._history) return sum(self._history) / len(self._history)
def get_history_sum(self): def get_history_sum(self):
"""Add up all the start-stop time intervals. """Add up all the start-stop time intervals.
:return: Sum of time intervals Returns:
:rtype: int int: Sum of time intervals.
""" """
return sum(self._history) return sum(self._history)
def get_elapsed_time(self): def get_elapsed_time(self):
"""Return the last start-stop time interval. """Return the last start-stop time interval.
.. note:: Use it only when timer is not in progress Returns:
int: The last time interval.
:return: The last time interval Note:
:rtype: int Use it only when the timer is not in progress.
""" """
assert not self._started, 'Timer is still in progress' assert not self._started, 'Timer is still in progress'
return self._elapsed return self._elapsed
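A usage sketch of the Timer methods documented above; the class is referred to as colossalai.utils.Timer elsewhere in this diff, so that import is the assumption made here, and time.sleep stands in for a real training step.

# Hypothetical sketch of the Timer API: start/stop, history, and elapsed time.
import time
from colossalai.utils import Timer   # the class is referenced as colossalai.utils.Timer in this diff

timer = Timer()
for _ in range(3):
    timer.start()
    time.sleep(0.01)                     # stand-in for one training step
    timer.stop(keep_in_history=True)     # record this interval in the history

print(timer.get_history_mean())          # mean of the three recorded intervals
print(timer.get_history_sum())           # total time across the three intervals
print(timer.get_elapsed_time())          # last interval; valid only while the timer is stopped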
@ -90,10 +92,10 @@ class Timer:
class MultiTimer: class MultiTimer:
"""An object contains multiple timers """An object contains multiple timers.
:param on: Whether the timer is enabled. Default is True Args:
:type on: bool, optional on (bool, optional): Whether the timer is enabled. Default is True.
""" """
def __init__(self, on: bool = True): def __init__(self, on: bool = True):
@ -101,10 +103,10 @@ class MultiTimer:
self._timers = dict() self._timers = dict()
def start(self, name: str): def start(self, name: str):
"""Start namely one of the timers """Start namely one of the timers.
:param name: Timer's key Args:
:type name: str name (str): Timer's key.
""" """
if self._on: if self._on:
if name not in self._timers: if name not in self._timers:
@ -114,10 +116,9 @@ class MultiTimer:
def stop(self, name: str, keep_in_history: bool): def stop(self, name: str, keep_in_history: bool):
"""Stop namely one of the timers. """Stop namely one of the timers.
:param name: Timer's key Args:
:type name: str name (str): Timer's key.
:param keep_in_history: Whether does it record into history each start-stop interval keep_in_history (bool): Whether to record this start-stop interval in the history.
:type keep_in_history: bool
""" """
if self._on: if self._on:
return self._timers[name].stop(keep_in_history) return self._timers[name].stop(keep_in_history)
@ -127,17 +128,19 @@ class MultiTimer:
def get_timer(self, name): def get_timer(self, name):
"""Get timer by its name (from multitimer) """Get timer by its name (from multitimer)
:param name: Timer's key Args:
:return: Timer with the name you give correctly name (str): Timer's key.
:rtype: Timer Returns:
:class:`colossalai.utils.Timer`: The timer registered under the given name.
""" """
return self._timers[name] return self._timers[name]
def reset(self, name=None): def reset(self, name=None):
"""Reset timers. """Reset timers.
:param name: If name is designated, the named timer will be reset and others will not, defaults to None Args:
:type name: optional name (str, optional): If name is designated, the named timer will be reset
and others will not, defaults to None.
""" """
if self._on: if self._on:
if name is not None: if name is not None:
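Finally, a usage sketch of MultiTimer tying the methods above together; the import is assumed to sit next to Timer in colossalai.utils, and the timer names are placeholders.

# Hypothetical sketch: keep independent named timers for different phases of a step.
import time
from colossalai.utils import MultiTimer   # assumed re-export next to Timer

timers = MultiTimer(on=True)

for phase in ('forward', 'backward'):
    timers.start(phase)
    time.sleep(0.005)                         # stand-in for the actual work
    timers.stop(phase, keep_in_history=True)

print(timers.get_timer('forward').get_history_sum())
timers.reset('forward')   # resets only the named timer; reset() with no name clears all of them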