ColossalAI (mirror of https://github.com/hpcaitech/ColossalAI.git)

Commit ec5086c49c: "Refactored docstring to google style" (parent: 53b1b6e340)
Files changed in this commit:

colossalai/
    amp/
    builder/
    communication/
    context/
        config.py, parallel_context.py
        process_group_initializer/
            initializer_1d.py, initializer_2d.py, initializer_2p5d.py, initializer_3d.py,
            initializer_data.py, initializer_model.py, initializer_pipeline.py,
            initializer_sequence.py, initializer_tensor.py, process_group_initializer.py
        random/
    engine/
    initialize.py
    logging/
    nn/
        init.py
        layer/
            colossalai_layer/, moe/, parallel_1d/, parallel_2d/, parallel_2p5d/,
            parallel_3d/, parallel_sequence/, utils/, vanilla/, wrapper/
        loss/
        lr_scheduler/
        metric/
    registry/
    trainer/
    utils/
        activation_checkpoint.py, checkpointing.py, common.py
        data_sampler/, gradient_accumulation/, memory_utils/
        moe.py, multi_tensor_apply/, tensor_detector/
        timer.py
tests/test_moe/
@@ -12,21 +12,27 @@ from .naive_amp import convert_to_naive_amp


 def convert_to_amp(model: nn.Module, optimizer: Optimizer, criterion: _Loss, mode: AMP_TYPE, amp_config: Config = None):
-    """A helper function to wrap training components with Torch AMP modules
+    """A helper function to wrap training components with Torch AMP modules.

-    :param model: your model object
-    :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer object
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :param criterion: your loss function object
-    :type criterion: :class:`torch.nn.modules.loss._Loss`
-    :param mode: amp mode
-    :type mode: :class:`colossalai.amp.AMP_TYPE`
-    :param amp_config: configuration for different amp modes
-    :type amp_config: :class:`colossalai.context.Config` or dict
+    Args:
+        model (:class:`torch.nn.Module`): your model object.
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
+        criterion (:class:`torch.nn.modules.loss._Loss`): your loss function object.
+        mode (:class:`colossalai.amp.AMP_TYPE`): AMP mode.
+        amp_config (:class:`colossalai.context.Config` or dict): configuration for different AMP modes.

-    :return: (model, optimizer, criterion)
-    :rtype: Tuple
+    Returns:
+        A tuple (model, optimizer, criterion).
+
+    Note:
+        ``amp_config`` may vary depending on the mode you choose. You should check the corresponding AMP mode
+        for more details about ``amp_config``.
+        For ``apex_amp``, please check
+        `apex_amp config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
+        For ``naive_amp``, please check
+        `naive_amp config <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/amp/naive_amp/_fp16_optimizer.py#L42>`_.
+        For ``torch_amp``, please check
+        `torch_amp config <https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py#L97>`_.
     """
     assert isinstance(mode, AMP_TYPE), \
         f'expected the argument mode be AMP_TYPE, but got {type(mode)}'
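For readers of this diff, a minimal usage sketch of the helper documented above (hedged: it assumes the distributed context has already been set up via ``colossalai.launch``; the model, optimizer and config values are illustrative only):

    import torch
    import torch.nn as nn
    from colossalai.amp import AMP_TYPE, convert_to_amp

    model = nn.Linear(16, 4).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
    criterion = nn.CrossEntropyLoss()

    # Wrap the components with the Torch AMP backend; amp_config is optional
    # and is forwarded to the underlying gradient scaler.
    model, optimizer, criterion = convert_to_amp(model, optimizer, criterion,
                                                 mode=AMP_TYPE.TORCH,
                                                 amp_config=dict(init_scale=2.**16))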
@@ -4,17 +4,33 @@ from torch.optim import Optimizer


 def convert_to_apex_amp(model: nn.Module, optimizer: Optimizer, amp_config):
-    """A helper function to wrap training components with Apex AMP modules
+    r"""A helper function to wrap training components with Apex AMP modules

-    :param model: your model object
-    :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer object
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :param amp_config: configuration for nvidia apex
-    :type amp_config: :class:`colossalai.context.Config` or dict
+    Args:
+        model (:class:`torch.nn.Module`): your model object.
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
+        amp_config (:class:`colossalai.context.Config` or dict): configuration for initializing apex_amp.

-    :return: (model, optimizer)
-    :rtype: Tuple
+    The ``amp_config`` should include parameters below:
+    ::
+
+        enabled (bool, optional, default=True)
+        opt_level (str, optional, default="O1")
+        cast_model_type (``torch.dtype``, optional, default=None)
+        patch_torch_functions (bool, optional, default=None)
+        keep_batchnorm_fp32 (bool or str, optional, default=None)
+        master_weights (bool, optional, default=None)
+        loss_scale (float or str, optional, default=None)
+        cast_model_outputs (torch.dtype, optional, default=None)
+        num_losses (int, optional, default=1)
+        verbosity (int, default=1)
+        min_loss_scale (float, default=None)
+        max_loss_scale (float, default=2.**24)
+
+    Returns:
+        A tuple (model, optimizer).
+
+    For more details about ``amp_config``, refer to `amp_config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
     """
     import apex.amp as apex_amp
     model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)
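A hedged sketch of an Apex ``amp_config``, expressed as the ``fp16`` section of a ColossalAI config file (the keys mirror the list above and the values are illustrative; the ``fp16`` section name follows common ColossalAI usage and is an assumption here):

    from colossalai.amp import AMP_TYPE

    # O2 keeps batchnorm and master weights in fp32 while casting most ops to fp16.
    fp16 = dict(
        mode=AMP_TYPE.APEX,
        opt_level='O2',
        keep_batchnorm_fp32=True,
        loss_scale='dynamic',
    )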
@@ -21,8 +21,8 @@ class ApexAMPOptimizer(ColossalaiOptimizer):
     def backward(self, loss: Tensor):
         """Backward pass to get all gradients

-        :param loss: Loss computed by a loss function
-        :type loss: torch.Tensor
+        Args:
+            loss (torch.Tensor): Loss computed by a loss function
         """
         with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
             scaled_loss.backward()

@@ -30,10 +30,9 @@ class ApexAMPOptimizer(ColossalaiOptimizer):
     def clip_grad_norm(self, model: nn.Module, max_norm: float):
         """Clip gradients' norm

-        :param model: Your model object
-        :type model: torch.nn.Module
-        :param max_norm: The max norm value for gradient clipping
-        :type max_norm: float
+        Args:
+            model (torch.nn.Module): Your model object
+            max_norm (float): The max norm value for gradient clipping
         """
         if max_norm > 0:
             clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)
@@ -4,20 +4,30 @@ from torch.optim import Optimizer
 from colossalai.utils import is_no_pp_or_last_stage
 from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
 from .grad_scaler import DynamicGradScaler, ConstantGradScaler
 from ._fp16_optimizer import FP16Optimizer


 def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
-    """A helper function to wrap training components with naive AMP modules
+    """A helper function to wrap training components with naive AMP modules. In this mode,
+    we forcibly cast the model weights and inputs to FP16, and cast the model outputs to FP32 to calculate loss,
+    which is equivalent to Apex O3.

-    :param model: your model object
-    :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer object
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :param amp_config: configuration for naive mode amp
-    :type amp_config: :class:`colossalai.context.Config` or dict
+    Args:
+        model (:class:`torch.nn.Module`): your model object
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object
+        amp_config (:class:`colossalai.context.Config` or dict): configuration for naive mode amp.

-    :return: (model, optimizer)
-    :rtype: Tuple
+    The ``amp_config`` should contain parameters below:
+    ::
+
+        verbose (bool, optional): if set to `True`, will print debug info (Default: False).
+        clip_grad_norm (float, optional): clip gradients with this global L2 norm (Default 0).
+            Note that clipping is ignored if clip_grad == 0.
+        dynamic_grad_scale (bool): whether to use dynamic grad scaler.
+
+    Returns:
+        A tuple (model, optimizer)
     """
     if isinstance(model, nn.ModuleList):
         # interleaved pipeline

@@ -46,4 +56,4 @@ def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
     return model, optimizer


-__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer']
+__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer', 'FP16Optimizer']
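A similar hedged sketch for the naive backend, using the three parameters documented above (the ``fp16`` config-section convention is an assumption borrowed from common ColossalAI usage):

    from colossalai.amp import AMP_TYPE

    fp16 = dict(
        mode=AMP_TYPE.NAIVE,
        clip_grad_norm=1.0,        # 0 disables clipping
        dynamic_grad_scale=True,   # use the dynamic grad scaler
        verbose=False,
    )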
@@ -41,25 +41,14 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):

 class FP16Optimizer(Optimizer):
     """Float16 optimizer for fp16 and bf16 data types.

-    :param optimizer: base optimizer such as Adam or SGD
-    :type optimizer: torch.optim.Optimizer
-    :param clip_grad: clip gradeints with this global L2 norm. Note that clipping is ignored if clip_grad == 0
-    :type param clip_grad: float
-    :param log_num_zeros_in_grad: return number of zeros in the gradients.
-    :type log_num_zeros_in_grad: bool
-    :param initial_scale: initial scale of gradient scaler
-    :type initial_scale: int
-    :param growth_factor: the growth rate of loss scale
-    :type growth_factor: int
-    :param backoff_factor: the decrease rate of loss scale
-    :type backoff_factor: float
-    :param hysterisis: delay shift in dynamic loss scaling
-    :type hysterisis: int
-    :param max_scale: maximum loss scale allowed
-    :type max_scale: int
-    :param verbose: if set to `True`, will print debug info
-    :type verbose: bool
-
+    Args:
+        optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD
+        grad_scaler (BaseGradScaler): gradient scaler, chosen from ``constant_grad_scaler`` or
+            ``dynamic_grad_scaler``.
+        clip_grad_norm (float, optional): clip gradients with this global L2 norm. Default 0.
+            Note that clipping is ignored if clip_grad == 0
+        verbose (bool, optional): if set to `True`, will print debug info. Default False.
     """

     def __init__(self,
@@ -18,11 +18,15 @@ from ._fp16_optimizer import FP16Optimizer
 class NaiveAMPOptimizer(ColossalaiOptimizer):
     """A wrapper class for optimizer to cast all parameters to fp16

-    :param optim: A normal optimizer like Adam or SGD
-    :param args: Args used to initialize FP16 optimizer
-    :param kwargs: Kwargs used to initialize FP16 optimizer
+    Args:
+        optim (torch.optim.Optimizer): A normal optimizer like Adam or SGD.
+        grad_scaler (BaseGradScaler): gradient scaler, chosen from ``constant_grad_scaler`` or
+            ``dynamic_grad_scaler``.
+        clip_grad_norm (float, optional): clip gradients with this global L2 norm. Default 0.
+        verbose (bool, optional): if set to `True`, will print debug info. Default False.

-    :type optim: torch.optim.Optimizer
+    Note:
+        clipping is ignored if ``clip_grad_norm`` equals 0.
     """

     def __init__(self, optim: Optimizer, *args, **kwargs):

@@ -40,8 +44,19 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):


 class NaiveAMPModel(nn.Module):
-    """A wrapper class for model to cast the model into fp16 and
+    r"""A wrapper class for model to cast the model into fp16 and
     automatically cast the input and output

+    Args:
+        model (torch.nn.Module): torch.nn.Module to be wrapped.
+        output_to_fp32 (bool, optional): Whether cast output of this module into fp32. (Default: True)
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this module.
+            (Default: ``ParallelMode.DATA``)
+        sync_buffer (bool, optional): whether to synchronize buffer. (Default: True)
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
     """

     def __init__(self,
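A short sketch of wrapping a module directly with ``NaiveAMPModel`` using the arguments listed above (hedged: it assumes the data-parallel group has already been initialized, since buffers are synchronized over ``parallel_mode``; the module itself is illustrative):

    import torch.nn as nn
    from colossalai.amp.naive_amp import NaiveAMPModel
    from colossalai.context import ParallelMode

    model = nn.Linear(16, 4).cuda()
    # Weights and inputs run in fp16; outputs are cast back to fp32 for the loss.
    model = NaiveAMPModel(model,
                          output_to_fp32=True,
                          parallel_mode=ParallelMode.DATA,
                          sync_buffer=True)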
@@ -10,18 +10,25 @@ def convert_to_torch_amp(model: nn.Module,
                          optimizer: Optimizer,
                          criterion: Optional[_Loss] = None,
                          amp_config: Optional[Config] = None):
-    """A helper function to wrap training components with Torch AMP modules
+    """A helper function to wrap training components with PyTorch AMP modules

-    :param model: your model object
-    :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer object
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :param criterion: your loss function object
-    :type criterion: :class:`torch.nn.modules.loss._Loss`, optional
-    :param amp_config: configuration for different amp modes
-    :type amp_config: :class:`colossalai.context.Config` or dict, optional
-    :return: (model, optimizer, criterion)
-    :rtype: Tuple
+    Args:
+        model (:class:`torch.nn.Module`): your model object.
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object
+        criterion (:class:`torch.nn.modules.loss._Loss`, optional): your loss function object
+        amp_config (:class:`colossalai.context.Config` or dict, optional): configuration for PyTorch AMP.
+
+    The ``amp_config`` should include parameters below:
+    ::
+
+        init_scale (float, optional, default=2.**16)
+        growth_factor (float, optional, default=2.0)
+        backoff_factor (float, optional, default=0.5)
+        growth_interval (int, optional, default=2000)
+        enabled (bool, optional, default=True)
+
+    Returns:
+        A tuple (model, optimizer, criterion)
     """
     model = TorchAMPModel(model)
     if amp_config is None:
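The ``amp_config`` for the PyTorch backend is just the ``GradScaler`` arguments listed above; a hedged example config section (values shown are the defaults, written out explicitly; the ``fp16`` section name is an assumption):

    from colossalai.amp import AMP_TYPE

    fp16 = dict(
        mode=AMP_TYPE.TORCH,
        init_scale=2.**16,
        growth_factor=2.0,
        backoff_factor=0.5,
        growth_interval=2000,
        enabled=True,
    )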
@@ -14,13 +14,19 @@ from colossalai.utils import clip_grad_norm_fp32


 class TorchAMPOptimizer(ColossalaiOptimizer):
-    """A wrapper class which integrate pytorch amp with an optimizer
+    """A wrapper class which integrates PyTorch AMP with an optimizer

-    :param optim: A normal optimizer like Adam or SGD
-    :param args: Args used to initialize gradient scaler
-    :param kwargs: Kwargs used to initialize gradient scaler
-
-    :type optim: torch.optim.Optimizer
+    Args:
+        optim (torch.optim.Optimizer): A normal optimizer like Adam or SGD.
+        init_scale (float, optional, default=2.**16): Initial scale factor.
+        growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied during
+            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
+        backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied during
+            :meth:`update` if inf/NaN gradients occur in an iteration.
+        growth_interval (int, optional, default=2000): Number of consecutive iterations without inf/NaN gradients
+            that must occur for the scale to be multiplied by ``growth_factor``.
+        enabled (bool, optional, default=True): If ``False``, disables gradient scaling. :meth:`step` simply
+            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
     """

     def __init__(self, optim: Optimizer, *args, **kwargs):

@@ -30,8 +36,8 @@ class TorchAMPOptimizer(ColossalaiOptimizer):
     def backward(self, loss: Tensor):
         """Backward with torch amp gradient scaler

-        :param loss: Loss computed by a loss function
-        :type loss: torch.Tensor
+        Args:
+            loss (torch.Tensor): Loss computed by a loss function
         """
         self.scaler.scale(loss).backward()

@@ -44,10 +50,9 @@ class TorchAMPOptimizer(ColossalaiOptimizer):
     def clip_grad_norm(self, model: nn.Module, max_norm: float):
         """Apply gradient clipping to the model parameters

-        :param model: Your model object
-        :type model: torch.nn.Module
-        :param max_norm: Max norm value for gradient clipping
-        :type max_norm: float
+        Args:
+            model (torch.nn.Module): Your model object
+            max_norm (float): Max norm value for gradient clipping
         """
         if max_norm > 0.0:
             self.scaler.unscale_(self.optim)

@@ -71,8 +76,8 @@ class TorchAMPModel(nn.Module):
 class TorchAMPLoss(nn.Module):
     """A wrapper class for a criterion object which computes the loss in mixed-precision context

-    :param loss: A loss function object
-    :type loss: torch.nn.modules.loss._Loss
+    Args:
+        loss (torch.nn.modules.loss._Loss): A loss function object
     """

     def __init__(self, loss: _Loss):
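A hedged sketch of a single training step with the wrapped components returned by ``convert_to_torch_amp`` (the dataloader and max_norm value are illustrative; ``step()`` is assumed to perform the scaler step and update inside the wrapped optimizer):

    for data, target in train_dataloader:
        optimizer.zero_grad()
        output = model(data.cuda())
        loss = criterion(output, target.cuda())
        optimizer.backward(loss)                       # scales the loss, then backward
        optimizer.clip_grad_norm(model, max_norm=1.0)  # unscales, then clips
        optimizer.step()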
@@ -10,34 +10,40 @@ from colossalai.registry import *
 def build_from_config(module, config: dict):
     """Returns an object of :class:`module` constructed from `config`.

-    :param module: A python or user-defined class
-    :type module: class
-    :param config: A python dict containing information used in the construction
-        of the return object
-    :type config: dict
-    :raises AssertionError: Raises an AssertionError if `module` is not a class
-    :return: An object of interest
-    :rtype: Object
+    Args:
+        module: A python or user-defined class
+        config: A python dict containing information used in the construction of the return object
+
+    Returns: An ``object`` of interest
+
+    Raises:
+        AssertionError: Raises an AssertionError if `module` is not a class
     """
     assert inspect.isclass(module), 'module must be a class'
     return module(**config)


 def build_from_registry(config, registry: Registry):
-    """Returns an object constructed from `config`, the type of the object
+    r"""Returns an object constructed from `config`, the type of the object
     is specified by `registry`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.colossalai.context.Config`
-    :param registry: A registry specifying the type of the return object
-    :type registry: :class:`Registry`
-    :raises AssertionError: Raises an AssertionError if `registry` is not an object
-        of :class:`Registry` or `mod_type` in `config` is not found in `registry`
-    :raises Exception: Raises an Exception if an error occurred when building
-        from registry
-    :return: An object specified by `registry`
-    :rtype: Python object specified by `registry`
+    Note:
+        the `config` is used to construct the return object such as `LAYERS`,
+        `OPTIMIZERS` and other supported types in `registry`. The `config` should contain
+        all required parameters of the corresponding object. The details of supported
+        types in `registry` and the `mod_type` in `config` could be found in
+        `registry <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/registry/__init__.py>`_.
+
+    Args:
+        config (dict or :class:`colossalai.context.Config`): information
+            used in the construction of the return object.
+        registry (:class:`Registry`): A registry specifying the type of the return object
+
+    Returns: A Python object specified by `registry`
+
+    Raises:
+        Exception: Raises an Exception if an error occurred when building from registry
     """
     config_ = config.copy()  # keep the original config untouched
     assert isinstance(
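A brief sketch of registry-driven construction (hedged: it assumes the helpers are re-exported from ``colossalai.builder``, that the config's ``type`` key is the ``mod_type`` referred to above, and that ``Linear`` is a registered layer name):

    from colossalai.builder import build_from_registry
    from colossalai.registry import LAYERS

    # `type` selects the registered class; the remaining keys go to its constructor.
    layer_cfg = dict(type='Linear', in_features=256, out_features=128)
    layer = build_from_registry(layer_cfg, LAYERS)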
@@ -60,11 +66,13 @@ def build_from_registry(config, registry: Registry):
 def build_layer(config):
     """Returns a layer object of :class:`nn.Module` constructed from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.nn.Module`
-    :rtype: :class:`torch.nn.Module`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``LAYERS``.
+
+    Returns:
+        An object of :class:`torch.nn.Module`
     """
     return build_from_registry(config, LAYERS)

@@ -73,11 +81,13 @@ def build_loss(config):
     """Returns a loss function object of :class:`torch.autograd.Function` constructed
     from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.nn.modules.loss._Loss`
-    :rtype: :class:`torch.nn.modules.loss._Loss`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``LOSSES``.
+
+    Returns:
+        An object of :class:`torch.nn.modules.loss._Loss`
     """
     return build_from_registry(config, LOSSES)

@@ -85,11 +95,13 @@ def build_model(config):
     """Returns a model object of :class:`nn.Module` constructed from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.nn.Module`
-    :rtype: :class:`torch.nn.Module`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``MODELS``.
+
+    Returns:
+        An object of :class:`torch.nn.Module`
     """
     return build_from_registry(config, MODELS)

@@ -98,11 +110,13 @@ def build_dataset(config):
     """Returns a dataset object of :class:`torch.utils.data.Dataset` constructed
     from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.utils.data.Dataset`
-    :rtype: :class:`torch.utils.data.Dataset`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``DATASETS``.
+
+    Returns:
+        An object of :class:`torch.utils.data.Dataset`
     """
     return build_from_registry(config, DATASETS)
@@ -111,13 +125,14 @@ def build_optimizer(config, model):
     """Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`,
     'model' and 'params'.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param model: A model containing parameters for the optimizer
-    :type model: :class:`nn.Module`
-    :return: An object of :class:`torch.optim.Optimizer`
-    :rtype: :class:`torch.optim.Optimizer`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``OPTIMIZERS``.
+        model (:class:`nn.Module`): A model containing parameters for the optimizer
+
+    Returns:
+        An object of :class:`torch.optim.Optimizer`
     """
     config_ = config.copy()
     config_['params'] = model.parameters()
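``build_optimizer`` follows the same pattern, injecting ``model.parameters()`` as ``params``; a hedged example (``Adam`` is assumed to be a name registered in ``OPTIMIZERS``):

    import torch.nn as nn
    from colossalai.builder import build_optimizer

    model = nn.Linear(32, 32)
    optimizer = build_optimizer(dict(type='Adam', lr=1e-3, weight_decay=0.0), model)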
@@ -128,15 +143,15 @@ def build_gradient_handler(config, model, optimizer):
     """Returns a gradient handler object of :class:`BaseGradientHandler` constructed from `config`,
     `model` and `optimizer`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param model: A model containing parameters for the gradient handler
-    :type model: :class:`nn.Module`
-    :param optimizer: An optimizer object containing parameters for the gradient handler
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :return: An object of :class:`colossalai.engine.BaseGradientHandler`
-    :rtype: :class:`colossalai.engine.BaseGradientHandler`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``GRADIENT_HANDLER``.
+        model (:class:`nn.Module`): A model containing parameters for the gradient handler
+        optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing parameters for the gradient handler
+
+    Returns:
+        An object of :class:`colossalai.engine.BaseGradientHandler`
     """
     config_ = config.copy()
     config_['model'] = model

@@ -147,13 +162,13 @@ def build_gradient_handler(config, model, optimizer):
 def build_hooks(config, trainer):
     """Returns a hook object of :class:`BaseHook` constructed from `config` and `trainer`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param trainer: A :class:`Trainer` object containing parameters for the hook
-    :type trainer: :class:`Trainer`
-    :return: An object of :class:`colossalai.trainer.hooks.BaseHook`
-    :rtype: :class:`colossalai.trainer.hooks.BaseHook`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``HOOKS``.
+
+    Returns:
+        An object of :class:`colossalai.trainer.hooks.BaseHook`
     """
     config_ = config.copy()
     config_['trainer'] = trainer

@@ -163,11 +178,13 @@ def build_hooks(config, trainer):
 def build_ophooks(config):
     """Returns a hook object of :class:`BaseOpHook` constructed from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`colossalai.trainer.hooks.BaseOpHook`
-    :rtype: :class:`colossalai.trainer.hooks.BaseOpHook`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``OPHOOKS``.
+
+    Returns:
+        An object of :class:`colossalai.trainer.hooks.BaseOpHook`
     """
     config_ = config.copy()
     return build_from_registry(config_, OPHOOKS)

@@ -177,11 +194,13 @@ def build_transform(config):
     """Returns a transformation object of :class:`torchvision.transforms` constructed
     from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torchvision.transforms`
-    :rtype: :class:`torchvision.transforms`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``TRANSFORMS``.
+
+    Returns:
+        An object of :class:`torchvision.transforms`
     """
     return build_from_registry(config, TRANSFORMS)

@@ -190,14 +209,15 @@ def build_data_sampler(config, dataset):
     """Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler`
     constructed from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param dataset: An object of :class:`torch.utils.data.Dataset` containing information
-        used in the construction of the return object
-    :type dataset: :class:`torch.utils.data.Dataset`
-    :return: An object of :class:`colossalai.utils.data_sampler.BaseSampler`
-    :rtype: :class:`colossalai.utils.data_sampler.BaseSampler`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``DATA_SAMPLERS``.
+        dataset (:class:`torch.utils.data.Dataset`): An object of
+            :class:`torch.utils.data.Dataset` containing information
+            used in the construction of the return object
+
+    Returns:
+        An object of :class:`colossalai.utils.data_sampler.BaseSampler`
     """
     config_ = config.copy()
     config_['dataset'] = dataset

@@ -208,14 +228,15 @@ def build_lr_scheduler(config, optimizer):
     """Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
     constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param optimizer: An optimizer object containing parameters for the learning rate
-        scheduler
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :return: An object of :class:`torch.optim.lr_scheduler`
-    :rtype: :class:`torch.optim.lr_scheduler`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``lr_schedule``.
+        optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing
+            parameters for the learning rate scheduler.
+
+    Returns:
+        An object of :class:`torch.optim.lr_scheduler`
     """
     config_ = config.copy()
     config_['optimizer'] = optimizer

@@ -225,10 +246,12 @@ def build_schedule(config):
     """Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`colossalai.engine.schedule.BaseSchedule`
-    :rtype: :class:`colossalai.engine.schedule.BaseSchedule`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``Schedule``.
+
+    Returns:
+        An object of :class:`colossalai.engine.schedule.BaseSchedule`
     """
     return build_from_registry(config, SCHEDULE)
@@ -13,14 +13,13 @@ def _binary_partition(weights, st, ed):
     """Returns the binary partition position of `weights`, given the start
     position `st` and the end position `ed`.

-    :param weights: A python list to be binary partitioned
-    :type weights: list
-    :param st: the start position of the binary partition
-    :type st: int
-    :param ed: the end postition of the binary partition
-    :type ed: int
-    :return: the binary partition position of `weights`
-    :rtype: int
+    Args:
+        weights (list): A python list to be binary partitioned
+        st (int): the start position of the binary partition
+        ed (int): the end position of the binary partition
+
+    Returns:
+        int: the binary partition position of `weights`
     """
     w_sum = weights[ed - 1]
     prefix = 0

@@ -176,16 +175,13 @@ def build_pipeline_model_from_cfg(config, num_chunks: int = 1, partition_method:
             ...
         )

-    :param config: Configuration of the model
-    :type config: dict
-    :param num_chunks: The number of chunks you want to have on the current stage. This value should be 1
-        in most cases unless you are using virutal pipeline parallelism.
-    :type num_chunks: int, optional
-    :param partition_method: This parameter determines how you want to split your model layers into stages,
-        you can set it as 'layer' or 'parameter'
-    :type partition_method: str, optional
-    :param verbose: Whether to print the logs
-    :type verbose: bool, optional
+    Args:
+        config (dict): Configuration of the model.
+        num_chunks (int, optional): The number of chunks you want to have on the current stage.
+            This value should be 1 in most cases unless you are using virtual pipeline parallelism.
+        partition_method (str, optional): This parameter determines how you want to split your model
+            layers into stages, you can set it as 'layer' or 'parameter'.
+        verbose (bool, optional): Whether to print the logs.
     """
     ori_model = build_model(config)
     layers = ori_model.layers_cfg

@@ -240,13 +236,11 @@ def build_pipeline_model(layers: nn.Sequential, num_chunks: int = 1, verbose: bo
     """An intializer to split the model into different stages for pipeline parallelism.
     Note that `layer` must be `torch.nn.Sequential`.

-    :param layers: Layers of model
-    :type layers: `torch.nn.Sequential`
-    :param num_chunks: The number of chunks you want to have on the current stage. This value should be 1
-        in most cases unless you are using virutal pipeline parallelism.
-    :type num_chunks: int, optional
-    :param verbose: Whether to print the logs
-    :type verbose: bool, optional
+    Args:
+        layers (`torch.nn.Sequential`): Layers of model
+        num_chunks (int, optional): The number of chunks you want to have on the current stage. This value should be 1
+            in most cases unless you are using virtual pipeline parallelism.
+        verbose (bool, optional): Whether to print the logs.
     """
     pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
     pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
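A minimal sketch of the second initializer above (hedged: it assumes the pipeline parallel context is already initialized and that the helper is re-exported from ``colossalai.builder``; the layers are illustrative):

    import torch.nn as nn
    from colossalai.builder import build_pipeline_model

    # Keep only the slice of the sequential model owned by this pipeline stage.
    layers = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 10))
    model = build_pipeline_model(layers, num_chunks=1, verbose=True)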
@@ -12,21 +12,22 @@ from colossalai.utils import get_current_device


 def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op: bool = False) -> Tensor:
-    """Gathers all tensors from the parallel group and concatenates them in a
+    r"""Gathers all tensors from the parallel group and concatenates them in a
     specific dimension.

-    :param tensor: Tensor to be gathered
-    :param dim: The dimension concatenating in
-    :param parallel_mode: Parallel group mode used in this communication
-    :param async_op: Whether operations are asynchronous
-
-    :type tensor: :class:`torch.Tensor`
-    :type dim: int
-    :type parallel_mode: :class:`colossalai.context.ParallelMode`
-    :type async_op: bool, optional
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

-    :return: The tensor generated by all-gather
-    :rtype: :class:`torch.Tensor`
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be gathered.
+        dim (int): The dimension concatenating in.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-gather only,
+        if async_op is set to False. A tuple of output of all-gather and Async work handle, if async_op is set to True.
     """
     depth = gpc.get_world_size(parallel_mode)
     if depth == 1:
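A usage sketch for ``all_gather`` (hedged: it assumes the process groups were created by ``colossalai.launch`` and that the op is re-exported from ``colossalai.communication``):

    import torch
    from colossalai.communication import all_gather
    from colossalai.context import ParallelMode
    from colossalai.core import global_context as gpc
    from colossalai.utils import get_current_device

    # Gather a local tensor along dim 0 from every rank in the data-parallel group.
    x = torch.randn(4, 8, device=get_current_device())
    gathered = all_gather(x, dim=0, parallel_mode=ParallelMode.DATA)
    # gathered.size(0) == 4 * gpc.get_world_size(ParallelMode.DATA)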
@@ -54,23 +55,26 @@ def reduce_scatter(tensor: Tensor,
                    parallel_mode: ParallelMode,
                    op: ReduceOp = ReduceOp.SUM,
                    async_op: bool = False) -> Tensor:
-    """Reduces all tensors then scatters it in a specific dimension to all
+    r"""Reduces all tensors then scatters it in a specific dimension to all
     members in the parallel group.

-    :param tensor: Tensor to be reduced and scattered
-    :param dim: The dimension scattering in
-    :param parallel_mode: Parallel group mode used in this communication
-    :param op: The type of reduce operation
-    :param async_op: Whether operations are asynchronous
-
-    :type tensor: :class:`torch.Tensor`
-    :type dim: int
-    :type parallel_mode: :class:`colossalai.context.ParallelMode`
-    :type op: ReduceOp, optional
-    :type async_op: bool, optional
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

-    :return: The tensor generated by reduce-scatter
-    :rtype: :class:`Tensor`
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be reduce_scattered.
+        dim (int): The dimension scattering in.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
+            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
+            For more details about ReduceOp, please refer to
+            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce_scatter only,
+        if async_op is set to False. A tuple of output of reduce_scatter and Async work handle, if async_op is set to True.
     """
     depth = gpc.get_world_size(parallel_mode)
     if depth == 1:
@@ -94,6 +98,25 @@ def all_reduce(tensor: Tensor,
                parallel_mode: ParallelMode,
                op: ReduceOp = ReduceOp.SUM,
                async_op: bool = False) -> Tensor:
+    r"""Reduces the tensor data across whole parallel group in such a way that all get the final result.
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be all-reduced.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
+            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
+            For more details about ReduceOp, please refer to
+            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-reduce only,
+        if async_op is set to False. A tuple of output of all-reduce and Async work handle, if async_op is set to True.
+    """
     depth = gpc.get_world_size(parallel_mode)
     if depth == 1:
         out = tensor

@@ -108,6 +131,23 @@ def all_reduce(tensor: Tensor,


 def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: bool = False):
+    r"""Broadcast tensors to whole parallel group. Tensor must have the same
+    number of elements in all processes participating in the collective.
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be broadcast.
+        src (int): Source rank.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The broadcast tensor only,
+        if async_op is set to False. A tuple of the broadcast tensor and Async work handle, if async_op is set to True.
+    """
     depth = gpc.get_world_size(parallel_mode)
     if depth == 1:
         out = tensor

@@ -122,6 +162,23 @@ def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: b


 def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp = ReduceOp.SUM, async_op: bool = False):
+    r"""Reduce tensors across whole parallel group. Only the process with
+    rank ``dst`` is going to receive the final result.
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be reduced.
+        dst (int): Destination rank.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce only,
+        if async_op is set to False. A tuple of output of reduce and Async work handle, if async_op is set to True.
+    """
     depth = gpc.get_world_size(parallel_mode)
     if depth == 1:
         out = tensor
@@ -19,12 +19,12 @@ TensorShape = Union[torch.Size, List[int], Tuple[int]]
 def _get_tensor_shape(tensor_shape: TensorShape, chunk_tensor: bool = False) -> Tuple[TensorShape, bool]:
     """get the exact tensor shape when communicating and return whether the tensor is a chunk

-    :param tensor_shape: shape of tensor
-    :type tensor_shape: TensorShape
-    :param chunk_tensor: whether to chunk tensor, defaults to False
-    :type chunk_tensor: bool, optional
-    :return: exact tensor shape, whether to chunk tensor
-    :rtype: Tuple[Union[torch.Size, List[int], Tuple[int]], bool]
+    Args:
+        tensor_shape (:class:`torch.Size`): shape of tensor
+        chunk_tensor (bool, optional): whether to chunk tensor, defaults to False
+
+    Returns:
+        Tuple[Union[torch.Size, List[int], Tuple[int]], bool]: exact tensor shape, whether to chunk tensor
     """
     if chunk_tensor:
         tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
@@ -134,14 +134,14 @@ def _communicate(tensor_send_next=None,


 def recv_forward(input_tensor_shape, prev_rank=None, dtype=torch.float, scatter_gather_tensors=False):
-    """Receives the input tensor from the previous member in pipeline.
+    """Copy the forward output from the previous stage in pipeline as the input tensor of this stage.

-    :param input_tensor_shape: The shape of the tensor to be recieved
-    :param prev_rank: The rank of the source of the tensor
-    :type input_tensor_shape: torch.Size
-    :type prev_rank: int, optional
-    :return: The input tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.
+        prev_rank (int, optional): The rank of the source of the tensor.
+
+    Returns:
+        :class:`torch.Tensor`: The input tensor.
     """
     if gpc.is_pipeline_first_stage():
         input_tensor = None

@@ -155,14 +155,14 @@ def recv_forward(input_tensor_shape, prev_rank=None, dtype=torch.float, scatter_


 def recv_backward(output_grad_shape, next_rank=None, dtype=torch.float, scatter_gather_tensors=False):
-    """Receives the grad tensor from the next member in pipeline.
+    """Copy the gradient tensor from the next stage in pipeline as the input gradient of this stage.

-    :param output_grad_shape: The shape of the tensor to be recieved
-    :param next_rank: The rank of the source of the tensor
-    :type output_grad_shape: torch.Size
-    :type next_rank: int, optional
-    :return: The grad of output tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.
+        next_rank (int, optional): The rank of the source of the tensor.
+
+    Returns:
+        :class:`torch.Tensor`: The input gradient tensor.
     """
     if gpc.is_pipeline_last_stage():
         output_tensor_grad = None

@@ -176,12 +176,11 @@ def recv_backward(output_grad_shape, next_rank=None, dtype=torch.float, scatter_


 def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False):
-    """Sends the input tensor to the next member in pipeline.
+    """Sends the input tensor to the next stage in pipeline.

-    :param output_tensor: Tensor to be sent
-    :param next_rank: The rank of the recipient of the tensor
-    :type output_tensor: :class:`torch.Tensor`
-    :type next_rank: int, optional
+    Args:
+        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
+        next_rank (int, optional): The rank of the recipient of the tensor.
     """
     if not gpc.is_pipeline_last_stage():
         _communicate(tensor_send_next=output_tensor,

@@ -190,12 +189,11 @@ def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False):


 def send_backward(input_tensor_grad, prev_rank=None, scatter_gather_tensors=False):
-    """Sends the grad tensor to the previous member in pipeline.
+    """Sends the gradient tensor to the previous stage in pipeline.

-    :param input_tensor_grad: Tensor to be sent
-    :param prev_rank: The rank of the recipient of the tensor
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type prev_rank: int, optional
+    Args:
+        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent
+        prev_rank (int, optional): The rank of the recipient of the tensor
     """
     if not gpc.is_pipeline_first_stage():
         _communicate(tensor_send_prev=input_tensor_grad,
@@ -210,15 +208,15 @@ def send_forward_recv_backward(output_tensor,
                                dtype=torch.float,
                                scatter_gather_tensors=False):
     """Batched communication operation. Sends the input tensor to the
-    next member in pipeline, while recieves the grad tensor from the
-    next member in pipeline.
+    next stage in pipeline, while receives the gradient tensor from the
+    next stage in pipeline as the input gradient tensor of this stage.

-    :param output_tensor: Tensor to be sent
-    :param output_grad_shape: The shape of the tensor to be recieved
-    :type output_tensor: :class:`torch.Tensor`
-    :type output_grad_shape: :class:`torch.Size`
-    :return: The grad of output tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
+        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.
+
+    Returns:
+        :class:`torch.Tensor`: The input gradient tensor.
     """
     if gpc.is_pipeline_last_stage():
         output_tensor_grad = None

@@ -238,16 +236,16 @@ def send_backward_recv_forward(input_tensor_grad,
                                prev_rank=None,
                                dtype=torch.float,
                                scatter_gather_tensors=False):
-    """Batched communication operation. Sends the grad tensor to the
-    previous member in pipeline, while recieves the input tensor from the
-    previous member in pipeline.
+    """Batched communication operation. Sends the gradient tensor to the
+    previous stage in pipeline, while receives the output tensor from the
+    previous stage in pipeline as the input of this stage.

-    :param input_tensor_grad: Tensor to be sent
-    :param input_tensor_shape: The shape of the tensor to be recieved
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type input_tensor_shape: :class:`torch.Size`
-    :return: The input tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent.
+        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.
+
+    Returns:
+        :class:`torch.Tensor`: The input tensor.
     """
     if gpc.is_pipeline_first_stage():
         input_tensor = None

@@ -269,15 +267,15 @@ def send_forward_recv_forward(output_tensor,
                               dtype=torch.float,
                               scatter_gather_tensors=False):
     """Batched communication operation. Sends the input tensor to the
-    next member in pipeline, while recieves the input tensor from the
-    previous member in pipeline.
+    next stage in pipeline, while receives the output tensor from the
+    previous stage in pipeline as the input of this stage.

-    :param output_tensor: Tensor to be sent
-    :param input_tensor_shape: The shape of the tensor to be recieved
-    :type output_tensor: :class:`torch.Tensor`
-    :type input_tensor_shape: :class:`torch.Size`
-    :return: The input tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
+        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.
+
+    Returns:
+        :class:`torch.Tensor`: The input tensor.
     """
     input_tensor, _ = _communicate(tensor_send_next=output_tensor,
                                    recv_prev=recv_prev,

@@ -296,16 +294,16 @@ def send_backward_recv_backward(input_tensor_grad,
                                 next_rank=None,
                                 dtype=torch.float,
                                 scatter_gather_tensors=False):
-    """Batched communication operation. Sends the grad tensor to the
-    previous member in pipeline, while recieves the grad tensor from the
-    next member in pipeline.
+    """Batched communication operation. Sends the gradient tensor to the
+    previous stage in pipeline, while receives the gradient tensor from the
+    next stage in pipeline as the input of this stage.

-    :param input_tensor_grad: Tensor to be sent
-    :param output_grad_shape: The shape of the tensor to be recieved
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type output_grad_shape: :class:`torch.Size`
-    :return: The grad of output tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent.
+        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.
+
+    Returns:
+        :class:`torch.Tensor`: The input gradient tensor.
     """
     _, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad,
                                          recv_next=recv_next,

@@ -327,20 +325,18 @@ def send_forward_backward_recv_forward_backward(output_tensor,
                                                 next_rank=None,
                                                 dtype=torch.float,
                                                 scatter_gather_tensors=False):
-    """Batched communication operation. Sends the input tensor to the next and
-    the grad tensor to the previous, while recieves the grad tensor from the
-    next and the input tensor from the previous.
+    """Batched communication operation. Sends the input tensor to the next stage in pipeline and
+    the gradient tensor to the previous stage, while receives the input gradient tensor from the
+    next stage and the input tensor from the previous stage.

-    :param output_tensor: Tensor sent to the next
-    :param input_tensor_grad: Tensor sent to the previous
-    :param input_tensor_shape: The shape of the tensor recieved from the previous
-    :param output_grad_shape: The shape of the tensor recieved from the next
-    :type output_tensor: :class:`torch.Tensor`
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type input_tensor_shape: :class:`torch.Size`
-    :type output_grad_shape: :class:`torch.Size`
-    :return: (the input tensor in forward step, the grad of output tensor in forward step)
-    :rtype: (Tensor, Tensor)
+    Args:
+        output_tensor (:class:`torch.Tensor`): Tensor sent to the next.
+        input_tensor_grad (:class:`torch.Tensor`): Tensor sent to the previous.
+        input_tensor_shape (:class:`torch.Size`): The shape of the tensor received from the previous.
+        output_grad_shape (:class:`torch.Size`): The shape of the tensor received from the next.
+
+    Returns:
+        Tuple(Tensor, Tensor): (the input tensor, the input gradient tensor)
     """
     input_tensor, output_tensor_grad = _communicate(
         tensor_send_next=output_tensor,
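Taken together, these helpers implement the per-stage exchange of a pipeline schedule; a hedged sketch of one forward/backward round on an intermediate stage (shapes are illustrative tuples, which ``TensorShape`` permits, and ``stage_module`` is a hypothetical per-stage module):

    from colossalai.communication import (recv_forward, send_forward,
                                          recv_backward, send_backward)

    input_tensor = recv_forward(input_tensor_shape=(16, 1024))
    input_tensor.requires_grad_()
    output_tensor = stage_module(input_tensor)
    send_forward(output_tensor)

    output_grad = recv_backward(output_grad_shape=(16, 1024))
    output_tensor.backward(output_grad)
    send_backward(input_tensor.grad)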
@@ -9,15 +9,19 @@ from colossalai.utils import get_current_device, synchronize


 def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):
-    """Sends a tensor to the next member and recieves a tensor from the previous member.
-    This function returns the recieved tensor from the previous member.
+    """Sends a tensor to the next member and receives a tensor from the previous member.
+    This function returns the received tensor from the previous member.

-    :param tensor_send_next: Tensor sent to next member
-    :param parallel_mode: Parallel group mode used in this communication
-    :type tensor_send_next: :class:`torch.Tensor`
-    :type parallel_mode: :class:`colossalai.context.ParallelMode`
-    :return: The tensor recieved from the previous
-    :rtype: :class:`torch.Tensor`
+    Args:
+        tensor_send_next: Tensor sent to next member
+        parallel_mode: Parallel group mode used in this communication
+
+    Returns:
+        :class:`torch.Tensor`: The tensor received from the previous.
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
     """
     buffer_shape = tensor_send_next.size()
@ -12,14 +12,13 @@ def send_tensor_meta(tensor, need_meta=True, next_rank=None):
|
||||
meta information of the tensor should be sent before communications. This function
|
||||
synchronizes with :func:`recv_tensor_meta`.
|
||||
|
||||
:param tensor: Tensor to be sent
|
||||
:param need_meta: If False, meta information won't be sent
|
||||
:param next_rank: The rank of the next member in pipeline parallel group
|
||||
:type tensor: Tensor
|
||||
:type need_meta: bool, optional
|
||||
:type next_rank: int
|
||||
:return: False
|
||||
:rtype: bool
|
||||
Args:
|
||||
tensor (torch.Tensor): Tensor to be sent.
|
||||
need_meta (bool, optional): If False, meta information won't be sent.
|
||||
next_rank (int): The rank of the next member in pipeline parallel group.
|
||||
|
||||
Returns:
|
||||
bool: False
|
||||
"""
|
||||
if need_meta:
|
||||
if next_rank is None:
|
||||
@ -36,17 +35,17 @@ def send_tensor_meta(tensor, need_meta=True, next_rank=None):


def recv_tensor_meta(tensor_shape, prev_rank=None):
"""Recieves tensor meta information before recieving a specific tensor.
"""Receives tensor meta information before receiving a specific tensor.
Since the recipient must know the shape of the tensor in p2p communications,
meta information of the tensor should be recieved before communications. This function
meta information of the tensor should be received before communications. This function
synchronizes with :func:`send_tensor_meta`.

:param tensor_shape: The shape of the tensor to be recieved
:param prev_rank: The rank of the source of the tensor
:type tensor_shape: torch.Size
:type prev_rank: int, optional
:return: The shape of the tensor to be recieved
:rtype: torch.Size
Args:
tensor_shape (torch.Size): The shape of the tensor to be received.
prev_rank (int): The rank of the source of the tensor.

Returns:
torch.Size: The shape of the tensor to be received.
"""
if tensor_shape is None:
if prev_rank is None:
@ -67,14 +66,12 @@ def recv_tensor_meta(tensor_shape, prev_rank=None):
def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
"""Break a tensor into equal 1D chunks.

:param tensor: Tensor to be splitted before communication
:param new_buffer: Whether uses a new buffer to store sliced tensor
Args:
tensor (torch.Tensor): Tensor to be split before communication.
new_buffer (bool, optional): Whether to use a new buffer to store sliced tensor.

:type tensor: torch.Tensor
:type new_buffer: bool, optional

:return splitted_tensor: The splitted tensor
:rtype splitted_tensor: torch.Tensor
Returns:
torch.Tensor: The split tensor
"""
partition_size = torch.numel(tensor) // gpc.get_world_size(ParallelMode.PARALLEL_1D)
start_index = partition_size * gpc.get_local_rank(ParallelMode.PARALLEL_1D)
@ -92,11 +89,10 @@ def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
def gather_split_1d_tensor(tensor):
"""Opposite of above function, gather values from model parallel ranks.

:param tensor: Tensor to be gathered after communication
:type tensor: torch.Tensor

:return gathered: The gathered tensor
:rtype gathered: torch.Tensor
Args:
tensor (torch.Tensor): Tensor to be gathered after communication.
Returns:
gathered (torch.Tensor): The gathered tensor
"""
world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
numel = torch.numel(tensor)

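An illustrative round trip (not part of this patch) of the split/gather pair documented above; it assumes a 1D tensor-parallel group has already been initialized and that both helpers live under ``colossalai.communication``:

```python
import torch
# assumed import location of the two helpers shown in the hunk above
from colossalai.communication import split_tensor_into_1d_equal_chunks, gather_split_1d_tensor

full = torch.randn(1024, device='cuda')          # identical tensor on every 1D-parallel rank
shard = split_tensor_into_1d_equal_chunks(full)  # each rank keeps only its own slice
restored = gather_split_1d_tensor(shard)         # all-gather the slices back into one buffer
assert restored.numel() == full.numel()
```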
@ -12,8 +12,8 @@ class Config(dict):
"""This is a wrapper class for dict objects so that values of which can be
accessed as attributes.

:param config: The dict object to be wrapped
:type config: dict
Args:
config (dict): The dict object to be wrapped.
"""

def __init__(self, config: dict = None):
@ -50,12 +50,14 @@ class Config(dict):
def from_file(filename: str):
"""Reads a python file and constructs a corresponding :class:`Config` object.

:param filename: Name of the file to construct the return object
:type filename: str
:raises AssertionError: Raises an AssertionError if the file does not exist, or the file
is not .py file
:return: A :class:`Config` object constructed with information in the file
:rtype: :class:`Config`
Args:
filename (str): Name of the file to construct the return object.

Returns:
:class:`Config`: A :class:`Config` object constructed with information in the file.

Raises:
AssertionError: Raises an AssertionError if the file does not exist, or the file is not .py file
"""

# check config path

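A small sketch (not from the patch) of the ``Config`` behaviour described above; the file name and the ``parallel`` field are hypothetical and depend on what the user's config file defines:

```python
from colossalai.context import Config

# The file must exist and be a .py file, otherwise an AssertionError is raised.
config = Config.from_file('./config.py')

# Values defined in the file become attribute-accessible, e.g. a `parallel` section.
print(config.parallel)
```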
@ -22,6 +22,10 @@ class ParallelContext(metaclass=SingletonMeta):
"""This class provides interface functions for users to get the parallel context,
such as the global rank, the local rank, the world size, etc. of each device.

Note:
The parallel_mode used in this class should be concluded in ``ParallelMode``.
More details about ``ParallelMode`` could be found in
`parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""

def __init__(self):
@ -62,10 +66,12 @@ class ParallelContext(metaclass=SingletonMeta):
def load_config(self, config: Union[dict, str]):
"""Loads the configuration from either a dict or a file.

:param config: Either a dict containing the configuration information or the filename
of a file containing the configuration information
:type config: dict or str
:raises TypeError: Raises a TypeError if `config` is neither a dict or a str
Args:
config (dict or str): Either a dict containing the configuration information or the filename
of a file containing the configuration information.

Raises:
TypeError: Raises a TypeError if `config` is neither a dict nor a str.
"""
if isinstance(config, str):
self._config = Config.from_file(config)
@ -81,20 +87,21 @@ class ParallelContext(metaclass=SingletonMeta):
def get_global_rank(self):
"""Returns the global rank of the current device.

:return: The global rank of the current device
:rtype: int
Returns:
int: The global rank of the current device
"""
return self._global_ranks[ParallelMode.GLOBAL]

def add_global_rank(self, parallel_mode: ParallelMode, rank: int):
"""Adds the global rank of the current device for `parallel_mode` to the context.

:param parallel_mode: The parallel mode for the rank
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param rank: The rank to be added
:type rank: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank.
rank (int): The rank to be added

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
"""
self._check_parallel_mode(parallel_mode)
self._global_ranks[parallel_mode] = rank
@ -102,12 +109,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_local_rank(self, parallel_mode: ParallelMode):
"""Returns the local rank of the current device.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
:return: The local rank of the current device for `parallel_mode`
:rtype: int
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.

Returns:
int: The local rank of the current device for `parallel_mode`.
"""
self._check_parallel_mode(parallel_mode)
return self._local_ranks[parallel_mode]
@ -115,12 +125,13 @@ class ParallelContext(metaclass=SingletonMeta):
def add_local_rank(self, parallel_mode: ParallelMode, rank: int):
"""Adds the local rank of the current device for `parallel_mode` to the context.

:param parallel_mode: The parallel mode for the rank
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param rank: The rank to be added
:type rank: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank.
rank (int): The rank to be added.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
"""
self._check_parallel_mode(parallel_mode)
self._local_ranks[parallel_mode] = rank
@ -128,12 +139,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_next_global_rank(self, parallel_mode: ParallelMode):
"""Returns the global rank of the next device.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
:return: The global rank of the next device for `parallel_mode`
:rtype: int
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.

Returns:
int: The global rank of the next device for `parallel_mode`.
"""
self._check_parallel_mode(parallel_mode)

@ -147,12 +161,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_prev_global_rank(self, parallel_mode: ParallelMode):
"""Returns the global rank of the previous device.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
:return: The global rank of the previous device for `parallel_mode`
:rtype: int
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.

Returns:
int: The global rank of the previous device for `parallel_mode`.
"""
self._check_parallel_mode(parallel_mode)

@ -167,13 +184,16 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
:return: a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`
:rtype: bool
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.

Returns:
bool: a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`.
"""
rank = self.get_local_rank(parallel_mode)
return rank == 0
@ -182,13 +202,16 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns a boolean value indicating whether the current device is the last one
among its group for `parallel_mode`.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
:return: a boolean value indicating whether the current device is the last one
among its group for `parallel_mode`
:rtype: bool
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.

Returns:
bool: a boolean value indicating whether the current device is the last one
among its group for `parallel_mode`.
"""
|
||||
rank = self.get_local_rank(parallel_mode)
|
||||
world_size = self.get_world_size(parallel_mode)
|
||||
@ -210,12 +233,15 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def get_world_size(self, parallel_mode: ParallelMode):
|
||||
"""Returns the world size for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
:return: The world size for `parallel_mode`
|
||||
:rtype: int
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
|
||||
Returns:
|
||||
int: The world size for `parallel_mode`.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
return self._world_sizes[parallel_mode]
|
||||
@ -223,12 +249,13 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def add_world_size(self, parallel_mode: ParallelMode, world_size: int):
|
||||
"""Adds world size for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:param world_size: The world size to be added
|
||||
:type world_size: int
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
world_size (int): The world size to be added
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
self._world_sizes[parallel_mode] = world_size
|
||||
@ -236,12 +263,15 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def get_group(self, parallel_mode: ParallelMode):
|
||||
"""Returns the group of the current device for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
:return: The group of the current device for `parallel_mode`
|
||||
:rtype: torch.distributed.ProcessGroup
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
|
||||
Returns:
|
||||
torch.distributed.ProcessGroup: The group of the current device for `parallel_mode`.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
return self._groups[parallel_mode]
|
||||
@ -249,12 +279,13 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def add_group(self, parallel_mode: ParallelMode, group: dist.ProcessGroup):
|
||||
"""Adds the group of the current device for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:param group: The group to be added
|
||||
:type group: torch.distributed.ProcessGroup
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
group (torch.distributed.ProcessGroup): The group to be added
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
self._groups[parallel_mode] = group
|
||||
@ -262,12 +293,15 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def get_ranks_in_group(self, parallel_mode: ParallelMode):
|
||||
"""Returns the rank of the current device for `parallel_mode` in the group.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
:return: the rank of the current device for `parallel_mode` in the group
|
||||
:rtype: int
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
|
||||
Returns:
|
||||
int: The rank of the current device for `parallel_mode` in the group.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
return self._ranks_in_group[parallel_mode]
|
||||
@ -275,28 +309,26 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def add_ranks_in_group(self, parallel_mode: ParallelMode, ranks: list):
|
||||
"""Adds the ranks of the current device for `parallel_mode` in the group.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:param ranks: List of ranks to be added
|
||||
:type ranks: list
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
ranks (list): List of ranks to be added
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
self._ranks_in_group[parallel_mode] = ranks
|
||||
|
||||
def init_global_dist(self, rank: int, world_size: int, backend: str, host: str, port: int):
|
||||
"""Initializes the global distributed environment
|
||||
:param rank: rank for the default process group
|
||||
:type rank: int
|
||||
:param world_size: world size of the default process group
|
||||
:type world_size: int
|
||||
:param host: the master address for distributed training
|
||||
:type host: str
|
||||
:param port: the master port for distributed training
|
||||
:type port: str
|
||||
:param backend: backend for torch.distributed
|
||||
:type backend: str
|
||||
|
||||
Args:
|
||||
rank (int): rank for the default process group.
|
||||
world_size (int): world size of the default process group.
|
||||
backend (str): backend for ``torch.distributed``
|
||||
host (str): the master address for distributed training.
|
||||
port (str): the master port for distributed training
|
||||
"""
|
||||
# initialize the default process group
|
||||
init_method = f'tcp://{host}:{port}'
|
||||
@ -315,8 +347,9 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def check_sanity(self):
|
||||
"""Checks sanity of the parallel context.
|
||||
|
||||
:raises AssertionError: Raises an AssertionError if the world size does not equal to the product
|
||||
of data paralle size, pipeline parallel size and tensor parallel size
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if the world size does not equal to the product
|
||||
of data parallel size, pipeline parallel size and tensor parallel size.
|
||||
"""
|
||||
dps = self.data_parallel_size
|
||||
pps = self.pipeline_parallel_size
|
||||
@ -341,7 +374,8 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def init_parallel_groups(self):
|
||||
"""Initializes the parallel groups.
|
||||
|
||||
:raises AssertionError: Raises an AssertionError if the field paralle is not present in the config file
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if the field parallel is not present in the config file.
|
||||
"""
|
||||
|
||||
# get rank and world size
|
||||
@ -411,11 +445,11 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
"""Returns a boolean value indicating whether `parallel_mode` is initialized
|
||||
in the current system.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:return: a boolean value indicating whether `parallel_mode` is initialized
|
||||
in the current system
|
||||
:rtype: bool
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Returns:
|
||||
bool: a boolean value indicating whether `parallel_mode` is initialized in the current system.
|
||||
"""
|
||||
return parallel_mode in self._groups
|
||||
|
||||
@ -432,8 +466,8 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def set_device(self, device_ordinal: int = None):
|
||||
"""Sets distributed processes to be bound to devices.
|
||||
|
||||
:param device_ordinal: the device id to be bound to
|
||||
:type device_ordinal: int, optional
|
||||
Args:
|
||||
device_ordinal (int, optional): the device id to be bound to
|
||||
"""
|
||||
global_rank = self.get_global_rank()
|
||||
if device_ordinal is None:
|
||||
@ -447,8 +481,8 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def set_seed(self, seed: int):
|
||||
"""Sets seeds for all random libraries.
|
||||
|
||||
:param seed: seed for random states
|
||||
:type seed: int
|
||||
Args:
|
||||
seed (int): seed for random states
|
||||
"""
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
|
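An illustrative sketch (not part of this patch) of querying the global context after initialization; ``gpc`` is the singleton ``ParallelContext`` exposed by the library, and the methods used are the ones documented in the hunks above:

```python
from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode

# Assumes colossalai.launch (or an equivalent entry point) has already set up the groups.
if gpc.is_initialized(ParallelMode.DATA):
    rank = gpc.get_local_rank(ParallelMode.DATA)
    world_size = gpc.get_world_size(ParallelMode.DATA)
    if gpc.is_first_rank(ParallelMode.DATA):
        print(f'data parallel group has {world_size} ranks; this process is rank {rank}')
```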
@ -11,8 +11,16 @@ from .process_group_initializer import ProcessGroupInitializer

@DIST_GROUP_INITIALIZER.register_module
class Initializer_1D(ProcessGroupInitializer):
'''A ProcessGroupInitializer for 1d tensor parallelism.
'''
"""A ProcessGroupInitializer for 1d tensor parallelism.

Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -20,8 +28,10 @@ class Initializer_1D(ProcessGroupInitializer):

def init_dist_group(self):
"""Initialize 1D tensor parallel groups, and assign local_ranks and groups to each gpu.
:return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
:rtype: Tuple

Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
1D tensor parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None

@ -22,12 +22,16 @@ def _check_summa_env_var(summa_dim):

class Initializer_2D_Row(ProcessGroupInitializer):
"""2d tensor parallel initialization among rows.
:param num_group: The number of all tensor groups
:param summa_dim: The dimension of SUMMA
:param args: Args used to initialize base class
:param kwargs: Kwargs used to initialize base class
:type num_group: int
:type summa_dim: int

Args:
num_group (int): The number of all tensor groups.
summa_dim (int): The dimension of SUMMA.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, num_group, summa_dim, *args, **kwargs):
@ -37,9 +41,9 @@ class Initializer_2D_Row(ProcessGroupInitializer):

def init_dist_group(self):
"""Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu.

:return: 2D tensor row parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2D tensor row parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -64,13 +68,15 @@ class Initializer_2D_Row(ProcessGroupInitializer):
class Initializer_2D_Col(ProcessGroupInitializer):
"""2d tensor parallel initialization among cols.

:param num_group: The number of all tensor groups
:param summa_dim: The dimension of SUMMA
:param args: Args used to initialize base class
:param kwargs: Kwargs used to initialize base class

:type num_group: int
:type summa_dim: int
Args:
num_group (int): The number of all tensor groups.
summa_dim (int): The dimension of SUMMA.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, num_group, summa_dim, *args, **kwargs):
@ -81,8 +87,9 @@ class Initializer_2D_Col(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu.

:return: 2D tensor col parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2D tensor col parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -109,8 +116,13 @@ class Initializer_2D(ProcessGroupInitializer):
"""
Serve as the single entry point to 2D parallel initialization.

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, *args, **kwargs):
@ -127,8 +139,10 @@ class Initializer_2D(ProcessGroupInitializer):

def init_dist_group(self):
"""Initialize 2D tensor row and col parallel groups, and assign local_ranks and groups to each gpu.
:return: 2D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)

Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
2D tensor parallelism's information in a list of tuples.
"""
parallel_setting = [self.row_initializer.init_dist_group(), self.col_initializer.init_dist_group()]
return parallel_setting

@ -31,14 +31,17 @@ def _check_tesseract_env_var(tesseract_dim: int, tesseract_dep: int):

# i row j col k dep
class Initializer_2p5D_ROW(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among rows.
"""2.5d tensor parallel initialization among rows.

:param tesseract_dim: The dimension of tesseract
:param tesseract_dep: The dimension of depth
:param args: Args used to initialize base class

:type tesseract_dim: int
:type tesseract_dep: int
Args:
tesseract_dim (int): The dimension of tesseract.
tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -50,10 +53,11 @@ class Initializer_2p5D_ROW(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

def init_dist_group(self):
"""Initialize 2p5D tensor row parallel groups, and assign local_ranks and groups to each gpu.
"""Initialize 2.5D tensor row parallel groups, and assign local_ranks and groups to each gpu.

:return: 2p5D tensor row parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor row parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -80,14 +84,17 @@ class Initializer_2p5D_ROW(ProcessGroupInitializer):


class Initializer_2p5D_Col(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among cols.
"""2.5d tensor parallel initialization among cols.

:param tesseract_dim: The dimension of tesseract
:param tesseract_dep: The dimension of depth
:param args: Args used to initialize base class

:type tesseract_dim: int
:type tesseract_dep: int
Args:
tesseract_dim (int): The dimension of tesseract.
tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -99,10 +106,11 @@ class Initializer_2p5D_Col(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

def init_dist_group(self):
"""Initialize 2p5D tensor col parallel groups, and assign local_ranks and groups to each gpu.
"""Initialize 2.5D tensor col parallel groups, and assign local_ranks and groups to each gpu.

:return: 2p5D tensor col parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor col parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -129,14 +137,17 @@ class Initializer_2p5D_Col(ProcessGroupInitializer):


class Initializer_2p5D_Dep(ProcessGroupInitializer):
"""2p5D tensor parallel initialization among depths.
"""2.5D tensor parallel initialization among depths.

:param tesseract_dim: The dimension of tesseract
:param tesseract_dep: The dimension of depth
:param args: Args used to initialize base class

:type tesseract_dim: int
:type tesseract_dep: int
Args:
tesseract_dim (int): The dimension of tesseract.
tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -148,10 +159,11 @@ class Initializer_2p5D_Dep(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

def init_dist_group(self):
"""Initialize 2p5D tensor depth parallel groups, and assign local_ranks and groups to each gpu.
"""Initialize 2.5D tensor depth parallel groups, and assign local_ranks and groups to each gpu.

:return: 2p5D tensor depth parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor depth parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -179,14 +191,17 @@ class Initializer_2p5D_Dep(ProcessGroupInitializer):

# i row j col k dep
class Initializer_2p5D_XZ(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among cols times dep.
"""2.5d tensor parallel initialization among cols times dep.

:param tesseract_dim: The dimension of tesseract
:param tesseract_dep: The dimension of depth
:param args: Args used to initialize base class

:type tesseract_dim: int
:type tesseract_dep: int
Args:
tesseract_dim (int): The dimension of tesseract.
tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -198,10 +213,11 @@ class Initializer_2p5D_XZ(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

def init_dist_group(self):
"""Initialize 2p5D tensor colXdepth parallel groups, and assign local_ranks and groups to each gpu.
"""Initialize 2.5D tensor colXdepth parallel groups, and assign local_ranks and groups to each gpu.

:return: 2p5D tensor colXdepth parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor colXdepth parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -232,20 +248,14 @@ class Initializer_2p5D(ProcessGroupInitializer):
"""
Serve as the single entry point to Tesseract parallel initialization.

:param rank: The rank of current process
:param world_size: Size of whole communication world
:param config: Running configuration
:param data_parallel_size: Size of data parallel
:param pipeline_parallel_size: Size of pipeline parallel
:param tensor_parallel_size: Size of tensor parallel
:param depth: The depth of 2p5d parallel
:type rank: int
:type world_size: int
:type config: Config
:type data_parallel_size: int
:type pipeline_parallel_size: int
:type tensor_parallel_size: int
:type depth: int
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
depth (int): The depth of 2.5d parallel.
"""

def __init__(self, rank: int, world_size: int, config: Config, data_parallel_size: int, pipeline_parallel_size: int,
@ -266,9 +276,11 @@ class Initializer_2p5D(ProcessGroupInitializer):
self.xz_initializer = Initializer_2p5D_XZ(self.tesseract_dim, self.tesseract_dep, *args)

def init_dist_group(self):
"""Initialize 2p5D tensor row, col, depth, and colXdepth parallel groups, and assign local_ranks and groups to each gpu.
:return: Whole 2p5D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
"""Initialize 2.5D tensor row, col, depth, and colXdepth parallel groups, and assign local_ranks and groups to each gpu.

Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Whole 2.5D tensor parallelism's information in a list of tuples.
"""
parallel_setting = [
self.col_initializer.init_dist_group(),

@ -26,12 +26,15 @@ def _check_depth_env_var(depth):
class Initializer_3D_Input(ProcessGroupInitializer):
"""3D tensor parallel initialization among input.

:param num_group: The number of all tensor groups
:param depth: Depth of 3D parallelism
:param args: Args used in base class

:type num_group: int
:type depth: int
Args:
num_group (int): The number of all tensor groups.
depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, num_group: int, depth: int, *args):
@ -42,8 +45,9 @@ class Initializer_3D_Input(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize 3D tensor parallel groups among input, and assign local_ranks and groups to each gpu.

:return: 3D tensor parallelism's information among input
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among input in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -70,12 +74,15 @@ class Initializer_3D_Input(ProcessGroupInitializer):
class Initializer_3D_Weight(ProcessGroupInitializer):
"""3D tensor parallel initialization among weight.

:param num_group: The number of all tensor groups
:param depth: Depth of 3D parallelism
:param args: Args used in base class

:type num_group: int
:type depth: int
Args:
num_group (int): The number of all tensor groups.
depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, num_group: int, depth: int, *args):
@ -86,8 +93,9 @@ class Initializer_3D_Weight(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize 3D tensor parallel groups among weight, and assign local_ranks and groups to each gpu.

:return: 3D tensor parallelism's information among weight
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among weight in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -114,12 +122,15 @@ class Initializer_3D_Weight(ProcessGroupInitializer):
class Initializer_3D_Output(ProcessGroupInitializer):
"""3D tensor parallel initialization among output.

:param num_group: The number of all tensor groups
:param depth: Depth of 3D parallelism
:param args: Args used in base class

:type num_group: int
:type depth: int
Args:
num_group (int): The number of all tensor groups.
depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, num_group: int, depth: int, *args):
@ -130,8 +141,9 @@ class Initializer_3D_Output(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize 3D tensor parallel groups among output, and assign local_ranks and groups to each gpu.

:return: 3D tensor parallelism's information among output
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among output in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -158,7 +170,14 @@ class Initializer_3D_Output(ProcessGroupInitializer):
@DIST_GROUP_INITIALIZER.register_module
class Initializer_3D(ProcessGroupInitializer):
"""Serve as the single entry point to 3D parallel initialization.
:param args: Args used to initialize ProcessGroupInitializer

Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, *args):
@ -175,8 +194,10 @@ class Initializer_3D(ProcessGroupInitializer):

def init_dist_group(self):
"""Initialize 3D tensor parallel groups, and assign local_ranks and groups to each gpu.
:return: 3D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)

Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Whole 3D tensor parallelism's information in a list of tuples.
"""
parallel_setting = [
self.input_initializer.init_dist_group(),

@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Data(ProcessGroupInitializer):
"""A ProcessGroupInitializer for data parallelism.

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -22,8 +27,9 @@ class Initializer_Data(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize data parallel groups, and assign local_ranks and groups to each gpu.

:return: Data parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Data parallelism's information tuple.
"""
local_rank = None
ranks_in_group = None

@ -12,8 +12,13 @@ class Initializer_Model(ProcessGroupInitializer):
"""A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel
groups).

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, *args, **kwargs):
@ -24,8 +29,9 @@ class Initializer_Model(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize model parallel groups, and assign local_ranks and groups to each gpu.

:return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
:rtype: Tuple
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Model parallelism's information tuple.
"""
local_rank = None
ranks_in_group = None

@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Pipeline(ProcessGroupInitializer):
"""A ProcessGroupInitializer for pipeline parallelism.

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process
world_size (int): Size of whole communication world
config (Config): Running configuration
data_parallel_size (int): Size of data parallel
pipeline_parallel_size (int): Size of pipeline parallel
tensor_parallel_size (int): Size of tensor parallel
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -23,8 +28,9 @@ class Initializer_Pipeline(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu.

:return: Pipeline parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
A Pipeline parallelism's information in list of tuples.
"""
dist_settings = list()
for i in range(self.data_parallel_size):

@ -15,8 +15,13 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
In Sequence Parallelism, each GPU holds the full copy of model weights,
thus, gradient all-reduce occurs across all processes in the same pipeline stage

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process
world_size (int): Size of whole communication world
config (Config): Running configuration
data_parallel_size (int): Size of data parallel
pipeline_parallel_size (int): Size of pipeline parallel
tensor_parallel_size (int): Size of tensor parallel
"""

def __init__(self, *args, **kwargs):
@ -27,8 +32,8 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize Sequence Parallel process groups used for gradient all-reduce.

:return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
:rtype: Tuple
Returns:
Tuple: A tuple (local_rank, group_world_size, process_group, ranks_in_group, mode).
"""
local_rank = None
ranks_in_group = None
@ -52,8 +57,13 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
class Initializer_Sequence(ProcessGroupInitializer):
"""A ProcessGroupInitializer for sequence parallelism.

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""
def __init__(self,
*args, **kwargs):
@ -66,11 +76,12 @@ class Initializer_Sequence(ProcessGroupInitializer):
"""Initialize Sequence parallel process groups and assign local_ranks and groups to each gpu.

Sequence parallelism requires 2 process groups. The first is for model forward where several processes
exchange paritial query, key and value embedding to compute self attention values. The second is for
exchange partial query, key and value embedding to compute self attention values. The second is for
all-reduce to synchronize the model parameters.

:return: Sequence parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
A Sequence parallelism's information in list of tuples.
"""

parallel_setting = []

@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Tensor(ProcessGroupInitializer):
"""A ProcessGroupInitializer for tensor parallelism.

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -22,8 +27,9 @@ class Initializer_Tensor(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.

:return: Tensor parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Tensor parallelism's information tuple.
"""
local_rank = None
ranks_in_group = None

@ -9,19 +9,13 @@ from colossalai.context import Config
class ProcessGroupInitializer(ABC):
"""An object, knowing the parallelism configuration, that initializes parallel groups.

:param rank: The rank of current process
:param world_size: Size of whole communication world
:param config: Running configuration
:param data_parallel_size: Size of data parallel
:param pipeline_parallel_size: Size of pipeline parallel
:param tensor_parallel_size: Size of tensor parallel

:type rank: int
:type world_size: int
:type config: Config
:type data_parallel_size: int
:type pipeline_parallel_size: int
:type tensor_parallel_size: int
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""
def __init__(self,
rank: int,

@ -16,8 +16,8 @@ _SEED_MANAGER = SeedManager()
|
||||
def get_seeds():
|
||||
"""Returns the seeds of the seed manager.
|
||||
|
||||
:return: The seeds of the seed manager
|
||||
:rtype: dict
|
||||
Returns:
|
||||
dict: The seeds of the seed manager.
|
||||
"""
|
||||
return _SEED_MANAGER.seeds
|
||||
|
||||
@ -25,8 +25,8 @@ def get_seeds():
|
||||
def get_states(copy=False):
|
||||
"""Returns the seed states of the seed manager.
|
||||
|
||||
:return: The seed states of the seed manager
|
||||
:rtype: dict
|
||||
Returns:
|
||||
dict: The seed states of the seed manager.
|
||||
"""
|
||||
states = _SEED_MANAGER.seed_states
|
||||
|
||||
@ -43,8 +43,8 @@ def get_states(copy=False):
|
||||
def get_current_mode():
|
||||
"""Returns the current mode of the seed manager.
|
||||
|
||||
:return: The current mode of the seed manager.
|
||||
:rtype: :class:`torch.ByteTensor`
|
||||
Returns:
|
||||
:class:`torch.ByteTensor`: The current mode of the seed manager.
|
||||
"""
|
||||
return _SEED_MANAGER.current_mode
|
||||
|
||||
@ -52,12 +52,16 @@ def get_current_mode():
|
||||
def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
|
||||
"""Adds a seed to the seed manager for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:param seed: The seed to be added
|
||||
:type seed: int
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
|
||||
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
seed (int): The seed to be added
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
|
||||
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
_SEED_MANAGER.add_seed(parallel_mode, seed, overwrite)
|
||||
|
||||
@ -65,8 +69,12 @@ def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
|
||||
def set_mode(parallel_mode: ParallelMode):
|
||||
"""Sets the current mode of the seed manager.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
_SEED_MANAGER.set_mode(parallel_mode)
|
||||
|
||||
@ -74,11 +82,12 @@ def set_mode(parallel_mode: ParallelMode):
|
||||
def set_seed_states(parallel_mode: ParallelMode, state: Tensor):
|
||||
"""Sets the state of the seed manager for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:param state: the state to be set
|
||||
:type state: :class:`torch.Tensor`
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
state (:class:`torch.Tensor`): the state to be set.
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager.
|
||||
"""
|
||||
_SEED_MANAGER.set_state(parallel_mode, state)
|
||||
|
||||
@ -98,6 +107,9 @@ def seed(parallel_mode: ParallelMode):
|
||||
with seed(ParallelMode.DATA):
|
||||
output = F.dropout(input)
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
try:
|
||||
# set to new mode
|
||||
@ -125,6 +137,9 @@ def with_seed(func, parallel_mode: ParallelMode):
|
||||
wrapper_forward = with_seed(forward, ParallelMode.DATA)
|
||||
out = wrapped_forward(input)
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
|
||||
@functools.wraps(func)
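A minimal usage sketch of the seed helpers above (illustrative, not part of this diff); it assumes a CUDA device and the import path used by this module, colossalai.context.random:

import torch
import torch.nn.functional as F
from colossalai.context import ParallelMode
from colossalai.context.random import add_seed, seed, set_mode, get_seeds

# register separate RNG states for two parallel modes (seed values are illustrative)
add_seed(ParallelMode.DATA, 1024)
add_seed(ParallelMode.TENSOR, 1025)
set_mode(ParallelMode.DATA)

x = torch.rand(4, 8, device='cuda')
with seed(ParallelMode.TENSOR):
    # dropout inside the block draws from the TENSOR-mode RNG state,
    # and the DATA-mode state is restored on exit
    y = F.dropout(x, p=0.1, training=True)
print(get_seeds())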
@ -9,6 +9,10 @@ from colossalai.context.parallel_mode import ParallelMode

class SeedManager:
"""This class is a manager of all random seeds involved in the system.

Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""

def __init__(self):
@ -30,12 +34,12 @@ class SeedManager:

def set_state(self, parallel_mode: ParallelMode, state: Tensor):
"""Sets the state of the seed manager for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
state (:class:`torch.Tensor`): the state to be set.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param state: the state to be set
:type state: :class:`torch.Tensor`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager.
"""
assert parallel_mode in self._seed_states, f'Parallel mode {parallel_mode} is not found in the seed manager'
self._seed_states[parallel_mode] = state
@ -43,8 +47,8 @@ class SeedManager:
def set_mode(self, parallel_mode: ParallelMode):
"""Sets the current mode of the seed manager.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
"""
if self.current_mode:
# save the current state for current mode
@ -57,14 +61,14 @@ class SeedManager:
def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param seed: The seed to be added
:type seed: int
:param overwrtie: Whether allows to overwrite the seed that has been set already
:type overwrtie: bool, optional
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
seed (int): The seed to be added.
overwrtie (bool, optional): Whether to allow overwriting the seed that has been set already.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added.
"""
assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
if overwrtie is False:

@ -19,20 +19,37 @@ class Engine:
:meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.
It controls an iteration in training.

:param model: The neural network model
:type model: ``torch.nn.Module``
:param optimizer: Optimizer for updating the parameters
:type optimizer: ``torch.optim.Optimizer``
:param criterion: Loss function for calculating loss
:type criterion: ``torch.nn.modules.loss._Loss``, optional
:param gradient_handlers: A list of gradient handler used in backward
:type gradient_handlers: a list of ``BaseGradientHandler``, optional
:param clip_grad_norm: The norm of gradient clipping
:type clip_grad_norm: float, optional
:param ophook_list: List of ophook
:type ophook_list: list
:param verbose: whether to display log info
:type verbose: bool
Args:
model (``torch.nn.Module``): The neural network model.
optimizer (``torch.optim.Optimizer``): Optimizer for updating the parameters.
criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss.
gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handlers used in backward.
clip_grad_norm (float, optional): The norm of gradient clipping.
ophook_list (list): List of ophooks.
verbose (bool): whether to display log info.

Examples:
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
>>> model = ...
>>> criterion = ...
>>> optimizer = ...
>>> train_dataloader = ...
>>> engine, _, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> engine.train()
>>> for inputs, labels in train_dataloader:
>>>     # set gradients to zero
>>>     engine.zero_grad()
>>>     # run forward pass
>>>     outputs = engine(inputs)
>>>     # compute loss value and run backward pass
>>>     loss = engine.criterion(outputs, labels)
>>>     engine.backward(loss)
>>>     # update parameters
>>>     engine.step()

The example of using Engine in training could be found in
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_ and
`Run resnet cifar10 with engine <https://github.com/hpcaitech/ColossalAI-Examples/blob/main/image/resnet/run_resnet_cifar10_with_engine.py>`_.
"""

def __init__(self,
@ -113,10 +130,10 @@ class Engine:
return self.optimizer.step()

def backward(self, loss: Tensor):
"""Start backward propagation given the loss value computed by a loss function
"""Start backward propagation given the loss value computed by a loss function.

:param loss: Loss value computed by a loss function
:type loss: :class:`torch.Tensor`
Args:
loss (:class:`torch.Tensor`): Loss value computed by a loss function.
"""
ret = self.optimizer.backward(loss)
for ophook in self._ophook_list:
@ -124,34 +141,22 @@ class Engine:
return ret

def backward_by_grad(self, tensor, grad):
"""Start backward propagation given the gradient of the output tensor
"""Start backward propagation given the gradient of the output tensor.

:param tensor: Output tensor
:type tensor: :class:`torch.Tensor`
:param grad: Gradient passed back to the output
:type grad: :class:`torch.Tensor`
Args:
tensor (:class:`torch.Tensor`): Output tensor.
grad (:class:`torch.Tensor`): Gradient passed back to the output.
"""
ret = self.optimizer.backward_by_grad(tensor, grad)
for ophook in self._ophook_list:
ophook.post_iter()
return ret

def calc_loss(self, *args, **kwargs):
"""Compute the loss value

:param args: Args used in criterion function
:param kwargs: Kwargs used in criterion function

:return: The loss value
:rtype: :class:`torch.Tensor`
"""
return self.criterion(*args, **kwargs)

def __call__(self, *args, **kwargs):
"""Run the forward step for the model
"""Run the forward step for the model.

:return: Output the model
:rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`
Returns:
Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`: Output of the model.
"""
return self.model(*args, **kwargs)

@ -8,10 +8,9 @@ class BaseGradientHandler(ABC):
"""A basic helper class to handle all-reduce operations of gradients across different parallel groups
before optimization.

:param model: Model where the gradients accumulate
:param optimizer: Optimizer for updating the parameters
:type model: Module
:type optimizer: Optimizer
Args:
model (Module): Model where the gradients accumulate.
optimizer (Optimizer): Optimizer for updating the parameters.
"""
def __init__(self, model, optimizer):
self._model = model
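A sketch of a custom handler built on this base class (illustrative, not part of this diff); it assumes handle_gradient() is the abstract hook to implement and that the registry import paths match this repository's layout:

import torch.distributed as dist
from colossalai.registry import GRADIENT_HANDLER
from colossalai.engine.gradient_handler import BaseGradientHandler

@GRADIENT_HANDLER.register_module
class NaiveAllReduceGradientHandler(BaseGradientHandler):
    """Toy handler: all-reduce every gradient over the default process group."""

    def handle_gradient(self):
        for param in self._model.parameters():
            if param.grad is not None:
                dist.all_reduce(param.grad)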
@ -17,12 +17,11 @@ import math
class MemTracerOpHook(BaseOpHook):
"""
Collect GPU memory usage information
:param warmup: This parameter indicates how many iterations to truncate before profiling, defaults to 50
:type warmup: int
:param refreshrate: This parameter decides the frequency of write file, defaults to 10
:type refreshrate: int
:param data_prefix: The prefix of the stats data file, defaults to "memstats"
:type data_prefix: string

Args:
warmup (int): This parameter indicates how many iterations to truncate before profiling, defaults to 50.
refreshrate (int): This parameter decides how often the stats file is written, defaults to 10.
data_prefix (string): The prefix of the stats data file, defaults to "memstats".
"""

def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"):
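Constructing the hook is a one-liner; the import path below is an assumption based on this file's location, and the hook is then handed to the Engine through its ophook_list argument:

from colossalai.engine.ophooks import MemTracerOpHook

# start profiling after a 50-iteration warm-up and flush stats every 10 steps
mem_hook = MemTracerOpHook(warmup=50, refreshrate=10, data_prefix="memstats")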
@ -15,8 +15,12 @@ class BaseSchedule(ABC):
"""A basic helper class to control the process of training or evaluation.
It mainly consists of forward_backward_step for gradient backward and
optimizer_step for parameters update.
For the convenience to enable FP16, we aggreate all codes that contain the
For the convenience to enable FP16, we aggregate all codes that contain the
control of FP16 in class schedule.

Args:
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
and it will be executed in load_batch.
"""

def __init__(self, batch_data_process_func: Callable = None):
@ -46,13 +50,12 @@ class BaseSchedule(ABC):
"""Loads a batch from data iterator. It returns the data and labels which are
already on the same GPU as the model.

:param data_iter: Data iterator from which get a batch of data
:type data_iter: DataIter
:param to_gpu: Whether the data should be moved to GPU
:type to_gpu: bool, optional
Args:
data_iter (Iterable): Data iterator from which to get a batch of data, obtained by calling iter(dataloader).
to_gpu (bool, optional): Whether the data should be moved to GPU.

:return: (data, label)
:rtype: (:class:`Tensor`, :class:`torch.Tensor`)
Returns:
Tuple (:class:`Tensor`, :class:`torch.Tensor`): A tuple of (data, label).
"""
if data_iter is None:
raise RuntimeError('Dataloader is not defined.')
@ -87,16 +90,12 @@ class BaseSchedule(ABC):
):
"""The process function over a batch of dataset for training or evaluation.

:param engine: Colossalai training engine
:type engine: colossalai.engine.Engine
:param data_iter: Data iterator from which get a batch of data
:type data_iter: DataIter
:param forward_only: If True, the process won't include backward
:type forward_only: bool
:param return_loss: If False, the loss won't be returned
:type return_loss: bool, optional
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool, optional
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Data iterator from which to get a batch of data, obtained by calling iter(dataloader).
forward_only (bool): If True, the process won't include backward.
return_loss (bool, optional): If False, the loss won't be returned.
return_output_label (bool, optional): If False, the output and label won't be returned.
"""
pass

@ -15,6 +15,10 @@ class NonPipelineSchedule(BaseSchedule):
During one process, it loads a batch of dataset and feeds it to the model.
After getting the output and calculating the loss, it will use :meth:`step`
to update the parameters if it is in training mode.

Args:
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
and it will be executed in load_batch.
"""

def forward_backward_step(self,
@ -23,22 +27,19 @@ class NonPipelineSchedule(BaseSchedule):
forward_only: bool = False,
return_loss: bool = True,
return_output_label: bool = True):
"""The process function that loads loads a batch of dataset and feeds it to the model.
"""The process function that loads a batch of dataset and feeds it to the model.
The returned labels and loss will be None if :attr:`return_loss` is False.

:param engine: Model for training and inference
:param data_iter: Data iterator of the dataloader, e.g. iter(dataloader)
:param forward_only: If True, the model is run for the forward pass, else back propagation will be executed
:param return_loss: Loss will be returned if True
:param return_output_label: Output and label will be returned if True
:type engine: Iterator
:type data_iter: Iterator
:type forward_only: bool, optional
:type return_loss: bool, optional
:type return_output_label: bool, optional
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
If True, the model is run for the forward pass, else back propagation will be executed.
return_loss (bool, optional): Loss will be returned if True.
return_output_label (bool, optional): Output and label will be returned if True.

:return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
Returns:
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
"""
assert forward_only or return_loss, \
"The argument 'return_loss' has to be True when 'forward_only' is False, but got False."
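A hedged sketch of driving one training step through this schedule by hand (illustrative, not part of this diff; in normal use the Trainer or the engine loop does this). The engine and train_dataloader are assumed to come from colossalai.initialize:

from colossalai.engine.schedule import NonPipelineSchedule

schedule = NonPipelineSchedule()
data_iter = iter(train_dataloader)

engine.train()
engine.zero_grad()
output, label, loss = schedule.forward_backward_step(
    engine, data_iter, forward_only=False, return_loss=True)
engine.step()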
@ -41,14 +41,13 @@ class PipelineSchedule(BaseSchedule):
It uses non-interleaved 1F1B strategy. Other properties are similar to
:class:`NonPipelineSchedule`.

:param num_microbatches: The number of microbatches
:type num_microbatches: int
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch`
:type batch_data_process_func: Callable, optional
:param tensor_shape: Specified shape in pipeline communication
:type tensor_shape: torch.Size, optional
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization
:type scatter_gather_tensors: bool, optional
Args:
num_microbatches (int): The number of microbatches.
batch_data_process_func (Callable, optional):
The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
scatter_gather_tensors (bool, optional):
If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
"""

def __init__(self,
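An illustrative construction of the schedule described above (values are placeholders, not part of this diff; it assumes the pipeline-parallel context has already been initialized):

import torch
from colossalai.engine.schedule import PipelineSchedule

# split each global batch into 4 microbatches; a fixed tensor shape lets the
# stages skip shape negotiation during point-to-point communication
pipe_schedule = PipelineSchedule(num_microbatches=4,
                                 tensor_shape=torch.Size([16, 1024]),
                                 scatter_gather_tensors=True)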
@ -131,19 +130,14 @@ class PipelineSchedule:
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users.

:param engine: Your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: Input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param return_tensors: A list of tensors to return
:type return_tensors: List[:class:`torch.Tensor`]
:param return_output_label: Whether returns output labels
:type return_output_label: bool, optional
:param accum_loss: Where accumulated loss stores
:type accum_loss: optional

:return: output or the loss value of the current pipeline stage
:rtype: :class:`torch.Tensor`
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
return_output_label (bool, optional): Whether to return output labels.
accum_loss (optional): Where the accumulated loss is stored.
Returns:
:class:`torch.Tensor`: output or the loss value of the current pipeline stage.
"""
data, label = self.load_micro_batch()
output_tensor = self._call_engine(engine.model, input_tensor, data)
@ -173,17 +167,14 @@ class PipelineSchedule:
Returns the gradients with respect to the input tensor (None if first stage).
This is a helper function and can be ignored by users.

:param engine: your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param output_tensor: output tensor for this pipeline stage
:type output_tensor: :class:`torch.Tensor`
:param output_tensor_grad: gradient of output tensor for this pipeline stage
:type output_tensor_grad: :class:`torch.Tensor`
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
input_tensor (:class:`torch.Tensor`): input tensor for this pipeline stage.
output_tensor (:class:`torch.Tensor`): output tensor for this pipeline stage.
output_tensor_grad (:class:`torch.Tensor`): gradient of output tensor for this pipeline stage.

:return: gradient of input tensor
:rtype: :class:`torch.Tensor`
Returns:
:class:`torch.Tensor`: gradient of input tensor.
"""

# Retain the grad on the input_tensor.
@ -207,19 +198,16 @@ class PipelineSchedule:
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
Returns a tuple with losses if the last stage, an empty tuple otherwise.

:param engine: Your engine object
:type engine: colossalai.engine.Engine
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader)
:type data_iter: Iterable
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run.
:type forward_only: bool
:param return_loss: Whether returns the loss value. Default is true.
:type return_loss: bool
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether to run the forward step only. Default is false. If true, no backward will be run.
return_loss (bool, optional): Whether to return the loss value. Default is true.
return_output_label (bool, optional): If False, the output and label won't be returned.

:return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
Returns:
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
"""

assert forward_only or return_loss, \
@ -354,16 +342,14 @@ class InterleavedPipelineSchedule(PipelineSchedule):
It uses interleaved 1F1B strategy. Other properties are similar to
:class:`NonPipelineSchedule`.

:param num_microbatches: The number of microbatches
:type num_microbatches: int
:param num_model_chunks: The number of model chunks
:type num_model_chunks: int
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch`
:type batch_data_process_func: Callable, optional
:param tensor_shape: Specified shape in pipeline communication
:type tensor_shape: torch.Size, optional
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization
:type scatter_gather_tensors: bool, optional
Args:
num_microbatches (int): The number of microbatches.
num_model_chunks (int): The number of model chunks.
batch_data_process_func (Callable, optional):
The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
scatter_gather_tensors (bool, optional):
If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
"""
assert num_microbatches % gpc.get_world_size(ParallelMode.PIPELINE) == 0, \
'num_microbatches must be an integer multiple of pipeline parallel world size'
@ -408,6 +394,16 @@ class InterleavedPipelineSchedule(PipelineSchedule):
"""Forward step for passed-in model. If it is the first stage, the input tensor
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users.

Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
model_chunk_id (int): The id of model chunks.
input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
return_output_label (bool, optional): Whether to return output labels.
accum_loss (optional): Where the accumulated loss is stored.
Returns:
:class:`torch.Tensor`: output or the loss value of the current pipeline stage.
"""
data, label = self.load_micro_batch(model_chunk_id)
output_tensor = self._call_engine(engine.model[model_chunk_id], input_tensor, data)
@ -435,18 +431,17 @@ class InterleavedPipelineSchedule(PipelineSchedule):
"""Run interleaved 1F1B schedule (model split into model chunks), with
communication between pipeline stages as needed.

Returns a dictionary with losses if it is the last stage, an empty dict otherwise.
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether to run the forward step only. Default is false. If true, no backward will be run.
return_loss (bool, optional): Whether to return the loss value. Default is true.
return_output_label (bool, optional): If False, the output and label won't be returned.

:param engine: Your engine object
:type engine: colossalai.engine.Engine
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader)
:type data_iter: Iterable
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run.
:type forward_only: bool
:param return_loss: Whether returns the loss value. Default is true.
:type return_loss: bool
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool
Returns:
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
The loss would be returned only in the last stage.
"""
assert forward_only or return_loss, \
'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
@ -37,8 +37,8 @@ def get_default_parser():
"""Reads user command line and uses an argument parser to parse the input arguments.
Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.

:return: Returns the parser with the default arguments, the user may add customized arguments into this parser
:rtype: Namespace
Returns:
Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser.
"""
parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, help='path to the config file')
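A short sketch of extending the default parser with a user flag (the --epochs argument is hypothetical):

from colossalai import get_default_parser

parser = get_default_parser()
parser.add_argument('--epochs', type=int, default=10, help='number of training epochs')
args = parser.parse_args()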
@ -63,26 +63,21 @@ def launch(config: Union[str, Path, Config, Dict],
"""This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input
arguments is not given. Then initialize and set distributed environment by calling global_context's functions.

:param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param rank: Rank for the default process group
:type rank: int
:param world_size: World size of the default process group
:type world_size: int
:param host: The master address for distributed training
:type host: str
:param port: The master port for distributed training
:type port: str
:param backend: Backend for torch.distributed
:type backend: str, optional
:param local_rank: Rank for the process on the node and is used to set the default CUDA device, defaults to None.
If local_rank = None, the default device ordinal will be calculated automatically
:type local_rank: int, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
:raises Exception: Raise exception when config type is wrong
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
rank (int): Rank for the default process group
world_size (int): World size of the default process group
host (str): The master address for distributed training
port (str): The master port for distributed training
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
local_rank (int, optional):
Rank for the process on the node and is used to set the default CUDA device,
defaults to None. If local_rank = None, the default device ordinal will be calculated automatically.
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.

Raises:
Exception: Raise exception when config type is wrong
"""
gpc.verbose = verbose
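A sketch of a manual launch on a single 4-GPU node (the config path, address and port are placeholders; RANK/LOCAL_RANK are assumed to be provided by your own launcher):

import os
import colossalai

rank = int(os.environ.get('RANK', 0))
local_rank = int(os.environ.get('LOCAL_RANK', 0))

colossalai.launch(config='./config.py',
                  rank=rank,
                  world_size=4,
                  host='127.0.0.1',
                  port=29500,
                  backend='nccl',
                  local_rank=local_rank,
                  seed=1024)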
@ -126,18 +121,13 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
set by SLURM

:param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: The master address for distributed training
:type host: str
:param port: The master port for distributed training
:type port: str
:param backend: Backend for torch.distributed
:type backend: str, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
host (str): The master address for distributed training
port (str): The master port for distributed training
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.
"""
rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NPROCS'])
@ -160,18 +150,13 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
set by OpenMPI

:param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: The master address for distributed training
:type host: str
:param port: The master port for distributed training
:type port: str
:param backend: Backend for torch.distributed
:type backend: str, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
host (str): The master address for distributed training
port (str): The master port for distributed training
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.
"""
rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
@ -194,14 +179,11 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch

:param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param backend: Backend for torch.distributed
:type backend: str, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.
"""
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
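With torchrun only the config has to be passed explicitly; everything else is read from the environment variables set by the launcher (file names below are placeholders):

# torchrun --nproc_per_node=4 train.py --config ./config.py
import colossalai

colossalai.launch_from_torch(config='./config.py', backend='nccl', seed=1024)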
@ -230,22 +212,20 @@ def initialize(model: nn.Module,
"""Core function to wrap the essential training components with our functionality based on the config which is
loaded into gpc.config.

:param model: Your model instance or a function to build the model
:type model: :class:`torch.nn.Module` or Callbale
:param optimizer: Your optimizer instance
:type optimizer: :class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`
:param criterion: Your criterion instance
:type criterion: :class:`torch.nn.modules.loss._Loss`, optional
:param train_dataloader: Dataloader for training
:type train_dataloader: :class:`torch.utils.data.DataLoader`, optional
:param test_dataloader: Dataloader for testing
:type test_dataloader: :class:`torch.utils.data.DataLoader`, optional
:param lr_scheduler: Your lr scheduler instance, optional
:type lr_scheduler: :class:`torch.nn.lr_scheduler._LRScheduler`, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
:return: (engine, train_dataloader, test_dataloader, lr_scheduler)
:rtype: Tuple
Args:
model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
Your optimizer instance.
criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional.
verbose (bool, optional): Whether to print logs.

Returns:
Tuple (engine, train_dataloader, test_dataloader, lr_scheduler):
A tuple of ``(engine, train_dataloader, test_dataloader, lr_scheduler)``
where only ``engine`` could not be None.
"""
# get logger
logger = get_dist_logger()
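A minimal sketch of the call, assuming colossalai.launch(...) has already set up the distributed context and loaded a config; the toy model and optimizer below are purely illustrative:

import torch
import torch.nn as nn
import colossalai

model = nn.Linear(16, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# dataloaders and lr_scheduler are optional and may be None
engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(
    model, optimizer, criterion)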
@ -10,6 +10,8 @@ def get_dist_logger(name='colossalai'):
"""Get logger instance based on name. The DistributedLogger will create singleton instances,
which means that only one logger instance is created per name.

Args:

:param name: name of the logger, name must be unique
:type name: str

@ -23,8 +23,13 @@ except ImportError:
class DistributedLogger:
"""This is a distributed event logger class essentially based on :class:`logging`.

:param name: The name of the logger
:type name: str
Args:
name (str): The name of the logger.

Note:
The parallel_mode used in ``info``, ``warning``, ``debug`` and ``error``
should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""

__instances = dict()
@ -33,10 +38,10 @@ class DistributedLogger:
def get_instance(name: str):
"""Get the unique single logger instance based on name.

:param name: The name of the logger
:type name: str
:return: A DistributedLogger object
:rtype: DistributedLogger
Args:
name (str): The name of the logger.
Returns:
DistributedLogger: A DistributedLogger object
"""
if name in DistributedLogger.__instances:
return DistributedLogger.__instances[name]
@ -73,8 +78,8 @@ class DistributedLogger:
def set_level(self, level: str):
"""Set the logging level

:param level: Can only be INFO, DEBUG, WARNING and ERROR
:type level: str
Args:
level (str): Can only be INFO, DEBUG, WARNING and ERROR.
"""
self._check_valid_logging_level(level)
self._logger.setLevel(getattr(logging, level))
@ -82,14 +87,11 @@ class DistributedLogger:
def log_to_file(self, path: Union[str, Path], mode: str = 'a', level: str = 'INFO', suffix: str = None):
"""Save the logs to file

:param path: The file to save the log
:type path: A string or pathlib.Path object
:param mode: The mode to write log into the file
:type mode: str
:param level: Can only be INFO, DEBUG, WARNING and ERROR
:type level: str
:param suffix: The suffix string of log's name
:type suffix: str
Args:
path (A string or pathlib.Path object): The file to save the log.
mode (str): The mode to write log into the file.
level (str): Can only be INFO, DEBUG, WARNING and ERROR.
suffix (str): The suffix string of log's name.
"""
assert isinstance(path, (str, Path)), \
f'expected argument path to be type str or Path, but got {type(path)}'
@ -131,12 +133,11 @@ class DistributedLogger:
def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log an info message.

:param message: The message to be logged
:type message: str
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`
:param ranks: List of parallel ranks
:type ranks: list
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('info', message_prefix, parallel_mode, ranks)
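A usage sketch for the logger (the log path and suffix are placeholders, and log_to_file's exact path handling should be checked against the implementation):

from colossalai.logging import get_dist_logger

logger = get_dist_logger()
logger.set_level('INFO')
logger.log_to_file('./logs', mode='a', level='INFO', suffix='train')
# only rank 0 of the global group prints this message
logger.info('training started', ranks=[0])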
@ -145,12 +146,11 @@ class DistributedLogger:
def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log a warning message.

:param message: The message to be logged
:type message: str
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`
:param ranks: List of parallel ranks
:type ranks: list
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('warning', message_prefix, parallel_mode, ranks)
@ -159,12 +159,11 @@ class DistributedLogger:
def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log a debug message.

:param message: The message to be logged
:type message: str
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`
:param ranks: List of parallel ranks
:type ranks: list
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('debug', message_prefix, parallel_mode, ranks)
@ -173,12 +172,11 @@ class DistributedLogger:
def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log an error message.

:param message: The message to be logged
:type message: str
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`
:param ranks: List of parallel ranks
:type ranks: list
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('error', message_prefix, parallel_mode, ranks)

@ -6,6 +6,7 @@ import torch.nn as nn

def zeros_():
"""Return the initializer filling the input Tensor with the scalar zeros"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.zeros_(tensor)

@ -13,6 +14,7 @@ def zeros_():

def ones_():
"""Return the initializer filling the input Tensor with the scalar ones"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.ones_(tensor)

@ -20,6 +22,14 @@ def ones_():

def uniform_(a: float = 0., b: float = 1.):
r"""Return the initializer filling the input Tensor with values drawn from the uniform
distribution :math:`\mathcal{U}(a, b)`.

Args:
a (float): the lower bound of the uniform distribution. Defaults 0.0.
b (float): the upper bound of the uniform distribution. Defaults 1.0.
"""

def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.uniform_(tensor, a, b)

@ -27,6 +37,15 @@ def uniform_(a: float = 0., b: float = 1.):

def normal_(mean: float = 0., std: float = 1.):
r"""Return the initializer filling the input Tensor with values drawn from the normal distribution

.. math::
\mathcal{N}(\text{mean}, \text{std}^2)

Args:
mean (float): the mean of the normal distribution. Defaults 0.0.
std (float): the standard deviation of the normal distribution. Defaults 1.0.
"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.normal_(tensor, mean, std)

@ -34,6 +53,19 @@ def normal_(mean: float = 0., std: float = 1.):

def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float = 2.):
r"""Return the initializer filling the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \leq \text{mean} \leq b`.

Args:
mean (float): the mean of the normal distribution. Defaults 0.0.
std (float): the standard deviation of the normal distribution. Defaults 1.0.
a (float): the minimum cutoff value. Defaults -2.0.
b (float): the maximum cutoff value. Defaults 2.0.
"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.trunc_normal_(tensor, mean, std, a, b)

@ -41,6 +73,26 @@ def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float =

def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification` - He, K. et al. (2015), using a
uniform distribution. The resulting tensor will have values sampled from
:math:`\mathcal{U}(-\text{bound}, \text{bound})` where

.. math::
\text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan_mode}}}

Also known as 'He initialization'.

Args:
a (int): the negative slope of the rectifier used after this layer (only used with ``'leaky_relu'``).
mode (str, optional): either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
preserves the magnitude of the variance of the weights in the
forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
backwards pass.
nonlinearity (str, optional): the non-linear function (`nn.functional` name),
recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).
"""
# adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
if 0 in tensor.shape:
@ -64,6 +116,26 @@ def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'):

def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification` - He, K. et al. (2015), using a
normal distribution. The resulting tensor will have values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where

.. math::
\text{std} = \frac{\text{gain}}{\sqrt{\text{fan_mode}}}

Also known as 'He initialization'.

Args:
a (int): the negative slope of the rectifier used after this layer (only used with ``'leaky_relu'``).
mode (str, optional): either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
preserves the magnitude of the variance of the weights in the
forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
backwards pass.
nonlinearity (str, optional): the non-linear function (`nn.functional` name),
recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).
"""
# adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
if 0 in tensor.shape:
@ -86,6 +158,23 @@ def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'):

def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1.):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Understanding the difficulty of training deep feedforward
neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform
distribution. The resulting tensor will have values sampled from
:math:`\mathcal{U}(-a, a)` where

.. math::
a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}}

Also known as 'Glorot initialization'.

Args:
a (float, optional): an optional scaling factor used to calculate uniform
bounds from standard deviation. Defaults ``math.sqrt(3.)``.
scale (float, optional): an optional scaling factor used to calculate standard deviation. Defaults 2.0.
gain (float, optional): an optional scaling factor. Defaults 1.0.
"""
# adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
assert fan_in is not None, 'Fan_in is not provided.'
@ -102,6 +191,21 @@ def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1

def xavier_normal_(scale: float = 2., gain: float = 1.):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Understanding the difficulty of training deep feedforward
neural networks` - Glorot, X. & Bengio, Y. (2010), using a normal
distribution. The resulting tensor will have values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where

.. math::
\text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan_in} + \text{fan_out}}}

Also known as 'Glorot initialization'.

Args:
scale (float, optional): an optional scaling factor used to calculate standard deviation. Defaults 2.0.
gain (float, optional): an optional scaling factor. Defaults 1.0.
"""
# adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
assert fan_in is not None, 'Fan_in is not provided.'
@ -137,4 +241,4 @@ def lecun_normal_():
std = math.sqrt(1.0 / fan_in)
return nn.init.trunc_normal_(tensor, std=std / .87962566103423978)

return initializer
return initializer
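All of these factories share the same two-step pattern: they return a closure that is later applied to a tensor together with its fan-in/fan-out. A small sketch (shapes are arbitrary):

import torch
from colossalai.nn import init

weight = torch.empty(1024, 256)
# the factory call fixes the hyper-parameters, the returned closure does the filling
init.kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu')(
    weight, fan_in=256, fan_out=1024)

bias = torch.empty(1024)
init.zeros_()(bias)  # fan arguments are not needed here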
@ -6,13 +6,11 @@ from ..utils import get_tensor_parallel_mode

class Dropout(nn.Module):
"""
Dropout layer of colossalai
"""Dropout layer of colossalai.

:param p: dropout rate, defaults to 0.5
:type p: float, optional
:param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False``
:type inplace: bool, optional
Args:
p (float, optional): probability of an element to be zeroed, defaults 0.5.
inplace (bool, optional): whether to do dropout in-place, default to be False.
"""
def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
super().__init__()

@ -35,21 +35,33 @@ _parallel_patchembedding = {

class Embedding(nn.Module):
"""
Embedding for colossalai
r"""Embedding for colossalai.

:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.

The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
::

max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.

More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.

More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
"""

def __init__(self,
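An illustrative construction (sizes are placeholders, not part of this diff); it assumes the parallel context has already been initialized, since the layer dispatches to the implementation matching the current tensor-parallel mode:

import torch
from colossalai.nn import Embedding

embed = Embedding(num_embeddings=50304, embedding_dim=768, padding_idx=0)
tokens = torch.randint(0, 50304, (2, 128), device='cuda')
hidden = embed(tokens)  # (2, 128, 768) in the non-parallel case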
@ -97,27 +109,24 @@ class Embedding(nn.Module):

class PatchEmbedding(nn.Module):
"""
2D Image to Patch Embedding
"""2D Image to Patch Embedding.

:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.

More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""

def __init__(

@ -31,22 +31,35 @@ _vocab_parallel_classifier = {

class Linear(nn.Module):
"""
Linear layer of colossalai
"""Linear layer of colossalai.

:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param kwargs: Kwargs used for particular parallelisms
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.

Note: ``kwargs`` would contain different parameters when you use different parallelisms.

The ``kwargs`` should contain parameters below:
::

Linear1D:
gather_output: bool (optional, default to be false)
skip_bias_add: bool (optional, default to be false)
Linear2D:
skip_bias_add: bool (optional, default to be false)
Linear2p5D:
skip_bias_add: bool (optional, default to be false)
Linear3D:
None

More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""

def __init__(self,
|
||||
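As a rough illustration of the note above, the extra ``kwargs`` are simply forwarded to the parallelism-specific implementation; under 1D tensor parallelism one might write (a sketch; the values are illustrative)::

    from colossalai import nn as col_nn

    # gather_output / skip_bias_add only take effect for the parallel variants listed above
    linear = col_nn.Linear(1024, 4096, bias=True, gather_output=True, skip_bias_add=False)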
@ -88,21 +101,21 @@ class Linear(nn.Module):
|
||||
|
||||
|
||||
class Classifier(nn.Module):
|
||||
"""
|
||||
Classifier layer of colossalai
|
||||
"""Classifier layer of colossalai.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of total classes for the dataset
|
||||
:type num_classes: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
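The optional ``weight`` argument above is what allows tying the classifier to an embedding matrix; a sketch (assuming the wrapped embedding exposes its parameter as ``weight``)::

    from colossalai import nn as col_nn

    embed = col_nn.Embedding(num_embeddings=50304, embedding_dim=1024)
    # reuse the embedding matrix as the output projection (attribute name assumed)
    head = col_nn.Classifier(in_features=1024, num_classes=50304, weight=embed.weight)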
@ -19,18 +19,15 @@ _parallel_layernorm = {
|
||||
|
||||
|
||||
class LayerNorm(nn.Module):
|
||||
r"""
|
||||
Layer Normalization for colossalai
|
||||
r"""Layer Normalization for colossalai.
|
||||
|
||||
:param normalized_shape: input shape from an expected input
|
||||
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:type normalized_shape: int
|
||||
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
|
||||
:type eps: float, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None:
|
||||
|
@ -28,11 +28,10 @@ class Experts(MoeExperts):
|
||||
moe model parallel group, where E is the number of experts. Every expert
|
||||
is an instance of the class 'expert' given in the initialization parameters.
|
||||
|
||||
:param expert: The class of all experts
|
||||
:param num_experts: The number of experts
|
||||
:param expert_args: Args used to initialize experts
|
||||
|
||||
:type num_experts: int
|
||||
Args:
|
||||
expert_cls (:class:`torch.nn.Module`): The class of all experts
|
||||
num_experts (int): The number of experts
|
||||
expert_args: Args used to initialize experts; the accepted args can be found in the corresponding expert class.
|
||||
"""
|
||||
|
||||
def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args):
|
||||
|
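A sketch of how the expert container above is typically used; ``FFNExpert`` is a hypothetical expert module here, and the extra keyword arguments are forwarded to its constructor (import path assumed)::

    import torch.nn as nn
    from colossalai.nn.layer.moe import Experts   # assumed import path

    class FFNExpert(nn.Module):                    # hypothetical expert definition
        def __init__(self, d_model: int, d_ff: int):
            super().__init__()
            self.net = nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model))

        def forward(self, x):
            return self.net(x)

    experts = Experts(FFNExpert, num_experts=8, d_model=512, d_ff=2048)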
@ -18,19 +18,13 @@ class Top1Router(nn.Module):
|
||||
for routing usage. More detailed information can be found in the Switch Transformer paper
|
||||
of Google.
|
||||
|
||||
:param capacity_factor_train: Capacity factor in routing during training
|
||||
:param capacity_factor_eval: Capacity factor in routing during evaluation
|
||||
:param min_capacity: The minimum number of the capacity of each expert
|
||||
:param select_policy: The policy about tokens selection
|
||||
:param noisy_func: Noisy function used in logits
|
||||
:param drop_tks: Whether drops tokens in evaluation
|
||||
|
||||
:type capacity_factor_train: float, optional
|
||||
:type capacity_factor_eval: float, optional
|
||||
:type min_capacity: int, optional
|
||||
:type select_policy: str, optional
|
||||
:type noisy_func: Callable, optional
|
||||
:type drop_tks: bool, optional
|
||||
Args:
|
||||
capacity_factor_train (float, optional): Capacity factor in routing of training.
|
||||
capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
|
||||
min_capacity (int, optional): The minimum capacity of each expert.
|
||||
select_policy (str, optional): The policy about tokens selection.
|
||||
noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
|
||||
drop_tks (bool, optional): Whether to drop tokens in evaluation.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
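For instance, the router above might be configured as follows (a sketch; the import path and the ``select_policy`` value are assumptions)::

    from colossalai.nn.layer.moe import Top1Router, UniformNoiseGenerator   # assumed import path

    router = Top1Router(capacity_factor_train=1.25,
                        capacity_factor_eval=2.0,
                        min_capacity=4,
                        select_policy='random',
                        noisy_func=UniformNoiseGenerator(eps=1e-2))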
@ -119,17 +113,12 @@ class Top2Router(nn.Module):
|
||||
"""Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
|
||||
for routing usage. More detailed information can be found in the ViT-MoE paper.
|
||||
|
||||
:param capacity_factor_train: Capacity factor in routing during training
|
||||
:param capacity_factor_eval: Capacity factor in routing during evaluation
|
||||
:param min_capacity: The minimum number of the capacity of each expert
|
||||
:param noisy_func: Noisy function used in logits
|
||||
:param drop_tks: Whether drops tokens in evaluation
|
||||
|
||||
:type capacity_factor_train: float, optional
|
||||
:type capacity_factor_eval: float, optional
|
||||
:type min_capacity: int, optional
|
||||
:type noisy_func: Callable, optional
|
||||
:type drop_tks: bool, optional
|
||||
Args:
|
||||
capacity_factor_train (float, optional): Capacity factor in routing of training.
|
||||
capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
|
||||
min_capacity (int, optional): The minimum capacity of each expert.
|
||||
noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
|
||||
drop_tks (bool, optional): Whether to drop tokens in evaluation.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -239,15 +228,11 @@ class MoeLayer(nn.Module):
|
||||
the moe tensor group by all-to-all communication. Then it will get the output of all
|
||||
experts and exchange the output. Finally, it returns the output of the moe system.
|
||||
|
||||
:param dim_model: Dimension of model
|
||||
:param num_experts: The number of experts
|
||||
:param router: Instance of router used in routing
|
||||
:param experts: Instance of experts generated by Expert
|
||||
|
||||
:type dim_model: int
|
||||
:type num_experts: int
|
||||
:type router: nn.Module
|
||||
:type experts: nn.Module
|
||||
Args:
|
||||
dim_model (int): Dimension of model.
|
||||
num_experts (int): The number of experts.
|
||||
router (:class:`torch.nn.Module`): Instance of router used in routing.
|
||||
experts (:class:`torch.nn.Module`): Instance of experts generated by Expert.
|
||||
"""
|
||||
|
||||
def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts):
|
||||
|
@ -16,8 +16,8 @@ class NormalNoiseGenerator:
|
||||
All noise is generated from a normal distribution (0, 1 / E^2), where
|
||||
E = the number of experts.
|
||||
|
||||
:param num_experts: The number of experts
|
||||
:type num_experts: int
|
||||
Args:
|
||||
num_experts (int): The number of experts.
|
||||
"""
|
||||
|
||||
def __init__(self, num_experts: int):
|
||||
@ -37,8 +37,8 @@ class UniformNoiseGenerator:
|
||||
Makes models more resilient to rounding errors introduced by bfloat16.
|
||||
This seems particularly important for logits.
|
||||
|
||||
:param eps: Epsilon in generator
|
||||
:type eps: float
|
||||
Args:
|
||||
eps (float, optional): Epsilon in generator, defaults to 1e-2.
|
||||
"""
|
||||
|
||||
def __init__(self, eps: float = 1e-2):
|
||||
|
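Putting the pieces above together, a complete MoE block could be wired up roughly like this (a sketch; ``FFNExpert`` is the hypothetical expert module from the earlier sketch and the import paths are assumptions)::

    from colossalai.nn.layer.moe import Experts, MoeLayer, Top2Router, NormalNoiseGenerator  # assumed paths

    num_experts = 8
    router = Top2Router(capacity_factor_train=1.25,
                        noisy_func=NormalNoiseGenerator(num_experts))
    experts = Experts(FFNExpert, num_experts, d_model=512, d_ff=2048)
    moe = MoeLayer(dim_model=512, num_experts=num_experts, router=router, experts=experts)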
@ -7,17 +7,17 @@ except:
|
||||
|
||||
|
||||
class FusedLayerNormAffineFunction1D(torch.autograd.Function):
|
||||
r"""
|
||||
Layernorm
|
||||
r"""Layernorm
|
||||
|
||||
:param input: input maxtrix
|
||||
:param weight: weight matrix
|
||||
:param bias: bias matrix
|
||||
:param normalized_shape: input shape from an expected input
|
||||
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:param eps: a value added to the denominator for numerical stability
|
||||
Args:
|
||||
input: input matrix.
|
||||
weight: weight matrix.
|
||||
bias: bias matrix.
|
||||
normalized_shape: input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps: a value added to the denominator for numerical stability
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
|
@ -78,8 +78,9 @@ class _ReduceGrad(torch.autograd.Function):
|
||||
"""
|
||||
Pass the input to the model parallel region.
|
||||
|
||||
:param input_: input matrix
|
||||
:param parallel_mode: parallel mode
|
||||
Args:
|
||||
input_: input matrix.
|
||||
parallel_mode: parallel mode.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -99,9 +100,10 @@ class _ReduceGrad(torch.autograd.Function):
|
||||
class _ReduceInput(torch.autograd.Function):
|
||||
"""
|
||||
All-reduce the input from the model parallel region.
|
||||
|
||||
:param input_: input matrix
|
||||
:param parallel_mode: parallel mode
|
||||
|
||||
Args:
|
||||
input_: input matrix.
|
||||
parallel_mode: parallel mode.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -121,9 +123,10 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
|
||||
"""
|
||||
Split the input and keep only the corresponding chuck to the rank.
|
||||
|
||||
:param input_: input matrix
|
||||
:param parallel_mode: parallel mode
|
||||
:param dim: dimension
|
||||
Args:
|
||||
input_: input matrix.
|
||||
parallel_mode: parallel mode.
|
||||
dim: dimension
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -142,12 +145,12 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
|
||||
|
||||
|
||||
class _GatherForwardSplitBackward(torch.autograd.Function):
|
||||
"""
|
||||
Gather the input from model parallel region and concatinate.
|
||||
|
||||
:param input_: input matrix
|
||||
:param parallel_mode: parallel mode
|
||||
:param dim: dimension
|
||||
"""Gather the input from model parallel region and concatenate.
|
||||
|
||||
Args:
|
||||
input_: input matrix.
|
||||
parallel_mode: parallel mode.
|
||||
dim: dimension
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
|
@ -24,24 +24,23 @@ from ._utils import (gather_forward_split_backward, get_parallel_input, reduce_g
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear1D(torch.nn.Module):
|
||||
"""
|
||||
Linear layer for 1D parallelism
|
||||
r"""Linear layer for 1D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param out_features: size of each output sample
|
||||
:type out_features: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False
|
||||
:type skip_bias_add: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
gather_output (bool, optional): Whether to call all-gather on output, defaults to False.
|
||||
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -88,23 +87,21 @@ class Linear1D(torch.nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Classifier1D(ParallelLayer):
|
||||
"""RowLinear with given weight
|
||||
Classifier of 1D parallelism
|
||||
|
||||
:param in_features: size of input features
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes in the dataset
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
r"""RowLinear with given weight. Classifier of 1D parallelism.
|
||||
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -171,23 +168,21 @@ class Classifier1D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelClassifier1D(ParallelLayer):
|
||||
"""ColLinear with given weight
|
||||
Classifier of 1D parallelism
|
||||
|
||||
:param in_features: size of input features
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes in the dataset
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
r"""ColLinear with given weight. Classifier of 1D parallelism.
|
||||
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -249,30 +244,28 @@ class VocabParallelClassifier1D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear1D_Col(ParallelLayer):
|
||||
"""Linear layer with column parallelism.
|
||||
r"""Linear layer with column parallelism.
|
||||
|
||||
The linear layer is defined as :math:`Y = XA + b`. A is parallelized along
|
||||
its second dimension as :math:`A = [A_1, ..., A_p]`.
|
||||
|
||||
:param in_features: first dimension of matrix A.
|
||||
:type in_features: int
|
||||
:param output_size: second dimension of matrix A.
|
||||
:type output_size: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param gather_output: If true, call all-gether on output and make Y avaiable
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
gather_output (bool, optional): If true, call all-gather on output and make Y available
|
||||
to all GPUs, otherwise, every GPU will have its output
|
||||
which is :math:`Y_i = XA_i`, defaults to False
|
||||
:type gather_output: bool, optional
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False
|
||||
:type skip_bias_add: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -343,25 +336,23 @@ class Linear1D_Col(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear1D_Row(ParallelLayer):
|
||||
""" Linear layer with row parallelism
|
||||
r""" Linear layer with row parallelism
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param out_features: size of each output sample
|
||||
:type out_features: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param parallel_input: If set to ``True``, it's assumed that the input is splitted, defaults to False
|
||||
:type parallel_input: bool, optional
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False
|
||||
:type skip_bias_add: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
|
||||
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
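The column- and row-parallel layers above are usually chained so that no gather is needed between them; a sketch of the common MLP pattern (class names as in this diff, import path and sizes are illustrative)::

    import torch
    import torch.nn.functional as F
    from colossalai.nn.layer.parallel_1d import Linear1D_Col, Linear1D_Row   # assumed import path

    # A is split by columns, B by rows, so Y = GeLU(XA)B needs no intermediate all-gather
    dense_h_to_4h = Linear1D_Col(1024, 4096, gather_output=False)
    dense_4h_to_h = Linear1D_Row(4096, 1024, parallel_input=True)

    def mlp(x: torch.Tensor) -> torch.Tensor:
        return dense_4h_to_h(F.gelu(dense_h_to_4h(x)))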
@ -432,21 +423,33 @@ class Linear1D_Row(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Embedding1D(ParallelLayer):
|
||||
"""
|
||||
Embedding for 1D parallelism
|
||||
r"""Embedding for 1D parallelism.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
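For example, the ``args``/``kwargs`` listed above are forwarded straight to ``torch.nn.functional.embedding``, so a norm-clipped embedding could be requested like this (a sketch; import path and sizes are illustrative)::

    from colossalai.nn.layer.parallel_1d import Embedding1D   # assumed import path

    emb = Embedding1D(num_embeddings=50304, embedding_dim=1024,
                      padding_idx=0, max_norm=1.0, scale_grad_by_freq=True)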
@ -499,20 +502,33 @@ class Embedding1D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelEmbedding1D(torch.nn.Module):
|
||||
"""Embedding parallelized in the vocabulary dimension.
|
||||
r"""Embedding parallelized in the vocabulary dimension.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -578,13 +594,11 @@ class VocabParallelEmbedding1D(torch.nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Dropout1D(ParallelLayer):
|
||||
"""
|
||||
Dropout layer of 1D parallelism
|
||||
"""Dropout layer of 1D parallelism.
|
||||
|
||||
:param p: dropout rate, defaults to 0.5
|
||||
:type p: float, optional
|
||||
:param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False``
|
||||
:type inplace: bool, optional
|
||||
Args:
|
||||
p (float, optional): probability of an element to be zeroed, defaults to 0.5.
|
||||
inplace (bool, optional): whether to do dropout in-place, defaults to False.
|
||||
"""
|
||||
|
||||
def __init__(self, p: float = 0.5, inplace: bool = False):
|
||||
|
@ -21,27 +21,26 @@ def matmul_2d(
|
||||
row_parallel_mode=ParallelMode.PARALLEL_2D_ROW,
|
||||
col_parallel_mode=ParallelMode.PARALLEL_2D_COL,
|
||||
):
|
||||
"""
|
||||
Matrix multiplication for 2D parallelism
|
||||
r"""Matrix multiplication for 2D parallelism.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param summa_dim: dimension of SUMMA fo 2D parallelism
|
||||
:type summa_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row, defaults to None
|
||||
:type row_rank: int, optional
|
||||
:param col_rank: the rank of column, defaults to None
|
||||
:type col_rank: int, optional
|
||||
:param row_parallel_mode: row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW
|
||||
:type row_parallel_mode: str, optional
|
||||
:param col_parallel_mode: column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL
|
||||
:type col_parallel_mode: str, optional
|
||||
:return: :math:`C = AB`
|
||||
:rtype: torch.tensor
|
||||
Args:
|
||||
a (:class:`torch.tensor`): matrix :math:`A`.
|
||||
b (:class:`torch.tensor`): matrix :math:`B`.
|
||||
summa_dim (int): dimension of SUMMA for 2D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
|
||||
row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
|
||||
column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL.
|
||||
|
||||
Returns:
|
||||
:class:`torch.tensor`: :math:`C = AB`.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
if row_rank is None:
|
||||
row_rank = gpc.get_local_rank(col_parallel_mode)
|
||||
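A sketch of calling the helper above from inside an initialized 2D-parallel run (shapes are those of the local partitions and purely illustrative; ``summa_dim`` is the square root of the 2D tensor-parallel world size)::

    import torch
    from colossalai.nn.layer.parallel_2d._operation import matmul_2d   # assumed import path

    a = torch.randn(128, 256).cuda()    # local partition of A
    b = torch.randn(256, 512).cuda()    # local partition of B
    c = matmul_2d(a, b, summa_dim=2, out_shape=(128, 512))   # local partition of C = AB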
@ -135,35 +134,26 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
|
||||
row_rank: int, col_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode,
|
||||
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
|
||||
tensor_parallel_size: int) -> Tensor:
|
||||
"""
|
||||
2D parallel classifier
|
||||
r"""2D parallel classifier.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param bias: matrix of bias
|
||||
:type bias: torch.tensor, optional
|
||||
:param summa_dim: dimension of SUMMA fo 2D parallelism
|
||||
:type summa_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
bias (:class:`torch.tensor`, optional): matrix of bias.
|
||||
summa_dim (int): dimension of SUMMA for 2D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Classifier2D.apply(A, B, bias, summa_dim, out_shape, row_rank, col_rank, row_parallel_mode,
|
||||
col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
|
||||
@ -171,33 +161,25 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
|
||||
|
||||
|
||||
class Matmul_AB_2D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = AB`
|
||||
r"""Matrix multiplication for :math:`C = AB`.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param summa_dim: dimension of SUMMA fo 2D parallelism
|
||||
:type summa_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
summa_dim (int): dimension of SUMMA for 2D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
@staticmethod
|
||||
@custom_fwd(cast_inputs=torch.float16)
|
||||
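For reference, the block identity that this kernel (and the transposed variants below) implements is the standard SUMMA decomposition, which the diff does not spell out: with :math:`A` and :math:`B` partitioned into :math:`q \times q` blocks, where :math:`q` is ``summa_dim``, the rank at grid position :math:`(i, j)` accumulates

.. math::

    C_{ij} = \sum_{k=0}^{q-1} A_{ik} B_{kj}

so each step broadcasts one block of :math:`A` along its row and one block of :math:`B` along its column before a local matmul.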
@ -305,33 +287,26 @@ class Matmul_AB_2D(torch.autograd.Function):
|
||||
|
||||
|
||||
class Matmul_ABT_2D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = AB^T`
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param summa_dim: dimension of SUMMA fo 2D parallelism
|
||||
:type summa_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
r"""Matrix multiplication for :math:`C = AB^T`
|
||||
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
summa_dim (int): dimension of SUMMA for 2D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
@staticmethod
|
||||
@custom_fwd(cast_inputs=torch.float16)
|
||||
@ -445,33 +420,25 @@ class Matmul_ABT_2D(torch.autograd.Function):
|
||||
|
||||
|
||||
class Matmul_ATB_2D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = A^TB`
|
||||
r"""Matrix multiplication for :math:`C = A^TB`.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param summa_dim: dimension of SUMMA fo 2D parallelism
|
||||
:type summa_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
summa_dim (int): dimension of SUMMA for 2D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
@staticmethod
|
||||
@custom_fwd(cast_inputs=torch.float16)
|
||||
@ -639,33 +606,26 @@ def add_bias_2d(input_: Tensor, bias: Tensor, output_size_per_partition: int, ro
|
||||
row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, skip_bias_add: bool,
|
||||
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
|
||||
tensor_parallel_size: int) -> Tensor:
|
||||
"""
|
||||
Matrix add bias: :math:`C = A + b`
|
||||
r"""Matrix add bias: :math:`C = A + b`.
|
||||
|
||||
:param input_: matrix :math:`A`
|
||||
:type input_: torch.tensor
|
||||
:param bias: matrix :math:`b`
|
||||
:type bias: torch.tensor
|
||||
:param output_size_per_partition: size of ouput per partition
|
||||
:type output_size_per_partition: int
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion
|
||||
:type skip_bias_add: bool
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): matrix :math:`A`.
|
||||
bias (:class:`torch.tensor`): matrix :math:`b`.
|
||||
output_size_per_partition (int): size of output per partition.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
skip_bias_add (bool):
|
||||
If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Add_Bias_2D.apply(input_, bias, output_size_per_partition, row_rank, col_rank, row_parallel_mode,
|
||||
col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
|
||||
@ -711,21 +671,19 @@ class _Layernorm_2D(torch.autograd.Function):
|
||||
|
||||
def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode,
|
||||
col_parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
Layernorm
|
||||
r"""Layernorm.
|
||||
|
||||
:param input_: input maxtrix
|
||||
:type input_: torch.tensor
|
||||
:param E_x: mean
|
||||
:type E_x: torch.tensor
|
||||
:param Var_x: variance
|
||||
:type Var_x: torch.tensor
|
||||
:param hidden_size: hidden size
|
||||
:type hidden_size: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
E_x (:class:`torch.tensor`): mean.
|
||||
Var_x (:class:`torch.tensor`): variance.
|
||||
hidden_size (int): hidden size.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Layernorm_2D.apply(input_, E_x, Var_x, hidden_size, row_parallel_mode, col_parallel_mode)
|
||||
|
||||
@ -748,27 +706,29 @@ class _AllGatherTensor2D(torch.autograd.Function):
|
||||
|
||||
|
||||
def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
All gather the tensor of 2D parallelism
|
||||
r"""All gather the tensor of 2D parallelism.
|
||||
|
||||
:param inputs: input maxtrix
|
||||
:type inputs: torch.tensor
|
||||
:param dim: dimension to gather
|
||||
:type dim: int
|
||||
:param parallel_mode: parallel mode
|
||||
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Dimension to gather.
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode the tensor is used in.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _AllGatherTensor2D.apply(tensor, dim, parallel_mode)
|
||||
|
||||
|
||||
def split_tensor_2d(input_: Tensor, dim: int = 0) -> Tensor:
|
||||
"""Splits 2D tensor in specified dimension across cols
|
||||
:param input_: Input tensor
|
||||
:param dim: Specified dimension in which to split
|
||||
:type input_: torch.Tensor
|
||||
:type dim: int, optional
|
||||
:return output: Splitted tensor
|
||||
:rtype output: torch.Tensor
|
||||
"""Splits 2D tensor in specified dimension across cols.
|
||||
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Specified dimension in which to split.
|
||||
|
||||
Returns:
|
||||
:class:`torch.tensor`: The split tensor.
|
||||
"""
|
||||
if input_.size(dim) <= 1:
|
||||
return input_
|
||||
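A quick sketch of how the split and gather helpers above pair up (illustrative only; it assumes an initialized 2D parallel context and that the column parallel mode is the one the split was taken over)::

    import torch
    from colossalai.context import ParallelMode
    from colossalai.nn.layer.parallel_2d._operation import split_tensor_2d, all_gather_tensor_2d  # assumed path

    full = torch.randn(8, 16).cuda()
    local = split_tensor_2d(full, dim=0)                                      # keep this rank's chunk
    restored = all_gather_tensor_2d(local, 0, ParallelMode.PARALLEL_2D_COL)   # undo the split (mode assumed)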
@ -787,11 +747,15 @@ class _ReduceTensor2D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
All-reduce the input.
|
||||
|
||||
:param input_: input tensor
|
||||
:param parallel_mode: parallel mode
|
||||
r"""All-reduce the input.
|
||||
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): Input tensor.
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode the tensor is used in.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceTensor2D.apply(input_, parallel_mode)
|
||||
|
||||
@ -809,12 +773,16 @@ class _ReduceScatterTensor2D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
Reduce-scatter the input.
|
||||
|
||||
:param tensor: Input tensor
|
||||
:param dim: Dimension to scatter
|
||||
:param parallel_mode: Parallel mode
|
||||
r"""Reduce-scatter the input.
|
||||
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Dimension to reduce.
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode the tensor is used in.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceScatterTensor2D.apply(tensor, dim, parallel_mode)
|
||||
|
||||
@ -849,11 +817,11 @@ class _ReduceByBatch2D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_by_batch_2d(input_, reduce_mean: bool = False) -> Tensor:
|
||||
"""All-reduce the input from the model parallel region.
|
||||
r"""All-reduce the input from the model parallel region.
|
||||
|
||||
:param input_: input maxtrix
|
||||
:type input_: torch.tensor
|
||||
:param reduce_mean: If set to ``True``, it will divide the output by column parallel size, default to False
|
||||
:type reduce_mean: bool, optional
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
reduce_mean (bool, optional):
|
||||
If set to ``True``, it will divide the output by column parallel size, defaults to False.
|
||||
"""
|
||||
return _ReduceByBatch2D.apply(input_, reduce_mean)
|
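For example, the helper above is what one would reach for to average a scalar metric over the 2D column parallel group (a sketch; the import path is assumed)::

    import torch
    from colossalai.nn.layer.parallel_2d._operation import reduce_by_batch_2d   # assumed import path

    preds = torch.tensor([1, 2, 1]).cuda()
    labels = torch.tensor([1, 0, 1]).cuda()
    correct = (preds == labels).sum().float()
    correct = reduce_by_batch_2d(correct, reduce_mean=True)   # mean over the 2D column parallel group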
@ -22,23 +22,22 @@ from ._utils import assert_summa_initialization, get_summa_dim_from_env
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear2D(ParallelLayer):
|
||||
"""
|
||||
Linear layer for 2D parallelism
|
||||
r"""Linear layer for 2D parallelism
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param out_features: size of each output sample
|
||||
:type out_features: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False
|
||||
:type skip_bias_add: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
@ -119,18 +118,16 @@ class Linear2D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class LayerNorm2D(ParallelLayer):
|
||||
r"""
|
||||
Layer Normalization for 2D parallelism
|
||||
r"""Layer Normalization for 2D parallelism.
|
||||
|
||||
:param normalized_shape: input shape from an expected input
|
||||
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:type normalized_shape: int
|
||||
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
|
||||
:type eps: float, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
|
||||
super().__init__()
|
||||
@ -189,27 +186,24 @@ class LayerNorm2D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class PatchEmbedding2D(ParallelLayer):
|
||||
"""
|
||||
2D Image to Patch Embedding
|
||||
r"""2D Image to Patch Embedding.
|
||||
|
||||
:param img_size: image size
|
||||
:type img_size: int
|
||||
:param patch_size: patch size
|
||||
:type patch_size: int
|
||||
:param in_chans: number of channels of input image
|
||||
:type in_chans: int
|
||||
:param embed_size: size of embedding
|
||||
:type embed_size: int
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param flatten: whether to flatten output tensor, defaults to True
|
||||
:type flatten: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
:param position_embed_initializer: The intializer of position embedding, defaults to zero
|
||||
:type position_embed_initializer: typing.Callable, optional
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
img_size: int,
|
||||
@ -291,21 +285,33 @@ class PatchEmbedding2D(ParallelLayer):
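For reference, the computation that PatchEmbedding2D parallelizes is the standard ViT patch embedding: a strided convolution cuts the image into non-overlapping patches and projects each one to ``embed_size``. A minimal single-device sketch, illustrative only; the layer above additionally partitions the projection weight, bias and position embedding across the 2D process grid.

import torch
import torch.nn as nn

class NaivePatchEmbedding(nn.Module):
    """Single-device reference: image -> (B, num_patches, embed_size)."""
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_size=768, flatten=True):
        super().__init__()
        assert img_size % patch_size == 0, 'image size must be divisible by patch size'
        self.flatten = flatten
        # a strided convolution is equivalent to splitting into patches plus a linear projection
        self.proj = nn.Conv2d(in_chans, embed_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)                      # (B, embed_size, H/P, W/P)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # (B, num_patches, embed_size)
        return x

patches = NaivePatchEmbedding()(torch.randn(2, 3, 224, 224))
print(patches.shape)  # torch.Size([2, 196, 768])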
|
||||
|
||||
@LAYERS.register_module
class Embedding2D(ParallelLayer):
"""
Embedding for 2D parallelism
r"""Embedding for 2D parallelism.

:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.

The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
::

max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.

More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.

More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
"""
def __init__(self,
num_embeddings: int,
@ -358,20 +364,33 @@ class Embedding2D(ParallelLayer):
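The ``args``/``kwargs`` forwarded to ``torch.nn.functional.embedding`` behave exactly as in plain PyTorch. A minimal single-device illustration of ``padding_idx``, ``max_norm`` and ``scale_grad_by_freq`` (unrelated to the 2D weight partitioning itself):

import torch
import torch.nn.functional as F

weight = torch.randn(10, 4, requires_grad=True)   # 10 embeddings of dimension 4
tokens = torch.tensor([[1, 2, 0, 5]])             # index 0 is the padding token

# the padding_idx row gets no gradient; vectors longer than max_norm are renormalized in-place
out = F.embedding(tokens, weight, padding_idx=0, max_norm=1.0, scale_grad_by_freq=True)
out.sum().backward()
print(weight.grad[0])   # all zeros: the "pad" row is never updated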
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelEmbedding2D(torch.nn.Module):
|
||||
"""Embedding parallelized in the vocabulary dimension.
|
||||
r"""Embedding parallelized in the vocabulary dimension.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
@ -435,23 +454,21 @@ class VocabParallelEmbedding2D(torch.nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Classifier2D(ParallelLayer):
|
||||
"""
|
||||
Classifier for 2D parallelism
|
||||
r"""Classifier for 2D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
@ -515,23 +532,21 @@ class Classifier2D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelClassifier2D(ParallelLayer):
|
||||
"""
|
||||
Vocab parallel classifier layer for 2D parallelism
|
||||
r"""Vocab parallel classifier layer for 2D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
|
@ -100,35 +100,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
|
||||
...], row_rank: int, col_rank: int,
|
||||
row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, data_parallel_rank: int,
|
||||
pipeline_parallel_rank: int, pipeline_parallel_size: int, tensor_parallel_size: int) -> Tensor:
|
||||
"""
|
||||
Classifier
|
||||
r"""Classifier.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param bias: matrix of bias
|
||||
:type bias: torch.tensor, optional
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
bias (:class:`torch.tensor`): matrix of bias.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int): the rank of row.
|
||||
col_rank (int): the rank of column.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank.
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Classifier2p5D.apply(A, B, bias, tesseract_dim, out_shape, row_rank, col_rank, row_parallel_mode,
|
||||
col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
|
||||
@ -136,35 +127,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
|
||||
|
||||
|
||||
class Matmul_AB_2p5D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = AB`
|
||||
r"""Matrix multiplication for :math:`C = AB`.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param dep_rank: the rank of depth
|
||||
:type dep_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int): the rank of row.
|
||||
col_rank (int): the rank of column.
|
||||
dep_rank (int): the rank of depth.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank.
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -270,35 +252,26 @@ class Matmul_AB_2p5D(torch.autograd.Function):
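As a mental model for ``Matmul_AB_2p5D`` (not its actual communication schedule), :math:`C = AB` can be assembled from block partitions: every output block is a sum of partial products, and in the parallel kernel that sum is realized by broadcasts and reductions over the row and column groups. A toy single-process sketch with a 2x2 block grid:

import torch

q = 2                                   # toy "tesseract" dimension
A = torch.randn(4, 6)
B = torch.randn(6, 8)

# split A and B into a q x q grid of blocks
A_blocks = [list(chunk.chunk(q, dim=1)) for chunk in A.chunk(q, dim=0)]
B_blocks = [list(chunk.chunk(q, dim=1)) for chunk in B.chunk(q, dim=0)]

# each output block C[i][j] is the sum over k of A[i][k] @ B[k][j];
# in the parallel layer this sum is what the collectives over the process grid compute
C_blocks = [[sum(A_blocks[i][k] @ B_blocks[k][j] for k in range(q)) for j in range(q)] for i in range(q)]
C = torch.cat([torch.cat(row, dim=1) for row in C_blocks], dim=0)

assert torch.allclose(C, A @ B, atol=1e-5)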
|
||||
|
||||
|
||||
class Matmul_ABT_2p5D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = AB^T`
|
||||
r"""Matrix multiplication for :math:`C = AB^T`.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param dep_rank: the rank of depth
|
||||
:type dep_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int): the rank of row.
|
||||
col_rank (int): the rank of column.
|
||||
dep_rank (int): the rank of depth.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank.
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -409,35 +382,26 @@ class Matmul_ABT_2p5D(torch.autograd.Function):
|
||||
|
||||
|
||||
class Matmul_ATB_2p5D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = A^TB`
|
||||
r"""Matrix multiplication for :math:`C = A^TB`
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param dep_rank: the rank of depth
|
||||
:type dep_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int): the rank of row.
|
||||
col_rank (int): the rank of column.
|
||||
dep_rank (int): the rank of depth.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank.
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -629,36 +593,27 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
|
||||
col_rank: int, dep_rank: int, col_parallel_mode: ParallelMode, skip_bias_add: bool,
|
||||
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
|
||||
tensor_parallel_size: int) -> Tensor:
|
||||
"""
|
||||
Matrix add bias: :math:`C = A + b`
|
||||
r"""Matrix add bias: :math:`C = A + b`.
|
||||
|
||||
:param input: matrix :math:`A`
|
||||
:type input: torch.tensor
|
||||
:param bias: matrix :math:`b`
|
||||
:type bias: torch.tensor
|
||||
:param output_size_per_partition: output size in each partition
|
||||
:type output_size_per_partition: int
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion
|
||||
:type skip_bias_add: bool
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
input (:class:`torch.tensor`): matrix :math:`A`.
|
||||
bias (:class:`torch.tensor`): matrix :math:`B`.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
output_size_per_partition (int): output size in each partition.
|
||||
row_rank (int): the rank of row.
|
||||
col_rank (int): the rank of column.
|
||||
dep_rank (int): the rank of depth.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank.
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Add_Bias_2p5D.apply(input, bias, output_size_per_partition, tesseract_dim, row_rank, col_rank, dep_rank,
|
||||
col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
|
||||
@ -666,19 +621,18 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
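The ``skip_bias_add`` flag follows the usual kernel-fusion convention: instead of adding the bias inside the layer, the un-biased output and the bias are returned separately so a later fused kernel can apply it. A schematic sketch of that calling convention; the helper name below is hypothetical, not part of the library:

import torch

def linear_with_optional_bias(x, weight, bias, skip_bias_add=False):
    # hypothetical helper illustrating the convention used by skip_bias_add callers
    out = x @ weight.t()
    if skip_bias_add:
        return out, bias          # defer the add so it can be fused downstream
    return out + bias, None

x, w, b = torch.randn(4, 8), torch.randn(16, 8), torch.randn(16)
out, deferred_bias = linear_with_optional_bias(x, w, b, skip_bias_add=True)
fused = torch.nn.functional.gelu(out + deferred_bias)   # bias applied inside a later "fused" step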
|
||||
|
||||
|
||||
class _Layernorm2p5D(torch.autograd.Function):
|
||||
"""
|
||||
Layernorm
|
||||
r"""Layernorm.
|
||||
|
||||
:param input: input maxtrix
|
||||
:type input: torch.tensor
|
||||
:param E_x: mean
|
||||
:type E_x: torch.tensor
|
||||
:param Var_x: variance
|
||||
:type Var_x: torch.tensor
|
||||
:param hidden_size: hidden size
|
||||
:type hidden_size: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
input (:class:`torch.tensor`): input matrix.
|
||||
E_x (:class:`torch.tensor`): mean.
|
||||
Var_x (:class:`torch.tensor`): variance.
|
||||
hidden_size (int): hidden size.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -718,19 +672,18 @@ class _Layernorm2p5D(torch.autograd.Function):
|
||||
|
||||
def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int,
row_parallel_mode: ParallelMode) -> Tensor:
"""
Layernorm
r"""Layernorm.

:param input: input maxtrix
:type input: torch.tensor
:param E_x: mean
:type E_x: torch.tensor
:param Var_x: variance
:type Var_x: torch.tensor
:param hidden_size: hidden size
:type hidden_size: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
Args:
input (:class:`torch.tensor`): input matrix.
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.

Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Layernorm2p5D.apply(input, E_x, Var_x, hidden_size, row_parallel_mode)
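Numerically, the 2.5D layernorm applies the usual normalization once the per-sample statistics have been reduced across the row parallel group; note that the exact convention for ``Var_x`` inside the kernel may differ (it may already hold the inverse standard deviation). A generic single-device sketch of the final step, with the distributed reduction omitted:

import torch

def layernorm_from_stats(x, mean, var, weight, bias, eps=1e-5):
    # generic layer norm once per-sample statistics are available;
    # in the 2.5D kernel these statistics come from a reduction over the row group
    x_hat = (x - mean) / torch.sqrt(var + eps)
    return x_hat * weight + bias

x = torch.randn(2, 16)
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)
w, b = torch.ones(16), torch.zeros(16)
out = layernorm_from_stats(x, mean, var, w, b)
assert torch.allclose(out, torch.nn.functional.layer_norm(x, (16,), w, b), atol=1e-5)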
@ -753,29 +706,31 @@ class _AllGatherTensor2p5D(torch.autograd.Function):
|
||||
|
||||
|
||||
def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
all gather the weight of 2.5D parallelism
|
||||
r"""all gather the weight of 2.5D parallelism.
|
||||
|
||||
:param inputs: input maxtrix
|
||||
:type inputs: torch.tensor
|
||||
:param dim: dimension of all gather
|
||||
:type dim: int
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
inputs (:class:`torch.tensor`): input tensor.
|
||||
dim (int): dimension of all-gather.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
return _AllGatherTensor2p5D.apply(inputs, dim, col_parallel_mode)
|
||||
|
||||
|
||||
class SplitFirst(torch.autograd.Function):
|
||||
"""
|
||||
:param inputs: input maxtrix
|
||||
:type inputs: torch.tensor
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
r"""
|
||||
|
||||
Args:
|
||||
inputs (:class:`torch.tensor`): input tensor.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -801,16 +756,14 @@ class SplitFirst(torch.autograd.Function):
|
||||
|
||||
|
||||
def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor:
"""Splits 2P5D tensor in specified dimension across cols
"""Splits 2P5D tensor in specified dimension across cols.

:param input_: Input tensor
:param dim: Specified dimension in which to split
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.

:type input_: torch.Tensor
:type dim: int, optional

:return output: Splitted tensor
:rtype output: torch.Tensor
Returns:
:class:`torch.tensor`: The tensor has been split.
"""
if input_.size(dim) <= 1:
return input_
@ -829,11 +782,15 @@ class _ReduceTensor2p5D(torch.autograd.Function):
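Locally, this kind of split amounts to ``torch.chunk`` along the chosen dimension with each rank keeping its own shard. A toy emulation with a hypothetical helper and no real process group:

import torch

def local_shard(tensor, dim, world_size, rank):
    # what each rank would keep after a split along `dim`
    return tensor.chunk(world_size, dim=dim)[rank].contiguous()

x = torch.arange(12).reshape(4, 3)
print(local_shard(x, dim=0, world_size=2, rank=0))   # rows 0-1
print(local_shard(x, dim=0, world_size=2, rank=1))   # rows 2-3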
|
||||
|
||||
|
||||
def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
All-reduce the input.
|
||||
r"""All-reduce the input.
|
||||
|
||||
:param input_: input tensor
|
||||
:param parallel_mode: parallel mode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): Input tensor.
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceTensor2p5D.apply(input_, parallel_mode)
|
||||
|
||||
@ -851,11 +808,16 @@ class _ReduceScatterTensor2p5D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
Reduce-scatter the input.
|
||||
r"""Reduce-scatter the input.
|
||||
|
||||
:param input_: input tensor
|
||||
:param parallel_mode: parallel mode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Dimension to reduce.
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceScatterTensor2p5D.apply(input_, dim, parallel_mode)
|
||||
|
||||
@ -890,12 +852,11 @@ class _RreduceByBatch2p5D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_by_batch_2p5d(input_, reduce_mean: bool = False) -> Tensor:
"""
All-reduce the input from the model parallel region.
r"""All-reduce the input from the model parallel region.

:param input_: input maxtrix
:type input_: torch.tensor
:param reduce_mean: If set to ``True``, it will divide the output by column parallel size, default to False
:type reduce_mean: bool, optional
Args:
input_ (:class:`torch.tensor`): input matrix.
reduce_mean (bool, optional):
If set to ``True``, it will divide the output by column parallel size, default to False.
"""
return _RreduceByBatch2p5D.apply(input_, reduce_mean)
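With ``reduce_mean=True`` the operation is an all-reduce followed by a division by the column parallel world size, which turns per-rank partial sums (e.g. of a loss or a metric) into an average. A single-process emulation of that arithmetic:

import torch

# pretend these are the per-rank values living in a column parallel group of size 4
per_rank_values = [torch.tensor(2.0), torch.tensor(3.0), torch.tensor(5.0), torch.tensor(6.0)]

summed = torch.stack(per_rank_values).sum()      # what all-reduce(SUM) would leave on every rank
mean = summed / len(per_rank_values)             # the extra division done when reduce_mean=True
print(summed.item(), mean.item())                # 16.0 4.0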
@ -23,21 +23,22 @@ from ._utils import assert_tesseract_initialization, get_tesseract_dim_dep_from_
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear2p5D(ParallelLayer):
|
||||
"""
|
||||
Linear layer for 2.5D parallelism
|
||||
r"""Linear layer for 2.5D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param out_features: size of each output sample
|
||||
:type out_features: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
@ -131,19 +132,16 @@ class Linear2p5D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class LayerNorm2p5D(ParallelLayer):
|
||||
r"""
|
||||
Layer Normalization for 2.5D parallelism
|
||||
r"""Layer Normalization for 2.5D parallelism.
|
||||
|
||||
:param normalized_shape: input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:type normalized_shape: int
|
||||
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
|
||||
:type eps: float, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
|
||||
super().__init__()
|
||||
@ -204,27 +202,24 @@ class LayerNorm2p5D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class PatchEmbedding2p5D(ParallelLayer):
|
||||
"""
|
||||
2D Image to Patch Embedding
|
||||
r"""2D Image to Patch Embedding.
|
||||
|
||||
:param img_size: image size
|
||||
:type img_size: int
|
||||
:param patch_size: patch size
|
||||
:type patch_size: int
|
||||
:param in_chans: number of channels of input image
|
||||
:type in_chans: int
|
||||
:param embed_size: size of embedding
|
||||
:type embed_size: int
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param flatten: whether to flatten output tensor, defaults to True
|
||||
:type flatten: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
:param position_embed_initializer: The intializer of position embedding, defaults to zero
|
||||
:type position_embed_initializer: typing.Callable, optional
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
img_size: int,
|
||||
@ -306,21 +301,33 @@ class PatchEmbedding2p5D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Embedding2p5D(ParallelLayer):
|
||||
"""
|
||||
Embedding for 2.5D parallelism
|
||||
r"""Embedding for 2.5D parallelism.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
|
||||
"""
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
@ -376,18 +383,31 @@ class Embedding2p5D(ParallelLayer):
|
||||
class VocabParallelEmbedding2p5D(torch.nn.Module):
|
||||
"""Embedding parallelized in the vocabulary dimension.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
@ -455,23 +475,21 @@ class VocabParallelEmbedding2p5D(torch.nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Classifier2p5D(ParallelLayer):
|
||||
"""
|
||||
Classifier for 2.5D parallelism
|
||||
r"""Classifier for 2.5D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
@ -537,23 +555,21 @@ class Classifier2p5D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelClassifier2p5D(ParallelLayer):
|
||||
"""
|
||||
Vocab parallel classifier layer for 2.5D parallelism
|
||||
r"""Vocab parallel classifier layer for 2.5D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
|
@ -88,27 +88,22 @@ def linear_3d(input_: Tensor,
|
||||
input_dim: int = 0,
|
||||
weight_dim: int = -1,
|
||||
output_dim: int = 0) -> Tensor:
|
||||
"""
|
||||
Linear layer for 3D parallelism
|
||||
r"""Linear layer for 3D parallelism.
|
||||
|
||||
:param input_: matrix of input
|
||||
:type input_: torch.tensor
|
||||
:param weight: matrix of weight
|
||||
:type weight: torch.tensor
|
||||
:param bias: matrix of bias
|
||||
:type bias: torch.tensor, optional
|
||||
:param input_parallel_mode: input parallel mode
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: weight parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param output_parallel_mode: output parallel mode
|
||||
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param input_dim: dimension of input, defaults to 0
|
||||
:type input_dim: int, optional
|
||||
:param weight_dim: dimension of weight, defaults to -1
|
||||
:type weight_dim: int, optional
|
||||
:param output_dim: dimension of output, defaults to 0
|
||||
:type output_dim: int, optional
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
weight (:class:`torch.tensor`): matrix of weight.
|
||||
bias (:class:`torch.tensor`): matrix of bias.
|
||||
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
|
||||
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
|
||||
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
|
||||
input_dim (int, optional): dimension of input, defaults to 0.
|
||||
weight_dim (int, optional): dimension of weight, defaults to -1.
|
||||
output_dim (int, optional): dimension of output, defaults to 0.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Linear3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode,
|
||||
input_dim, weight_dim, output_dim)
|
||||
@ -174,21 +169,19 @@ class _Classifier3D(torch.autograd.Function):
|
||||
|
||||
def classifier_3d(input_: Tensor, weight: Tensor, bias: Optional[Tensor], input_parallel_mode: ParallelMode,
|
||||
weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
3D parallel classifier
|
||||
r"""3D parallel classifier.
|
||||
|
||||
:param input_: matrix of input
|
||||
:type input_: torch.tensor
|
||||
:param weight: matrix of weight
|
||||
:type weight: torch.tensor
|
||||
:param bias: matrix of bias
|
||||
:type bias: torch.tensor, optional
|
||||
:param input_parallel_mode: input parallel mode
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: weight parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param output_parallel_mode: output parallel mode
|
||||
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
weight (:class:`torch.tensor`): matrix of weight.
|
||||
bias (:class:`torch.tensor`): matrix of bias.
|
||||
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
|
||||
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
|
||||
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Classifier3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode)
|
||||
|
||||
@ -244,48 +237,44 @@ class _Layernorm3D(torch.autograd.Function):
|
||||
def layernorm_3d(input_: Tensor, weight: Tensor, bias: Tensor, normalized_shape: int, eps: float,
|
||||
input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode,
|
||||
output_parallel_mode: ParallelMode) -> Tensor:
|
||||
r"""
|
||||
3D parallel Layernorm
|
||||
r"""3D parallel Layernorm.
|
||||
|
||||
:param input_: input maxtrix
|
||||
:type input_: torch.tensor
|
||||
:param weight: matrix of weight
|
||||
:type weight: torch.tensor
|
||||
:param bias: matrix of bias
|
||||
:type bias: torch.tensor
|
||||
:param normalized_shape: input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:type normalized_shape: int
|
||||
:param eps: a value added to the denominator for numerical stability
|
||||
:type eps: float
|
||||
:param input_parallel_mode: input parallel mode
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: weight parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param output_parallel_mode: output parallel mode
|
||||
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
weight (:class:`torch.tensor`): matrix of weight.
|
||||
bias (:class:`torch.tensor`): matrix of bias.
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float): a value added to the denominator for numerical stability
|
||||
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
|
||||
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
|
||||
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Layernorm3D.apply(input_, weight, bias, normalized_shape, eps, input_parallel_mode, weight_parallel_mode,
|
||||
output_parallel_mode)
|
||||
|
||||
|
||||
def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""Splits 3D parallel tensor in specified dimension
|
||||
r"""Splits 3D parallel tensor in specified dimension.
|
||||
|
||||
:param tensor: Input tensor
|
||||
:param dim: Specified dimension in which to split
|
||||
:param parallel_mode: Parallel mode
|
||||
:param weight_parallel_mode: Weight parallel mode
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Specified dimension in which to split.
|
||||
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode.
|
||||
|
||||
:type tensor: torch.Tensor
|
||||
:type dim: int
|
||||
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Returns:
|
||||
:class:`torch.tensor`: The tensor has been split.
|
||||
|
||||
:return output: Splitted tensor
|
||||
:rtype output: torch.Tensor
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
if tensor.size(dim) <= 1:
|
||||
return tensor
|
||||
@ -298,17 +287,20 @@ def split_batch_3d(input_: Tensor,
|
||||
dim: int = 0,
|
||||
input_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_INPUT,
|
||||
weight_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_WEIGHT) -> Tensor:
|
||||
"""Splits 3D tensor in batch
|
||||
:param input_: Input tensor
|
||||
:param dim: Specified dimension in which to split
|
||||
:param input_parallel_mode: Input parallel mode
|
||||
:param weight_parallel_mode: Weight parallel mode
|
||||
:type input_: torch.Tensor
|
||||
:type dim: int, optional
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
|
||||
:return output: Splitted tensor
|
||||
:rtype output: torch.Tensor
|
||||
r"""Splits 3D tensor in batch.
|
||||
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Specified dimension in which to split.
|
||||
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): input parallel mode.
|
||||
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): weight parallel mode.
|
||||
|
||||
Returns:
|
||||
:class:`torch.tensor`: The tensor has been split.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
if input_.size(dim) <= 1:
|
||||
return input_
|
||||
@ -333,11 +325,15 @@ class _ReduceTensor3D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
All-reduce the input
|
||||
r"""All-reduce the input
|
||||
|
||||
:param tensor: Input tensor
|
||||
:param parallel_mode: Parallel mode
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
return _ReduceTensor3D.apply(tensor, parallel_mode)
|
||||
|
||||
@ -358,11 +354,16 @@ class _AllGatherTensor3D(torch.autograd.Function):
|
||||
|
||||
|
||||
def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
All-reduce the gradient in backward pass.
|
||||
r"""All-reduce the gradient in backward pass.
|
||||
|
||||
:param tensor: Input tensor
|
||||
:param parallel_mode: Parallel mode
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Dimension to gather.
|
||||
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
return _AllGatherTensor3D.apply(tensor, dim, parallel_mode)
|
||||
|
||||
@ -382,12 +383,16 @@ class _ReduceScatterTensor3D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
Reduce-scatter the input.
|
||||
r"""Reduce-scatter the input.
|
||||
|
||||
:param tensor: Input tensor
|
||||
:param dim: Dimension to scatter
|
||||
:param parallel_mode: Parallel mode
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Dimension to scatter.
|
||||
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceScatterTensor3D.apply(tensor, dim, parallel_mode)
|
||||
|
||||
@ -423,34 +428,33 @@ def reduce_by_batch_3d(tensor: Tensor,
|
||||
input_parallel_mode: ParallelMode,
|
||||
weight_parallel_mode: ParallelMode,
|
||||
reduce_mean: bool = False) -> Tensor:
|
||||
"""
|
||||
All-reduce the input from the model parallel region.
|
||||
r"""All-reduce the input from the model parallel region.
|
||||
|
||||
:param input_: input maxtrix
|
||||
:type input_: torch.tensor
|
||||
:param input_parallel_mode: input parallel mode
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: weight parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param reduce_mean: If set to ``True``, it will divide the output by (input parallel size * weight parallel size),
|
||||
default to False
|
||||
:type reduce_mean: int, optional
|
||||
Args:
|
||||
tensor (:class:`torch.Tensor`): Input tensor.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
reduce_mean (bool, optional): If set to ``True``, it will divide the output by
(input parallel size * weight parallel size), defaults to False.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceByBatch3D.apply(tensor, input_parallel_mode, weight_parallel_mode, reduce_mean)
|
||||
|
||||
|
||||
class _BroadcastWeight3D_FromDiagonal(torch.autograd.Function):
|
||||
"""
|
||||
broadcast weight from diagonal
|
||||
r"""broadcast weight from diagonal.
|
||||
|
||||
:param input_: input maxtrix
|
||||
:type input_: torch.tensor
|
||||
:param input_parallel_mode: input parallel mode
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: weight parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: output parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
|
||||
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
|
||||
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
|
@ -24,19 +24,16 @@ from ._utils import get_depth_from_env, get_last_group, get_parallel_mode_from_e
|
||||
|
||||
@LAYERS.register_module
|
||||
class LayerNorm3D(ParallelLayer):
|
||||
r"""
|
||||
Layer Normalization for 3D parallelism
|
||||
r"""Layer Normalization for 3D parallelism.
|
||||
|
||||
:param normalized_shape: input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:type normalized_shape: int
|
||||
:param eps: a value added to the denominator for numerical stability, defaults to 1e-12
|
||||
:type eps: float, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-12.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, normalized_shape: int, eps: float = 1e-12, dtype=None):
|
||||
@ -71,21 +68,20 @@ class LayerNorm3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear3D(ParallelLayer):
|
||||
"""
|
||||
Linear layer for 3D parallelism
|
||||
r"""Linear layer for 3D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param out_features: size of each output sample
|
||||
:type out_features: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -146,23 +142,21 @@ class Linear3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Classifier3D(ParallelLayer):
|
||||
"""
|
||||
Classifier for 3D parallelism
|
||||
r"""Classifier for 3D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -225,23 +219,21 @@ class Classifier3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelClassifier3D(ParallelLayer):
|
||||
"""
|
||||
Vocab parallel classifier layer for 2D parallelism
|
||||
r"""Vocab parallel classifier layer for 3D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -311,27 +303,24 @@ class VocabParallelClassifier3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class PatchEmbedding3D(ParallelLayer):
|
||||
"""
|
||||
2D Image to Patch Embedding
|
||||
r"""2D Image to Patch Embedding.
|
||||
|
||||
:param img_size: image size
|
||||
:type img_size: int
|
||||
:param patch_size: patch size
|
||||
:type patch_size: int
|
||||
:param in_chans: number of channels of input image
|
||||
:type in_chans: int
|
||||
:param embed_size: size of embedding
|
||||
:type embed_size: int
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param flatten: whether to flatten output tensor, defaults to True
|
||||
:type flatten: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
:param position_embed_initializer: The intializer of position embedding, defaults to zero
|
||||
:type position_embed_initializer: typing.Callable, optional
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -419,21 +408,33 @@ class PatchEmbedding3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Embedding3D(ParallelLayer):
|
||||
"""
|
||||
Embedding for 3D parallelism
|
||||
r"""Embedding for 3D parallelism.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -491,20 +492,33 @@ class Embedding3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelEmbedding3D(torch.nn.Module):
|
||||
"""Embedding parallelized in the vocabulary dimension.
|
||||
r"""Embedding parallelized in the vocabulary dimension.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -24,14 +24,13 @@ class TransformerSelfAttentionRing(nn.Module):
|
||||
Self-attention layer takes input with size [b, s, h]
|
||||
and returns output of the same size.
|
||||
|
||||
:param hidden_size: hidden size
|
||||
:type hidden_size: int
|
||||
:param kv_channels: channels of key/value tensor
|
||||
:type kv_channels: int
|
||||
:param num_attention_heads: number of attention heads
|
||||
:type num_attention_heads: int
|
||||
:param attention_dropout: dropout probability for attention layer
|
||||
:type attention_dropout: float
|
||||
Args:
|
||||
hidden_size (int): hidden size.
|
||||
num_attention_heads (int): number of attention heads.
|
||||
attention_dropout (float): dropout probability for attention layer.
|
||||
attention_mask_func (:class:`typing.Callable`): Mask function to be applied.
|
||||
layer_number (int): number of layers.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -38,11 +38,16 @@ class CheckpointModule(nn.Module):
|
||||
|
||||
|
||||
def divide(numerator, denominator):
|
||||
"""Only allow exact division
|
||||
"""Only allow exact division.
|
||||
|
||||
:param numerator: Numerator of the division
|
||||
:param denominator: Denominator of the division
|
||||
Args:
|
||||
numerator (int): Numerator of the division.
|
||||
denominator (int): Denominator of the division.
|
||||
|
||||
Returns:
|
||||
int: the result of exact division.
|
||||
"""
|
||||
assert denominator != 0, 'denominator can not be zero'
|
||||
assert numerator % denominator == 0, \
|
||||
'{} is not divisible by {}'.format(numerator, denominator)
|
||||
return numerator // denominator
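Using the function defined above, the contract is exact division only::

    assert divide(12, 4) == 3    # exact division succeeds
    # divide(12, 5) raises AssertionError: "12 is not divisible by 5"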
|
||||
|
@ -15,11 +15,16 @@ from ..utils import to_2tuple
|
||||
|
||||
def drop_path(x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.

Args:
drop_prob (float, optional): probability of dropping path, defaults to 0.0.
training (bool, optional): whether in training mode, defaults to False.
"""
if drop_prob == 0. or not training:
return x
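The hunk above cuts off after the early return; the remainder of the computation follows the widely used timm formulation, sketched below (a sketch with illustrative names, not necessarily the exact code in this file)::

    import torch

    def drop_path_sketch(x: torch.Tensor, drop_prob: float = 0., training: bool = False) -> torch.Tensor:
        if drop_prob == 0. or not training:
            return x
        keep_prob = 1.0 - drop_prob
        # one Bernoulli draw per sample, broadcast over the remaining dims
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()                  # binarize to 0/1
        return x.div(keep_prob) * random_tensor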
|
||||
@ -35,6 +40,9 @@ class DropPath(nn.Module):
|
||||
"""
|
||||
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||
Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
|
||||
|
||||
Args:
|
||||
drop_prob (float, optional): probability of dropping path, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, drop_prob=None):
|
||||
@ -46,7 +54,19 @@ class DropPath(nn.Module):
|
||||
|
||||
|
||||
class WrappedDropout(nn.Module):
|
||||
"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager.
|
||||
r"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager. During training, randomly zeroes
|
||||
some elements of the input tensor with probability p using samples from a Bernoulli distribution. Each
|
||||
channel will be zeroed out independently on every forward call. Furthermore, the outputs are scaled by a factor of
|
||||
1/(1-p) during training. This means that during evaluation the module simply computes an identity function.
|
||||
|
||||
Args:
|
||||
p (float, optional): probability of an element to be zeroed, defaults to 0.5.
inplace (bool, optional): whether to do dropout in-place, defaults to False.
|
||||
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self, p: float = 0.5, inplace: bool = False, mode=None):
|
||||
@ -74,8 +94,16 @@ class WrappedDropout(nn.Module):
|
||||
|
||||
|
||||
class WrappedDropPath(nn.Module):
|
||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||
r"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||
Here, it is wrapped with the context of seed manager.
|
||||
|
||||
Args:
|
||||
p (float, optional): probability of dropping path, defaults to 0.0.
|
||||
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self, p: float = 0., mode=None):
|
||||
@ -101,27 +129,25 @@ class WrappedDropPath(nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VanillaPatchEmbedding(nn.Module):
|
||||
"""
|
||||
r"""
|
||||
2D Image to Patch Embedding
|
||||
|
||||
:param img_size: image size
|
||||
:type img_size: int
|
||||
:param patch_size: patch size
|
||||
:type patch_size: int
|
||||
:param in_chans: number of channels of input image
|
||||
:type in_chans: int
|
||||
:param embed_size: size of embedding
|
||||
:type embed_size: int
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param flatten: whether to flatten output tensor, defaults to True
|
||||
:type flatten: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
:param position_embed_initializer: The intializer of position embedding, defaults to zero
|
||||
:type position_embed_initializer: typing.Callable, optional
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -174,23 +200,21 @@ class VanillaPatchEmbedding(nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VanillaClassifier(nn.Module):
|
||||
"""
|
||||
Dense linear classifier
|
||||
r"""Dense linear classifier.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -9,12 +9,11 @@ from colossalai.registry import LAYERS
|
||||
|
||||
@LAYERS.register_module
|
||||
class LambdaWrapper(nn.Module):
|
||||
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them
|
||||
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them.
|
||||
|
||||
:param func: User customed function
|
||||
:type func: Callable
|
||||
:param layers_cfg: Config of layers, defaults to None
|
||||
:type layers_cfg: dict, optional
|
||||
Args:
|
||||
func (``Callable``): User-defined function.
|
||||
layers_cfg (dict, optional): Config of layers, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, func, layers_cfg: dict = None):
|
||||
|
@ -86,12 +86,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
|
||||
|
||||
@LOSSES.register_module
|
||||
class VocabParallelCrossEntropyLoss1D(_Loss):
|
||||
"""
|
||||
Vocab parallel cross entropy loss for 1D parallelism
|
||||
"""Vocab parallel cross entropy loss for 1D parallelism.
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
|
||||
:type reduction: bool, optional
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
"""
|
||||
|
||||
def __init__(self, reduction=True):
|
||||
@ -99,10 +97,11 @@ class VocabParallelCrossEntropyLoss1D(_Loss):
|
||||
self.reduction_mean = reduction
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
loss = _VocabParallelCrossEntropy1D.apply(logits, targets)
|
||||
if self.reduction_mean:
|
||||
|
@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
|
||||
|
||||
@LOSSES.register_module
|
||||
class CrossEntropyLoss2D(_Loss):
|
||||
"""
|
||||
Cross entropy loss for 2D parallelism
|
||||
r"""Cross entropy loss for 2D parallelism
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
:param args: Args for loss function
|
||||
:param kwargs: Kwargs for loss function
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
|
||||
:type reduction: bool, optional
|
||||
The ``args`` and ``kwargs`` should include parameters below:
|
||||
::
|
||||
|
||||
weight (Tensor, optional)
|
||||
size_average (bool, optional)
|
||||
ignore_index (int, optional)
|
||||
reduce (bool, optional)
|
||||
label_smoothing (float, optional)
|
||||
|
||||
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
|
||||
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
|
||||
"""
|
||||
|
||||
def __init__(self, reduction=True, *args, **kwargs):
|
||||
@ -31,10 +39,14 @@ class CrossEntropyLoss2D(_Loss):
|
||||
self.loss_kwargs = kwargs
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
|
||||
Returns:
|
||||
float: the loss between logits and targets.
|
||||
"""
|
||||
targets = split_tensor_2d(targets)
|
||||
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
|
||||
@ -116,12 +128,10 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
|
||||
|
||||
@LOSSES.register_module
|
||||
class VocabParallelCrossEntropyLoss2D(_Loss):
|
||||
"""
|
||||
Vocab parallel cross entropy loss for 2D parallelism
|
||||
"""Vocab parallel cross entropy loss for 2D parallelism.
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
|
||||
:type reduction: bool, optional
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
"""
|
||||
|
||||
def __init__(self, reduction=True):
|
||||
@ -129,10 +139,11 @@ class VocabParallelCrossEntropyLoss2D(_Loss):
|
||||
self.reduction_mean = reduction
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
targets = split_tensor_2d(targets)
|
||||
loss = _VocabParallelCrossEntropy2D.apply(
|
||||
|
@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
|
||||
|
||||
@LOSSES.register_module
|
||||
class CrossEntropyLoss2p5D(_Loss):
|
||||
"""
|
||||
Cross entropy loss for 2.5D parallelism
|
||||
r"""Cross entropy loss for 2.5D parallelism
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
:param args: Args for loss function
|
||||
:param kwargs: Kwargs for loss function
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
|
||||
:type reduction: bool, optional
|
||||
The ``args`` and ``kwargs`` should include parameters below:
|
||||
::
|
||||
|
||||
weight (Tensor, optional)
|
||||
size_average (bool, optional)
|
||||
ignore_index (int, optional)
|
||||
reduce (bool, optional)
|
||||
label_smoothing (float, optional)
|
||||
|
||||
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
|
||||
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
|
||||
"""
|
||||
def __init__(self, reduction=True, *args, **kwargs):
|
||||
super().__init__()
|
||||
@ -30,10 +38,11 @@ class CrossEntropyLoss2p5D(_Loss):
|
||||
self.loss_kwargs = kwargs
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
targets = split_tensor_2p5d(targets)
|
||||
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
|
||||
@ -115,19 +124,19 @@ class VocabParallelCrossEntropyLoss2p5D(_Loss):
|
||||
"""
|
||||
Vocab parallel cross entropy loss for 2.5D parallelism
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
|
||||
:type reduction: bool, optional
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
"""
|
||||
def __init__(self, reduction=True):
|
||||
super().__init__()
|
||||
self.reduction_mean = reduction
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
targets = split_tensor_2p5d(targets)
|
||||
loss = _VocabParallelCrossEntropy2p5D.apply(logits, targets)
|
||||
|
@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
|
||||
|
||||
@LOSSES.register_module
|
||||
class CrossEntropyLoss3D(_Loss):
|
||||
"""
|
||||
Cross entropy loss for 3D parallelism
|
||||
r"""Cross entropy loss for 3D parallelism.
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
:param args: Args for loss function
|
||||
:param kwargs: Kwargs for loss function
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
|
||||
:type reduction: bool, optional
|
||||
The ``args`` and ``kwargs`` should include parameters below:
|
||||
::
|
||||
|
||||
weight (Tensor, optional)
|
||||
size_average (bool, optional)
|
||||
ignore_index (int, optional)
|
||||
reduce (bool, optional)
|
||||
label_smoothing (float, optional)
|
||||
|
||||
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
|
||||
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
|
||||
"""
|
||||
|
||||
def __init__(self, reduction=True, *args, **kwargs):
|
||||
@ -32,10 +40,11 @@ class CrossEntropyLoss3D(_Loss):
|
||||
self.loss_kwargs = kwargs
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
|
||||
targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
|
||||
@ -109,12 +118,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
|
||||
|
||||
@LOSSES.register_module
|
||||
class VocabParallelCrossEntropyLoss3D(_Loss):
|
||||
"""
|
||||
Vocab parallel cross entropy loss for 2D parallelism
|
||||
"""Vocab parallel cross entropy loss for 2D parallelism.
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
|
||||
:type reduction: bool, optional
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
"""
|
||||
|
||||
def __init__(self, reduction=True):
|
||||
@ -125,10 +132,11 @@ class VocabParallelCrossEntropyLoss3D(_Loss):
|
||||
self.reduction_mean = reduction
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
|
||||
targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
|
||||
|
@ -6,13 +6,25 @@ from colossalai.context.moe_context import MOE_CONTEXT
|
||||
|
||||
@LOSSES.register_module
|
||||
class MoeCrossEntropyLoss(_Loss):
|
||||
"""torch.nn.CrossEntropyLoss added with auxiliary loss.
|
||||
r"""torch.nn.CrossEntropyLoss added with auxiliary loss.
|
||||
|
||||
:param aux_weight: Weight of auxiliary loss in total loss
|
||||
:param args: Args in CrossEntropyLoss
|
||||
:param kwargs: Kwargs in CrossEntropyLoss
|
||||
Args:
|
||||
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
aux_weight (float, optional): Weight of auxiliary loss in total loss, defaults to 0.01.
|
||||
|
||||
:type aux_weight: float, optional
|
||||
The ``args`` and ``kwargs`` should include parameters below:
|
||||
::
|
||||
|
||||
weight (Tensor, optional)
|
||||
size_average (bool, optional)
|
||||
ignore_index (int, optional)
|
||||
reduce (bool, optional)
|
||||
reduction (str, optional)
|
||||
label_smoothing (float, optional)
|
||||
|
||||
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
|
||||
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
|
||||
"""
|
||||
|
||||
def __init__(self, aux_weight: float = 0.01, *args, **kwargs):
|
||||
@ -21,6 +33,16 @@ class MoeCrossEntropyLoss(_Loss):
|
||||
self.aux_weight = aux_weight
|
||||
|
||||
def forward(self, *args):
|
||||
"""
|
||||
The ``args`` should at least include parameters below:
|
||||
::
|
||||
|
||||
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
|
||||
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
|
||||
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
|
||||
"""
|
||||
main_loss = self.loss(*args)
|
||||
aux_loss = MOE_CONTEXT.get_loss()
|
||||
return main_loss + self.aux_weight * aux_loss
|
||||
@ -30,13 +52,11 @@ class MoeCrossEntropyLoss(_Loss):
|
||||
class MoeLoss(_Loss):
|
||||
"""A wrapper class for any loss module to add with auxiliary loss.
|
||||
|
||||
:param aux_weight: Weight of auxiliary loss in total loss
|
||||
:param loss_fn: Loss function
|
||||
:param args: Args in loss function
|
||||
:param kwargs: Kwargs in loss function
|
||||
|
||||
:type aux_weight: float
|
||||
:type loss_fn: Callable
|
||||
Args:
|
||||
aux_weight (float): Weight of auxiliary loss in total loss.
|
||||
loss_fn (``Callable``): Loss function.
|
||||
args (list): Args in loss function.
|
||||
kwargs (dict): Kwargs in loss function
|
||||
"""
|
||||
|
||||
def __init__(self, aux_weight: float, loss_fn, *args, **kwargs):
|
||||
@ -45,6 +65,16 @@ class MoeLoss(_Loss):
|
||||
self.aux_weight = aux_weight
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
"""
|
||||
The ``args`` and ``kwargs`` should at least include parameters below:
|
||||
::
|
||||
|
||||
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
|
||||
Note:
|
||||
The ``args`` and ``kwargs`` may include different parameters varying with different loss function.
|
||||
"""
|
||||
main_loss = self.loss_fn(*args, **kwargs)
|
||||
aux_loss = MOE_CONTEXT.get_loss()
|
||||
return main_loss + self.aux_weight * aux_loss
|
||||
|
@ -36,14 +36,12 @@ class CosineAnnealingLR(_CosineAnnealingLR):
|
||||
.. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
|
||||
https://arxiv.org/abs/1608.03983
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param eta_min: Minimum learning rate, defaults to 0
|
||||
:type eta_min: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
eta_min (int, optional): Minimum learning rate, defaults to 0.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, eta_min: int = 0, last_epoch: int = -1, **kwargs):
|
||||
@ -54,16 +52,13 @@ class CosineAnnealingLR(_CosineAnnealingLR):
|
||||
class CosineAnnealingWarmupLR(WarmupScheduler):
|
||||
"""Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param warmup_steps: Number of warmup steps, defaults to 0
|
||||
:type warmup_steps: int, optional
|
||||
:param eta_min: Minimum learning rate, defaults to 0
|
||||
:type eta_min: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
|
||||
eta_min (int, optional): Minimum learning rate, defaults to 0.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: float = 0., last_epoch: int = -1):
|
||||
@ -76,14 +71,12 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
|
||||
class FlatAnnealingLR(DelayerScheduler):
|
||||
"""Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param pct_start: Percent of steps before starting learning rate decay
|
||||
:type pct_start: float
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, pct_start: float = 0.72, last_epoch: int = -1, **kwargs):
|
||||
@ -102,18 +95,14 @@ class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
|
||||
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
|
||||
applied, and then the learning rate will be a fixed value before starting decay.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param warmup_steps: Number of warmup steps, defaults to 0
|
||||
:type warmup_steps: int, optional
|
||||
:param pct_start: Percent of steps before starting learning rate decay
|
||||
:type pct_start: float
|
||||
:param eta_min: Minimum learning rate, defaults to 0
|
||||
:type eta_min: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
|
||||
pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
eta_min (int, optional): Minimum learning rate, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, pct_start: float = 0.72, eta_min: int = 0,
|
||||
|
@ -14,16 +14,15 @@ class _enable_get_lr_call:
|
||||
|
||||
|
||||
class DelayerScheduler(_LRScheduler):
|
||||
""" Starts with a flat lr schedule until it reaches N epochs the applies a scheduler
|
||||
"""Starts with a flat lr schedule until it reaches N epochs then applies
|
||||
the specific scheduler (For example: ReduceLROnPlateau)
|
||||
|
||||
:param optimizer: Wrapped optimizer.
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler
|
||||
:type delay_epochs: int
|
||||
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
|
||||
:type after_scheduler: torch.optim.lr_scheduler
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
|
||||
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, delay_epochs, after_scheduler, last_epoch=-1):
|
||||
@ -57,16 +56,15 @@ class DelayerScheduler(_LRScheduler):
|
||||
|
||||
|
||||
class WarmupScheduler(_LRScheduler):
|
||||
""" Starts with a linear warmup lr schedule until it reaches N epochs the applies a scheduler
|
||||
"""Starts with a linear warmup lr schedule until it reaches N epochs then applies
|
||||
the specific scheduler (For example: ReduceLROnPlateau).
|
||||
|
||||
:param optimizer: Wrapped optimizer.
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler
|
||||
:type warmup_epochs: int
|
||||
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
|
||||
:type after_scheduler: torch.optim.lr_scheduler
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
|
||||
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1):
|
||||
@ -97,18 +95,16 @@ class WarmupScheduler(_LRScheduler):
|
||||
|
||||
|
||||
class WarmupDelayerScheduler(_LRScheduler):
|
||||
""" Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule until it reaches M epochs the applies a scheduler
|
||||
"""Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule
|
||||
until it reaches M epochs then applies the specific scheduler (For example: ReduceLROnPlateau).
|
||||
|
||||
:param optimizer: Wrapped optimizer.
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler
|
||||
:type warmup_epochs: int
|
||||
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler
|
||||
:type delay_epochs: int
|
||||
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
|
||||
:type after_scheduler: torch.optim.lr_scheduler
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
|
||||
delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
|
||||
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, warmup_epochs, delay_epochs, after_scheduler, last_epoch=-1):
|
||||
|
@ -5,16 +5,14 @@ from colossalai.registry import LR_SCHEDULERS
|
||||
|
||||
@LR_SCHEDULERS.register_module
|
||||
class LinearWarmupLR(_LRScheduler):
|
||||
"""Linearly warmup learning rate and then linearly decay
|
||||
"""Linearly warmup learning rate and then linearly decay.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param warmup_steps: Number of warmup steps, defaults to 0
|
||||
:type warmup_steps: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, last_epoch: int = -1, **kwargs):
|
||||
|
@ -13,18 +13,13 @@ class MultiStepLR(_MultiStepLR):
|
||||
happen simultaneously with other changes to the learning rate from outside
|
||||
this scheduler. When last_epoch=-1, sets initial lr as lr.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param milestones: List of epoch indices. Must be increasing, defaults to None
|
||||
:type milestones: List[int], optional
|
||||
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
|
||||
:type gamma: float, optional
|
||||
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
|
||||
:type num_steps_per_epoch: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
|
||||
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1, last_epoch: int = -1, **kwargs):
|
||||
@ -33,22 +28,17 @@ class MultiStepLR(_MultiStepLR):
|
||||
|
||||
@LR_SCHEDULERS.register_module
|
||||
class MultiStepWarmupLR(WarmupScheduler):
|
||||
"""Multi-step laerning rate scheduler with warmup.
|
||||
"""Multistep learning rate scheduler with warmup.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param warmup_steps: Number of warmup steps, defaults to 0
|
||||
:type warmup_steps: int, optional
|
||||
:param milestones: List of epoch indices. Must be increasing, defaults to None
|
||||
:type milestones: List[int], optional
|
||||
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
|
||||
:type gamma: float, optional
|
||||
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
|
||||
:type num_steps_per_epoch: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
|
||||
milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
|
||||
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
|
||||
num_steps_per_epoch (int, optional): Number of steps per epoch, defaults to -1.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, milestones: List[int] = None,
|
||||
|
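For reference, the milestone behaviour documented above matches PyTorch's native ``MultiStepLR``; a short, self-contained example:

import torch

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# lr is multiplied by gamma each time a milestone is reached
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 80], gamma=0.1)

for _ in range(100):
    optimizer.step()
    scheduler.step()
# lr: 0.1 before step 30, 0.01 from step 30, 0.001 from step 80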
@ -28,43 +28,41 @@ class OneCycleLR(_OneCycleLR):
|
||||
claims that "unpublished work has shown even better results by using only two phases". To
|
||||
mimic the behaviour of the original paper instead, set ``three_phase=True``.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param pct_start: The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3
|
||||
:type pct_start: float, optional
|
||||
:param anneal_strategy: {'cos', 'linear'}
|
||||
Specifies the annealing strategy: "cos" for cosine annealing, "linear" for
|
||||
linear annealing, defaults to 'cos'
|
||||
:type anneal_strategy: str, optional
|
||||
:param cycle_momentum: If ``True``, momentum is cycled inversely
|
||||
to learning rate between 'base_momentum' and 'max_momentum', defaults to True
|
||||
:type cycle_momentum: bool, optional
|
||||
:param base_momentum: Lower momentum boundaries in the cycle
|
||||
for each parameter group. Note that momentum is cycled inversely
|
||||
to learning rate; at the peak of a cycle, momentum is
|
||||
'base_momentum' and learning rate is 'max_lr', defaults to 0.85
|
||||
:type base_momentum: float, optional
|
||||
:param max_momentum: Upper momentum boundaries in the cycle
|
||||
for each parameter group. Functionally,
|
||||
it defines the cycle amplitude (max_momentum - base_momentum).
|
||||
Note that momentum is cycled inversely
|
||||
to learning rate; at the start of a cycle, momentum is 'max_momentum'
|
||||
and learning rate is 'base_lr', defaults to 0.95
|
||||
:type max_momentum: float, optional
|
||||
:param div_factor: Determines the initial learning rate via
|
||||
initial_lr = max_lr/div_factor, defaults to 25.0
|
||||
:type div_factor: float, optional
|
||||
:param final_div_factor: Determines the minimum learning rate via
|
||||
min_lr = initial_lr/final_div_factor, defaults to 10000.0
|
||||
:type final_div_factor: float, optional
|
||||
:param last_epoch: The index of the last batch. This parameter is used when
|
||||
resuming a training job. Since `step()` should be invoked after each
|
||||
batch instead of after each epoch, this number represents the total
|
||||
number of *batches* computed, not the total number of epochs computed.
|
||||
When last_epoch=-1, the schedule is started from the beginning, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
pct_start (float, optional):
|
||||
The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3.
|
||||
anneal_strategy (str, optional): {'cos', 'linear'}, Specifies the annealing strategy:
|
||||
"cos" for cosine annealing, "linear" for linear annealing, defaults to 'cos'.
|
||||
cycle_momentum (bool, optional): If ``True``, momentum is cycled inversely
|
||||
to learning rate between 'base_momentum' and 'max_momentum', defaults to True.
|
||||
base_momentum (float, optional): Lower momentum boundaries in the cycle for each parameter group.
|
||||
Note that momentum is cycled inversely to learning rate; at the peak of a cycle, momentum is
|
||||
'base_momentum' and learning rate is 'max_lr', defaults to 0.85.
|
||||
max_momentum (float, optional): Upper momentum boundaries in the cycle for each parameter group.
|
||||
Functionally, it defines the cycle amplitude (max_momentum - base_momentum).
|
||||
Note that momentum is cycled inversely to learning rate; at the start of a cycle, momentum is 'max_momentum'
|
||||
and learning rate is 'base_lr', defaults to 0.95.
|
||||
div_factor (float, optional): Determines the initial learning rate via
|
||||
initial_lr = max_lr/div_factor, defaults to 25.0.
|
||||
final_div_factor (float, optional): Determines the minimum learning rate via
|
||||
min_lr = initial_lr/final_div_factor, defaults to 10000.0.
|
||||
last_epoch (int, optional): The index of the last batch. This parameter is used when resuming a training job.
|
||||
Since `step()` should be invoked after each batch instead of after each epoch, this number represents
|
||||
the total number of *batches* computed, not the total number of epochs computed.
|
||||
When last_epoch=-1, the schedule is started from the beginning, defaults to -1.
|
||||
|
||||
The ``kwargs`` for initializing torch.optim.lr_scheduler.OneCycleLR should include parameters below:
|
||||
::
|
||||
|
||||
epochs (int, optional, default=None)
|
||||
steps_per_epoch (int, optional, default=None)
|
||||
three_phase (bool, optional, default=False)
|
||||
verbose (bool, optional, default=False)
|
||||
|
||||
More details about kwargs could be found in
|
||||
`OneCycleLR <https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.OneCycleLR.html#torch.optim.lr_scheduler.OneCycleLR>`_.
|
||||
|
||||
.. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
|
||||
https://arxiv.org/abs/1708.07120
|
||||
|
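Because the wrapper above forwards its kwargs to ``torch.optim.lr_scheduler.OneCycleLR``, a plain PyTorch example shows the documented parameters in action (the values here are arbitrary):

import torch

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.1,            # peak lr reached at the end of the increasing phase
    total_steps=1000,
    pct_start=0.3,         # 30% of the cycle spent increasing the lr
    anneal_strategy='cos',
    div_factor=25.0,       # initial_lr = max_lr / div_factor
    final_div_factor=1e4,  # min_lr = initial_lr / final_div_factor
)

for _ in range(1000):
    optimizer.step()
    scheduler.step()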
@ -8,16 +8,13 @@ from .delayed import WarmupScheduler
|
||||
class PolynomialLR(_LRScheduler):
|
||||
"""Polynomial learning rate scheduler.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param end_lr: Minimum learning rate, defaults to 0.0001
|
||||
:type end_lr: float, optional
|
||||
:param power: The power of polynomial, defaults to 1.0
|
||||
:type power: float, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
|
||||
power (float, optional): The power of polynomial, defaults to 1.0.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, end_lr: float = 0.0001, power: float = 1.0, last_epoch: int = -1,
|
||||
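A polynomial decay of this kind is usually computed with the formula sketched below; this is the common form, not necessarily the exact expression used in the repo.

def polynomial_lr(base_lr, step, total_steps, end_lr=0.0001, power=1.0):
    """Interpolate from base_lr down to end_lr with a polynomial of the given power."""
    step = min(step, total_steps)
    decay = (1 - step / total_steps) ** power
    return (base_lr - end_lr) * decay + end_lr

# with power=1.0 the decay is simply linear
assert abs(polynomial_lr(0.1, 500, 1000, end_lr=0.0, power=1.0) - 0.05) < 1e-12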
@ -44,18 +41,14 @@ class PolynomialLR(_LRScheduler):
|
||||
class PolynomialWarmupLR(WarmupScheduler):
|
||||
"""Polynomial learning rate scheduler with warmup.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param warmup_steps: Number of warmup steps, defaults to 0
|
||||
:type warmup_steps: int, optional
|
||||
:param end_lr: Minimum learning rate, defaults to 0.0001
|
||||
:type end_lr: float, optional
|
||||
:param power: The power of polynomial, defaults to 1.0
|
||||
:type power: float, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
|
||||
end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
|
||||
power (float, optional): The power of polynomial, defaults to 1.0.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, end_lr: float = 0.0001, power: float = 1.0,
|
||||
|
@ -11,16 +11,13 @@ class LambdaLR(_LambdaLR):
|
||||
"""Sets the learning rate of each parameter group to the initial lr
|
||||
times a given function. When last_epoch=-1, sets initial lr as lr.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param lr_lambda: A function which computes a multiplicative
|
||||
factor given an integer parameter epoch, or a list of such
|
||||
functions, one for each group in optimizer.param_groups, defaults to None
|
||||
:type lr_lambda: function or list, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
|
||||
factor given an integer parameter epoch, or a list of such functions,
|
||||
one for each group in optimizer.param_groups, defaults to None.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
|
||||
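The ``lr_lambda`` argument works the same way as in PyTorch's own ``LambdaLR``: it returns a multiplicative factor applied to the initial lr. A small runnable example:

import torch

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)

for _ in range(3):
    optimizer.step()
    scheduler.step()
    print(scheduler.get_last_lr())  # [0.095], [0.09025], [0.0857375]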
@ -30,18 +27,15 @@ class LambdaLR(_LambdaLR):
|
||||
@LR_SCHEDULERS.register_module
|
||||
class MultiplicativeLR(_MultiplicativeLR):
|
||||
"""Multiply the learning rate of each parameter group by the factor given
|
||||
in the specified function. When last_epoch=-1, sets initial lr as lr
|
||||
in the specified function. When last_epoch=-1, sets initial lr as lr.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param lr_lambda: A function which computes a multiplicative
|
||||
factor given an integer parameter epoch, or a list of such
|
||||
functions, one for each group in optimizer.param_groups, defaults to None
|
||||
:type lr_lambda: function or list, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
|
||||
factor given an integer parameter epoch, or a list of such functions,
|
||||
one for each group in optimizer.param_groups, defaults to None.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
|
||||
@ -53,18 +47,14 @@ class StepLR(_StepLR):
|
||||
"""Decays the learning rate of each parameter group by gamma every
|
||||
step_size epochs. Notice that such decay can happen simultaneously with
|
||||
other changes to the learning rate from outside this scheduler. When
|
||||
last_epoch=-1, sets initial lr as lr
|
||||
last_epoch=-1, sets initial lr as lr.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param step_size: Period of learning rate decay, defaults to 1
|
||||
:type step_size: int, optional
|
||||
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
|
||||
:type gamma: float, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
step_size (int, optional): Period of learning rate decay, defaults to 1.
|
||||
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, last_epoch: int = -1) -> None:
|
||||
@ -77,14 +67,11 @@ class ExponentialLR(_ExponentialLR):
|
||||
"""Decays the learning rate of each parameter group by gamma every epoch.
|
||||
When last_epoch=-1, sets initial lr as lr
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param gamma: Multiplicative factor of learning rate decay, defaults to 1.0
|
||||
:type gamma: float, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 1.0.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps, gamma: float = 1.0,
|
||||
|
@ -14,8 +14,12 @@ class Accuracy2D(nn.Module):
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate the accuracy of predicted labels.
|
||||
|
||||
:param logits: Predicted labels
|
||||
:param targets: True labels from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted labels.
|
||||
targets (:class:`torch.tensor`): True labels from data.
|
||||
|
||||
Returns:
|
||||
float: the accuracy of prediction.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
targets = split_tensor_2d(targets)
|
||||
|
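The accuracy computation behind these ``forward`` methods boils down to comparing argmax predictions with the (possibly partitioned) targets; a minimal single-device sketch that ignores the tensor-parallel splitting:

import torch

def accuracy(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """Fraction of samples whose argmax prediction matches the label."""
    with torch.no_grad():
        preds = logits.argmax(dim=-1)
        return (preds == targets).float().mean()

logits = torch.tensor([[2.0, 0.1], [0.2, 1.5], [3.0, -1.0]])
targets = torch.tensor([0, 1, 1])
print(accuracy(logits, targets))  # tensor(0.6667)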
@ -14,8 +14,12 @@ class Accuracy2p5D(nn.Module):
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate the accuracy of predicted labels.
|
||||
|
||||
:param logits: Predicted labels
|
||||
:param targets: True labels from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted labels.
|
||||
targets (:class:`torch.tensor`): True labels from data.
|
||||
|
||||
Returns:
|
||||
float: the accuracy of prediction.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
targets = split_tensor_2p5d(targets)
|
||||
|
@ -18,8 +18,12 @@ class Accuracy3D(nn.Module):
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate the accuracy of predicted labels.
|
||||
|
||||
:param logits: Predicted labels
|
||||
:param targets: True labels from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted labels.
|
||||
targets (:class:`torch.tensor`): True labels from data.
|
||||
|
||||
Returns:
|
||||
float: the accuracy of prediction.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
|
||||
|
@ -9,11 +9,10 @@ class Registry:
|
||||
"""This is a registry class used to register classes and modules so that a universal
|
||||
object builder can be enabled.
|
||||
|
||||
:param name: The name of the registry
|
||||
:type name: str
|
||||
:param third_party_library: List of third party libraries which are used in the
|
||||
initialization of the register module
|
||||
:type third_party_library: list, optional
|
||||
Args:
|
||||
name (str): The name of the registry.
|
||||
third_party_library (list, optional):
|
||||
List of third party libraries which are used in the initialization of the register module.
|
||||
"""
|
||||
|
||||
def __init__(self, name: str, third_party_library: List[ModuleType] = None):
|
||||
@ -28,12 +27,12 @@ class Registry:
|
||||
def register_module(self, module_class):
|
||||
"""Registers a module represented in `module_class`.
|
||||
|
||||
:param module_class: The module to be registered
|
||||
:type module_class: class
|
||||
:raises AssertionError: Raises an AssertionError if the module has already been
|
||||
registered before
|
||||
:return: The module to be registered, so as to use it normally if via importing
|
||||
:rtype: class
|
||||
Args:
|
||||
module_class (class): The module to be registered.
|
||||
Returns:
|
||||
class: The module to be registered, so as to use it normally if via importing.
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if the module has already been registered before.
|
||||
"""
|
||||
module_name = module_class.__name__
|
||||
assert module_name not in self._registry
|
||||
@ -46,12 +45,13 @@ class Registry:
|
||||
"""Retrieves a module with name `module_name` and returns the module if it has
|
||||
already been registered before.
|
||||
|
||||
:param module_name: The name of the module to be retrieved
|
||||
:type module_name: str
|
||||
:raises NameError: Raises a NameError if the module to be retrieved has neither been
|
||||
registered directly nor as third party modules before
|
||||
:return: The retrieved module or None
|
||||
:rtype: :class:`object`
|
||||
Args:
|
||||
module_name (str): The name of the module to be retrieved.
|
||||
Returns:
|
||||
:class:`object`: The retrieved module or None.
|
||||
Raises:
|
||||
NameError: Raises a NameError if the module to be retrieved has neither been
|
||||
registered directly nor as third party modules before.
|
||||
"""
|
||||
if module_name in self._registry:
|
||||
return self._registry[module_name]
|
||||
@ -65,11 +65,11 @@ class Registry:
|
||||
"""Searches for a module with name `module_name` and returns a boolean value indicating
|
||||
whether the module has been registered directly or as third party modules before.
|
||||
|
||||
:param module_name: The name of the module to be searched for
|
||||
:type module_name: str
|
||||
:return: A boolean value indicating whether the module has been registered directly or
|
||||
as third party modules before
|
||||
:rtype: bool
|
||||
Args:
|
||||
module_name (str): The name of the module to be searched for.
|
||||
Returns:
|
||||
bool: A boolean value indicating whether the module has been registered directly or
|
||||
as third party modules before.
|
||||
"""
|
||||
found_flag = module_name in self._registry
|
||||
|
||||
|
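A typical use of such a registry, based only on the API documented above (``register_module`` as a class decorator and ``get_module`` for lookup); the registry instance name ``LAYERS`` is made up for illustration and the import path is assumed:

from colossalai.registry import Registry  # import path assumed

LAYERS = Registry('layers')  # hypothetical registry instance

@LAYERS.register_module
class MyLayer:
    def __init__(self, dim: int):
        self.dim = dim

# later, a universal builder can look the class up by name
layer_cls = LAYERS.get_module('MyLayer')
layer = layer_cls(dim=128)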
@ -17,18 +17,46 @@ from colossalai.trainer.hooks import BaseHook
|
||||
|
||||
|
||||
class Trainer:
|
||||
"""This a class tending for easy deployments of users' training and evaluation instead of
|
||||
r"""This is a class tending for easy deployments of users' training and evaluation instead of
|
||||
writing their own scripts. It is similar with ``ignite.engine`` and ``keras.engine``, but is
|
||||
called `Trainer`.
|
||||
|
||||
:param engine: Engine responsible for the process function
|
||||
:type engine: :class:`Engine`
|
||||
:param schedule: Schedule responsible for forward and backward steps
|
||||
:type schedule: :class:`BaseSchedule`, optional
|
||||
:param timer: Timer used to monitor the whole training
|
||||
:type timer: :class:`MultiTimer`, optional
|
||||
:param logger: Logger used to record the whole training
|
||||
:type logger: :class:`colossalai.logging.DistributedLogger`, optional
|
||||
Args:
|
||||
engine (:class:`Engine`): Engine responsible for the process function.
|
||||
schedule (:class:`BaseSchedule`, optional): Schedule responsible for forward and backward steps.
|
||||
timer (:class:`MultiTimer`, optional): Timer used to monitor the whole training.
|
||||
logger (:class:`colossalai.logging.DistributedLogger`, optional): Logger used to record the whole training log.
|
||||
|
||||
Note:
|
||||
When `schedule` is None, ``NonPipelineSchedule`` will be used. If you would like to use pipeline,
you should choose ``PipelineSchedule`` or ``InterleavedPipelineSchedule`` for the `schedule`.
|
||||
|
||||
Examples:
|
||||
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
|
||||
>>> model = ...
|
||||
>>> criterion = ...
|
||||
>>> optimizer = ...
|
||||
>>> train_dataloader = ...
|
||||
>>> # Initialize your engine, train_dataloader, test_dataloader, lr_scheduler
|
||||
>>> engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion)
|
||||
>>> # Beginning training progress
|
||||
>>> timer = ...
>>> logger = ...
>>> trainer = Trainer(engine=engine, logger=logger, schedule=schedule, timer=timer)
|
||||
>>> # add hooks you would like to use here.
|
||||
>>> hook_list = []
|
||||
>>> trainer.fit(
|
||||
>>> train_dataloader=train_dataloader,
|
||||
>>> epochs=gpc.config.NUM_EPOCHS,
|
||||
>>> test_interval=1,
|
||||
>>> hooks=hook_list,
|
||||
>>> display_progress=True,
|
||||
>>> return_output_label=False
|
||||
>>> )
|
||||
|
||||
More examples and details could be found in
|
||||
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_
|
||||
and `ColossalAI-Examples <https://github.com/hpcaitech/ColossalAI-Examples/tree/main>`_.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
@ -108,20 +136,19 @@ class Trainer:
|
||||
def _set_current_step(self, epoch: int):
|
||||
"""Sets current step number.
|
||||
|
||||
:param epoch: Step number to be set
|
||||
:type epoch: int
|
||||
Args:
|
||||
epoch (int): Step number to be set.
|
||||
"""
|
||||
self._cur_step = epoch * self._steps_per_epoch
|
||||
|
||||
def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
|
||||
"""Call timer funciton with a given timer name.
|
||||
|
||||
:param action: Function to be called on timer
|
||||
:type action: str
|
||||
:param item: Name of the timer
|
||||
:type item: str
|
||||
:param args: args used for action function
|
||||
:param kwargs: kwargs used for action function
|
||||
Args:
|
||||
action (str): Function to be called on timer.
|
||||
item (str): Name of the timer.
|
||||
args (list): args used for action function.
|
||||
kwargs (dict): kwargs used for action function.
|
||||
"""
|
||||
|
||||
if self._timer is not None:
|
||||
@ -134,10 +161,9 @@ class Trainer:
|
||||
def _call_hooks(self, func, output=None):
|
||||
"""Calls specific hooks in the current time point.
|
||||
|
||||
:param func: A string represents the time point
|
||||
:param output: Output of the model after running a iteration or None in any other time points
|
||||
:type func: str
|
||||
:type output: optional
|
||||
Args:
|
||||
func (str): A string that represents the time point.
|
||||
output (Any, optional): Output of the model after running an iteration or None in any other time points.
|
||||
"""
|
||||
# Only after iter hook will receive output
|
||||
for hook in self.hooks:
|
||||
@ -273,25 +299,17 @@ class Trainer:
|
||||
display_progress: bool = False,
|
||||
return_output_label: bool = True,
|
||||
):
|
||||
"""Trains the model to fit training data.
|
||||
r"""Trains the model to fit training data.
|
||||
|
||||
:param train_dataloader: DataLoader in training
|
||||
:param epochs: Maximum number of epoches
|
||||
:param max_steps: Maximum number of running iterations
|
||||
:param test_dataloader: DataLoader in testing
|
||||
:param test_interval: Interval of testing
|
||||
:param hooks: A list of hooks used in training
|
||||
:param display_progress: If True, the training progress will be printed
|
||||
:param return_output_label: If True, the output of model and the label will be returned
|
||||
|
||||
:type train_dataloader: DataLoader
|
||||
:type epochs: int
|
||||
:type max_steps: int, optional
|
||||
:type test_dataloader: DataLoader, optional
|
||||
:type test_interval: int, optional
|
||||
:type hooks: list, optional
|
||||
:type display_progress: bool, optional
|
||||
:type return_output_label: bool, optional
|
||||
Args:
|
||||
train_dataloader (:class:`torch.utils.data.DataLoader`): DataLoader for training.
|
||||
epochs (int): Maximum number of epochs.
|
||||
max_steps (int, optional): Maximum number of running iterations.
|
||||
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): DataLoader for validation.
|
||||
test_interval (int, optional): Interval of validation.
|
||||
hooks (list[`BaseHook <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/trainer/hooks>`_],
|
||||
optional): A list of hooks used in training.
|
||||
display_progress (bool, optional): If True, a progress bar will be displayed.
return_output_label (bool, optional): If True, the output of model and the label will be returned.
|
||||
"""
|
||||
|
||||
# set epochs and steps, consider gradient accumulation
|
||||
@ -374,15 +392,12 @@ class Trainer:
|
||||
):
|
||||
"""Evaluates the model with testing data.
|
||||
|
||||
:param test_dataloader: DataLoader in testing
|
||||
:param hooks: A list of hooks used in evaluation
|
||||
:param display_progress: If True, the evaluation progress will be printed
|
||||
:param return_output_label: If True, the output of model and the label will be returned
|
||||
|
||||
:type test_dataloader: DataLoader
|
||||
:type hooks: list, optional
|
||||
:type display_progress: bool, optional
|
||||
:type return_output_label: bool
|
||||
Args:
|
||||
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
|
||||
hooks (list, optional): A list of hooks used in evaluation. Defaults to None.
|
||||
display_progress (bool, optional): If True, the evaluation progress will be printed. Defaults to False.
|
||||
return_output_label (bool, optional): If True, the output of model and the label
|
||||
will be returned. Defaults to True.
|
||||
"""
|
||||
# set display
|
||||
display_progress = self._should_display_progress(display_progress)
|
||||
@ -418,10 +433,11 @@ class Trainer:
|
||||
def predict(self, data: Union[Tensor, List[Tensor]]):
|
||||
"""Uses trained model to make a prediction for a tensor or a tensor list.
|
||||
|
||||
:param data: Data as the input
|
||||
:type data: Union[Tensor, List[Tensor]
|
||||
:return: The output of model as the prediction
|
||||
:rtype: Tensor
|
||||
Args:
|
||||
data (Union[:class:`torch.tensor`, List[:class:`torch.tensor`]]): Data as the input.
|
||||
|
||||
Returns:
|
||||
:class:`torch.tensor`: The output of model as the prediction
|
||||
"""
|
||||
# predict without labels
|
||||
if isinstance(data, (list, tuple)):
|
||||
|
@ -40,14 +40,11 @@ class BaseHook(ABC):
|
||||
def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
|
||||
"""Actions after running a training iteration.
|
||||
|
||||
:param trainer: Trainer which is using this hook
|
||||
:type trainer: :class:`Trainer`
|
||||
:param output: Output of the model
|
||||
:type output: torch.Tensor
|
||||
:param label: Labels of the input data
|
||||
:type label: torch.Tensor
|
||||
:param loss: Loss between the output and input data
|
||||
:type loss: torch.Tensor
|
||||
Args:
|
||||
trainer (:class:`Trainer`): Trainer which is using this hook.
|
||||
output (:class:`torch.Tensor`): Output of the model.
|
||||
label (:class:`torch.Tensor`): Labels of the input data.
|
||||
loss (:class:`torch.Tensor`): Loss between the output and input data.
|
||||
"""
|
||||
pass
|
||||
|
||||
@ -89,24 +86,21 @@ class BaseHook(ABC):
|
||||
def after_test_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
|
||||
"""Actions after running a testing iteration.
|
||||
|
||||
:param trainer: Trainer which is using this hook
|
||||
:type trainer: :class:`Trainer`
|
||||
:param output: Output of the model
|
||||
:type output: Tensor
|
||||
:param label: Labels of the input data
|
||||
:type label: Tensor
|
||||
:param loss: Loss between the output and input data
|
||||
:type loss: Tensor
|
||||
Args:
|
||||
trainer (:class:`Trainer`): Trainer which is using this hook.
output (:class:`torch.Tensor`): Output of the model.
label (:class:`torch.Tensor`): Labels of the input data.
loss (:class:`torch.Tensor`): Loss between the output and input data.
|
||||
"""
|
||||
pass
|
||||
|
||||
def init_runner_states(self, trainer, key, val):
|
||||
"""Initializes trainer's state.
|
||||
|
||||
:param trainer: Trainer which is using this hook
|
||||
:type trainer: :class:`Trainer`
|
||||
:param key: Key of reseting state
|
||||
:param val: Value of reseting state
|
||||
Args:
|
||||
trainer (:class:`Trainer`): Trainer which is using this hook.
key: Key of the state to be reset.
val: Value of the state to be reset.
|
||||
"""
|
||||
if key not in trainer.states:
|
||||
trainer.states[key] = val
|
||||
|
@ -16,14 +16,13 @@ from ._lr_scheduler_hook import LRSchedulerHook
|
||||
class SaveCheckpointHook(BaseHook):
|
||||
"""Saves the model by interval in training process.
|
||||
|
||||
:param interval: Saving interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
|
||||
:type checkpoint_dir: str, optional
|
||||
:param suffix: Saving suffix of the file, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
interval (int, optional): Saving interval, defaults to 1.
|
||||
checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
|
||||
suffix (str, optional): Saving suffix of the file, defaults to ''.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -71,18 +70,17 @@ class SaveCheckpointHook(BaseHook):
|
||||
class LoadCheckpointHook(BaseHook):
|
||||
"""Loads the model before training process.
|
||||
|
||||
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
|
||||
:type checkpoint_dir: str, optional
|
||||
:param epoch: Epoch number to be set, defaults to -1
|
||||
:type epoch: str, optional
|
||||
:param finetune: Whether allows to load a part of the model, defaults to False
|
||||
:type finetune: bool, optional
|
||||
:param strict: Whether loads a model that has the same shape of parameters, defaults to False
|
||||
:type strict: bool, optional
|
||||
:param suffix: Suffic, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
|
||||
epoch (str, optional): Loading checkpoint of setting epoch numbers, defaults to -1.
|
||||
Epoch equals to -1 means choosing the latest checkpoint.
|
||||
finetune (bool, optional): Whether allows to load a part of the model, defaults to False.
|
||||
strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint
|
||||
match the names of parameters and buffers in model, defaults to False.
|
||||
suffix (str, optional): Suffix of checkpoint file path, defaults to ''.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 0. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -25,13 +25,14 @@ def _format_number(val, prec=5):
|
||||
|
||||
|
||||
class LogByEpochHook(BaseHook):
|
||||
"""Hook to log by epoch
|
||||
"""Hook to log by epoch.
|
||||
|
||||
:param logger: Logger for the log
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
|
||||
interval (int, optional): Interval of printing log information, defaults to 1.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 1. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -48,10 +49,12 @@ class LogByEpochHook(BaseHook):
|
||||
|
||||
@HOOKS.register_module
|
||||
class LogMetricByStepHook(BaseHook):
|
||||
"""Hook to log metric by step
|
||||
"""Hook to log metric by step.
|
||||
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self, priority: int = 10):
|
||||
@ -74,11 +77,12 @@ class LogMetricByStepHook(BaseHook):
|
||||
class LogMetricByEpochHook(LogByEpochHook):
|
||||
"""Specialized hook to record the metric to log.
|
||||
|
||||
:param logger: Logger for the log
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
|
||||
interval (int, optional): Interval of printing log information, defaults to 1.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -116,14 +120,14 @@ class LogMetricByEpochHook(LogByEpochHook):
|
||||
class TensorboardHook(BaseHook):
|
||||
"""Specialized hook to record the metric to Tensorboard.
|
||||
|
||||
:param log_dir: Directory of log
|
||||
:type log_dir: str
|
||||
:param ranks: Ranks of processors
|
||||
:type ranks: typing.List
|
||||
:param parallel_mode: Parallel mode, defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL
|
||||
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
log_dir (str): Directory of log.
|
||||
ranks (list): Ranks of processors.
|
||||
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer,
|
||||
defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -200,18 +204,15 @@ class TensorboardHook(BaseHook):
|
||||
class LogTimingByEpochHook(LogByEpochHook):
|
||||
"""Specialized hook to write timing record to log.
|
||||
|
||||
:param timer: Timer for the hook
|
||||
:type timer: :class:`colossalai.utils.MultiTimer`
|
||||
:param logger: Logger for the log
|
||||
:type logger: :class:`colossalai.logging.DistributedLogger`
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param log_eval: Whether writes in evaluation, defaults to True
|
||||
:type log_eval: bool, optional
|
||||
:param ignore_num_train_steps: Number of training steps to ignore, defaults to 0
|
||||
:type ignore_num_train_steps: int, optional
|
||||
Args:
|
||||
timer (:class:`colossalai.utils.MultiTimer`): Timer for the hook.
|
||||
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
|
||||
interval (int, optional): Interval of printing log information, defaults to 1.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
log_eval (bool, optional): Whether writes in evaluation, defaults to True.
|
||||
ignore_num_train_steps (int, optional): Number of training steps to ignore, defaults to 0.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -270,14 +271,13 @@ class LogTimingByEpochHook(LogByEpochHook):
|
||||
class LogMemoryByEpochHook(LogByEpochHook):
|
||||
"""Specialized Hook to write memory usage record to log.
|
||||
|
||||
:param logger: Logger for the log
|
||||
:type logger: colossalai.logging.DistributedLogger
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param log_eval: Whether writes in evaluation, defaults to True
|
||||
:type log_eval: bool, optional
|
||||
Args:
|
||||
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
|
||||
interval (int, optional): Interval of printing log information, defaults to 1.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
log_eval (bool, optional): Whether writes in evaluation, defaults to True.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -6,15 +6,17 @@ from ._metric_hook import LearningRateMetric, MetricHook
|
||||
|
||||
@HOOKS.register_module
|
||||
class LRSchedulerHook(MetricHook):
|
||||
"""Build LR scheduler
|
||||
r"""Build LR scheduler for trainer.
|
||||
|
||||
:param lr_scheduler: LR scheduler
|
||||
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch
|
||||
:type by_epoch: bool
|
||||
:param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True`
|
||||
:type store_lr_in_state: bool, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
lr_scheduler (:class:`colossalai.nn.lr_scheduler`): The specific LR scheduler
|
||||
in range of ``colossalai.nn.lr_scheduler``, more details about ``lr_scheduler`` could be found in
|
||||
`lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_.
|
||||
by_epoch (bool): If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch.
|
||||
store_lr_in_state (bool, optional): If `True`, store the learning rate in each state, defaults to `True`.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front
|
||||
defaults to 1. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -17,13 +17,13 @@ from ._base_hook import BaseHook
|
||||
|
||||
class Metric(ABC):
|
||||
"""A basic class of metric collectors. It collects a specific
|
||||
metric during training or evaluation and it's always used with
|
||||
metric during training or evaluation and would always be used with
|
||||
:class:`MetricHook` to help it update its states and show the
|
||||
metric. So please use the corresponding hook class to make the metric
collector work.
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
Args:
|
||||
epoch_only (bool): Whether the metric only read for the full epoch.
|
||||
"""
|
||||
|
||||
def __init__(self, epoch_only: bool):
|
||||
@ -80,8 +80,8 @@ class Metric(ABC):
|
||||
class LossMetric(Metric):
|
||||
"""A metric collector for loss.
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
Args:
|
||||
epoch_only (bool): Whether the metric only read for the full epoch.
|
||||
"""
|
||||
|
||||
def __init__(self, epoch_only):
|
||||
@ -101,7 +101,8 @@ class LossMetric(Metric):
|
||||
"""Updates :attr:`last_step_loss` and :attr:`accum_loss` with current loss.
|
||||
It expects the output has loss.
|
||||
|
||||
:param loss: Current loss of the output
|
||||
Args:
|
||||
loss (:class:`torch.tensor`): Current loss of the output.
|
||||
"""
|
||||
# expect output to be logits, label and loss
|
||||
loss_ = loss.detach()
|
||||
@ -132,10 +133,9 @@ class LossMetric(Metric):
|
||||
class LearningRateMetric(Metric):
|
||||
"""A metric collector for learning rate.
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
:param initial_lr: Initial learning rate, defaults to 0.0
|
||||
:type initial_lr: float, optional
|
||||
Args:
|
||||
epoch_only (bool): Whether the metric only read for the full epoch.
|
||||
initial_lr (float, optional): Initial learning rate, defaults to 0.0.
|
||||
"""
|
||||
|
||||
def __init__(self, epoch_only: bool, initial_lr: float = 0.):
|
||||
@ -163,10 +163,9 @@ class AccuracyMetric(Metric):
|
||||
"""A metric collector for accuracy. It only works for classification
|
||||
tasks.
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
:param accuracy_func: Accuracy function for the classification task
|
||||
:type accuracy_func: :class:`typing.Callable`
|
||||
Args:
|
||||
epoch_only (bool): Whether the metric only read for the full epoch.
|
||||
accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
|
||||
"""
|
||||
|
||||
def __init__(self, epoch_only: bool, accuracy_func: Callable):
|
||||
@ -187,9 +186,10 @@ class AccuracyMetric(Metric):
|
||||
"""Updates last step accuracy and accumulated accuracy with current logits
|
||||
and labels. It expects the output has logits and labels.
|
||||
|
||||
:param logits: The logits output of the model
|
||||
:param targets: Real labels of the dataset
|
||||
:param batch_size: Batch size of the task
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): The logits output of the model.
|
||||
targets (:class:`torch.tensor`): Real labels of the dataset.
|
||||
batch_size (int): Batch size of the task.
|
||||
"""
|
||||
if isinstance(logits, (list, tuple)):
|
||||
logits = logits[0]
|
||||
@ -224,8 +224,10 @@ class MetricHook(BaseHook):
|
||||
update their states. Others are used to display and
|
||||
record the metric.
|
||||
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:type priority: int
|
||||
Args:
|
||||
priority (int): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@ -244,8 +246,10 @@ class MetricHook(BaseHook):
|
||||
class LossHook(MetricHook):
|
||||
"""Specialized hook class for :class:`Loss`.
|
||||
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 0. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self, priority: int = 0):
|
||||
@ -283,10 +287,11 @@ class LossHook(MetricHook):
|
||||
class AccuracyHook(MetricHook):
|
||||
"""Specialized hook class for :class:`Accuracy`.
|
||||
|
||||
:param accuracy_func: Priority in the printing, hooks with small priority will be printed in front
|
||||
:type accuracy_func: typing.Callable
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 0. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self, accuracy_func: Callable, priority: int = 0):
|
||||
@ -314,8 +319,8 @@ class AccuracyHook(MetricHook):
|
||||
class ThroughputMetric(Metric):
|
||||
"""Metric for :class:`Throughput`.
|
||||
|
||||
:param epoch_only: epoch only
|
||||
:type epoch_only: bool
|
||||
Args:
|
||||
epoch_only (bool): Whether the metric only read for the full epoch.
|
||||
"""
|
||||
def __init__(self, epoch_only: bool, ignored_steps: int = 0):
|
||||
super().__init__(epoch_only=epoch_only)
|
||||
@ -360,10 +365,13 @@ class ThroughputMetric(Metric):
|
||||
|
||||
@HOOKS.register_module
|
||||
class ThroughputHook(MetricHook):
|
||||
"""Specialized hook class for :class:`Throughput`.
|
||||
"""Specialized hook class for :class:`Throughput`. Hook to measure execution throughput (samples/sec).
|
||||
|
||||
:param priority: priority of throughput hook, defaults to 10
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
ignored_steps (int, optional): The number of initial training steps to ignore.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
def __init__(self, ignored_steps: int = 0, priority: int = 10):
|
||||
super().__init__(priority)
|
||||
|
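A throughput metric of this kind typically divides accumulated sample counts by accumulated time, skipping the first few warm-up steps; a rough sketch, not the library's implementation:

class ThroughputSketch:
    """Accumulate (samples, seconds) pairs and report samples/sec, ignoring warm-up steps."""

    def __init__(self, ignored_steps: int = 0):
        self.ignored_steps = ignored_steps
        self.step = 0
        self.samples = 0
        self.seconds = 0.0

    def update(self, batch_size: int, step_time: float):
        self.step += 1
        if self.step > self.ignored_steps:
            self.samples += batch_size
            self.seconds += step_time

    def value(self) -> float:
        return self.samples / self.seconds if self.seconds > 0 else 0.0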
@ -114,10 +114,13 @@ class CheckpointFunction(torch.autograd.Function):
|
||||
|
||||
|
||||
def checkpoint(function, activation_offload ,*args):
|
||||
"""Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint
|
||||
"""Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint.
|
||||
|
||||
:param function: Describe the forward pass function. It should know how to handle the input tuples.
|
||||
:param args: Tuple containing the parameters of the function
|
||||
:return: Output of running function with provided args
|
||||
Args:
|
||||
function: Describe the forward pass function. It should know how to handle the input tuples.
|
||||
args (tuple): Tuple containing the parameters of the function.
|
||||
|
||||
Returns:
|
||||
Output of running function with provided args.
|
||||
"""
|
||||
return CheckpointFunction.apply(function, activation_offload, *args)
|
||||
|
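For comparison, the same pattern with PyTorch's built-in ``torch.utils.checkpoint`` (which the function above is adapted from); the extra ``activation_offload`` flag is specific to the Colossal-AI version and is not shown here:

import torch
from torch.utils.checkpoint import checkpoint

layer1 = torch.nn.Linear(128, 128)
layer2 = torch.nn.Linear(128, 10)

def block(x):
    # forward pass whose intermediate activations are recomputed in backward
    return layer2(torch.relu(layer1(x)))

x = torch.randn(4, 128, requires_grad=True)
out = checkpoint(block, x)   # activations inside `block` are not stored
out.sum().backward()         # they are recomputed during the backward pass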
@ -50,17 +50,17 @@ def _get_standard_checkpoint_filename(epoch: int, suffix: str = ''):
|
||||
|
||||
|
||||
def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''):
|
||||
"""This is a function to generate the checkpoint path from the (checkpoint_dir, epoch, suffix, gpu_parallel_rank) tuple.
|
||||
"""This is a function to generate the checkpoint path from the tuple
|
||||
(checkpoint_dir, epoch, suffix, gpu_parallel_rank).
|
||||
This is useful during generation and recuperation of the checkpoint.
|
||||
|
||||
:param checkpoint_dir: Set up a directory for saving checkpoints
|
||||
:type checkpoint_dir: str
|
||||
:param epoch: Epoch number (indicate how many epochs have you trained this model)
|
||||
:type epoch: int
|
||||
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:return: Checkpoint path to be generated
|
||||
:rtype: path
|
||||
Args:
|
||||
checkpoint_dir (str): Set up a directory for saving checkpoints.
|
||||
epoch (int): Epoch number (indicate how many epochs have you trained this model).
|
||||
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
|
||||
|
||||
Returns:
|
||||
str: The checkpoint path to be generated.
|
||||
"""
|
||||
ckpt_filename = _get_standard_checkpoint_filename(epoch, suffix)
|
||||
return os.path.join(checkpoint_dir, ckpt_filename)
|
||||
@ -74,12 +74,13 @@ def _ensure_directory_exists(filename: str):
|
||||
|
||||
|
||||
def get_latest_checkpoint_pattern(suffix: str = ''):
|
||||
"""Generate Regular expression of latest checkpoint's pattern
|
||||
"""Generate Regular expression of the latest checkpoint's pattern.
|
||||
|
||||
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:return: Checkpoint pattern
|
||||
:rtype: regular expression
|
||||
Args:
|
||||
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
|
||||
|
||||
Returns:
|
||||
str: The regular expression of checkpoint pattern.
|
||||
"""
|
||||
ranks_name = _get_ranks_name()
|
||||
pattern = r'epoch(\d+)-{}{}\.pt'.format(ranks_name, suffix)
|
||||
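To make the pattern concrete, a quick sketch of how such a regex picks the latest checkpoint out of a directory listing (the file names and ranks name are invented for illustration):

import re

# shape of the pattern built above, with a made-up ranks name
pattern = re.compile(r'epoch(\d+)-rank0\.pt')

files = ['epoch3-rank0.pt', 'epoch12-rank0.pt', 'epoch7-rank0.pt', 'notes.txt']
matches = [(int(m.group(1)), f) for f in files if (m := pattern.fullmatch(f))]
latest_epoch, latest_file = max(matches)
print(latest_file)  # epoch12-rank0.pt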
@ -88,16 +89,19 @@ def get_latest_checkpoint_pattern(suffix: str = ''):
|
||||
|
||||
|
||||
def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''):
|
||||
"""This is a function to retrieve the latest checkpoint path from the (checkpoint_dir, suffix, gpu_parallel_rank) tuple.
|
||||
"""This is a function to retrieve the latest checkpoint path from the tuple
|
||||
(checkpoint_dir, suffix, gpu_parallel_rank).
|
||||
This is useful during recuperation of the checkpoint, especially when you do not know the epoch number.
|
||||
|
||||
:param checkpoint_dir: Directory for saving checkpoints
|
||||
:type checkpoint_dir: str
|
||||
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:raises FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given
|
||||
:return: The latest checkpoint path to be retrieved
|
||||
:rtype: path
|
||||
Args:
|
||||
checkpoint_dir (str): Directory for saving checkpoints.
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
|
||||
|
||||
Returns:
|
||||
str: The latest retrieved checkpoint path.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given.
|
||||
"""
|
||||
CKPT_NAME_PAT = get_latest_checkpoint_pattern(suffix=suffix)
|
||||
|
||||
@ -126,22 +130,19 @@ def save_checkpoint(checkpoint_path: str,
|
||||
optimizer: torch.optim.Optimizer,
|
||||
lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
|
||||
**kwargs):
|
||||
"""Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as model,
|
||||
optimizer, lr_scheduler and etc. into a checkpoint dictionary.
|
||||
"""Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as
|
||||
model, optimizer, lr_scheduler etc. into a checkpoint dictionary.
|
||||
|
||||
This method can be used for both colosalai nn.BaseModel and normal pytorch nn.Module.
|
||||
This method can be used for both :class:`colossalai.nn.BaseModel` and normal :class:`torch.nn.Module`.
|
||||
|
||||
|
||||
:param checkpoint_path: Set up a directory for saving checkpoints
|
||||
:type checkpoint_path: str
|
||||
:param epoch: Epoch number (indicate how many epochs have you trained this model)
|
||||
:type epoch: int
|
||||
:param model: Model to be registered
|
||||
:type model: torch.nn.Module
|
||||
:param optimizer: Optimizer to be registered
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param lr_scheduler: lr_scheduler to be registered, defaults to None
|
||||
:type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
|
||||
Args:
|
||||
checkpoint_path (str): Set up a directory for saving checkpoints.
|
||||
epoch (int): Epoch number (indicate how many epochs have you trained this model).
|
||||
model (:class:`torch.nn.Module`): Model to be registered.
|
||||
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to be registered.
|
||||
lr_scheduler (Union[:class:`torch.optim.lr_scheduler`,
|
||||
:class:`colossalai.nn.lr_scheduler`], optional): lr_scheduler to be registered, defaults to None.
|
||||
kwargs (dict): additional parameters to be saved.
|
||||
"""
|
||||
# for compatibility with normal pytorch nn.Module
|
||||
if hasattr(model, 'state_dict_for_save_checkpoint'):
|
||||
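A minimal sketch of what such a save routine assembles; the dictionary key names are assumptions for illustration, not necessarily the exact keys the library writes:

import torch

def save_checkpoint_sketch(path, epoch, model, optimizer, lr_scheduler=None, **extra):
    checkpoint = {
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        **extra,                      # any additional parameters passed as kwargs
    }
    if lr_scheduler is not None:
        checkpoint['lr_scheduler'] = lr_scheduler.state_dict()
    torch.save(checkpoint, path)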
@ -165,31 +166,31 @@ def load_checkpoint(checkpoint_path: str,
                    finetune: bool = False,
                    strict: bool = True) -> Tuple:
    """Loads the checkpoint file.

    If finetune is False, then we intend to continue/resume the training process from the checkpoint given.
    So we copy parameters and buffers from state_dict into these modules(model, optimizer,lr_scheduler)
    and its descendants.
    If finetune is True, then only the weights and buffers of model should be reload.
    If strict is True, then the keys of state_dict must exactly match the keys returned by this module's
    state_dict() function.
    and its descendants.

    :param checkpoint_path: The exact and matched checkpoint_path directory to retrieve appropriate state_dict
    :type checkpoint_path: str
    :param model: Model to reload parameters and buffers
    :type model: torch.nn.Module
    :param optimizer: Optimizer to recuperate
    :type optimizer: torch.optim.Optimizer
    :param lr_scheduler: lr_scheduler to recuperate, defaults to None
    :type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
    :param finetune: Whether to finetune the model with new dataset or continue the pre-training, defaults to False
    :type finetune: bool, optional
    :param strict: Whether to strictly enforce that the keys in
        :attr:`state_dict` of the checkpoint match the names of
        parameters and buffers in model., defaults to True
    :type strict: bool, optional
    :raises ValueError: Raise error if the model/optimizer cannot successfully be recuperated
    :return: (the epoch number of the checkpoint retrieved, the checkpoint retrieved)
    :rtype: Tuple
    If finetune is True, then only the weights and buffers of model should be reloaded.
    If strict is True, then the keys of state_dict must exactly match the keys returned
    by this module's state_dict() function.

    Args:
        checkpoint_path (str): The exact and matched checkpoint_path directory to retrieve appropriate state_dict.
        model (:class:`torch.nn.Module`): Model to reload parameters and buffers.
        optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional):
            lr_scheduler to recuperate, defaults to None.
        finetune (bool, optional): Whether to finetune the model with new dataset or
            continue the pre-training, defaults to False.
        strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict`
            of the checkpoint match the names of parameters and buffers in model, defaults to True.

    Returns:
        Tuple(int, ``checkpoint``): The tuple (the epoch number of the checkpoint retrieved, the checkpoint retrieved).

    Raises:
        ValueError: Raise error if the model/optimizer cannot successfully be recuperated
    """
    # Load the checkpoint.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
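
A minimal usage sketch for the two checkpoint helpers above; the import path `colossalai.utils` is an assumption, and the model/optimizer names are placeholders:

import torch
from colossalai.utils import save_checkpoint, load_checkpoint   # assumed export location

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Save after epoch 10; extra keyword arguments are stored alongside the state dicts.
save_checkpoint('ckpt_epoch_10.pt', 10, model, optimizer)

# Resume later: returns (epoch number of the checkpoint, the raw checkpoint dict).
start_epoch, checkpoint = load_checkpoint('ckpt_epoch_10.pt', model, optimizer, finetune=False, strict=True)
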
@ -27,10 +27,10 @@ from .multi_tensor_apply import multi_tensor_applier
def print_rank_0(msg: str, logger=None):
    """Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.

    :param msg: A string message to output
    :type msg: str
    :param logger: Python logger object, defaults to None
    :type logger: optional
    Args:
        msg (str): A string message to output.
        logger (:class:`colossalai.logging.DistributedLogger`, optional):
            The logger to record the message, defaults to None.
    """
    if gpc.get_global_rank() == 0:
        if logger is None:
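
A one-line usage sketch, assuming `print_rank_0` is re-exported from `colossalai.utils`:

from colossalai.utils import print_rank_0   # assumed export location

print_rank_0('validation finished')   # printed once, by the global rank-0 process only
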
@ -53,12 +53,15 @@ def free_port():


def sync_model_param(model, parallel_mode):
    """Make sure data parameters are consistent during Data Parallel Mode
    r"""Make sure data parameters are consistent during Data Parallel Mode.

    :param model: A pyTorch nn.model on whose parameters you check the consistency
    :param parallel_mode: Parallel mode to be checked
    :type model: torch.nn.Module
    :type parallel_mode: colossalai.context.ParallelMode
    Args:
        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked.

    Note:
        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
    """
    if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
        for param in model.parameters():
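
A short sketch of the call documented above; it assumes distributed training has already been initialized (e.g. via `colossalai.launch`) and that the function is importable from `colossalai.utils`:

from colossalai.context import ParallelMode
from colossalai.utils import sync_model_param   # assumed export location

# Broadcast parameters within the data-parallel group so every replica starts from identical weights.
sync_model_param(model, ParallelMode.DATA)
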
@ -146,18 +149,19 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    """Clips gradient norm of an iterable of parameters whose gradients are in fp32.

    This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.
    added functionality to handle model parallel parameters.

    :param parameters: An iterable of Tensors or a single Tensor that will have gradients normalized
    :type parameters: (Iterable[Tensor] or Tensor)
    :param max_norm: Max norm of the gradients
    :type max_norm: float or int
    :param norm_type: Type of the used p-norm. Can be ``'inf'`` for infinity norm.
    :type norm_type: float or int
    Note:
        the gradients are modified in place.

    :return: Total norm of the parameters (viewed as a single vector).
    :rtype: float
    Args:
        parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`):
            An iterable of Tensors or a single Tensor that will have gradients normalized.
        max_norm (Union[float, int]): Max norm of the gradients.
        norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm.

    Returns:
        float: Total norm of the parameters.
    """

    if isinstance(parameters, torch.Tensor):
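
A minimal training-step sketch using the helper above (`model`, `criterion`, `data` and `target` are placeholders; the import path is assumed):

from colossalai.utils import clip_grad_norm_fp32   # assumed export location

output = model(data)
loss = criterion(output, target)
loss.backward()
# Clip before stepping; gradients are modified in place and the model-parallel-aware total norm is returned.
total_norm = clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=2)
optimizer.step()
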
@ -19,18 +19,15 @@ T_co = TypeVar('T_co', covariant=True)

@DATA_SAMPLERS.register_module
class DataParallelSampler(Sampler):
    """A data sampler for distributed data parallelism
    """A data sampler for distributed data parallelism.

    :param dataset: A Dataset instance
    :type dataset: torch.utils.data.Dataset
    :param shuffle: Whether to shuffle data, defaults to False
    :type shuffle: bool, optional
    :param seed: The random seed, defaults to 0
    :type seed: int, optional
    :param drop_last: Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch
        size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller,
        defaults to False
    :type drop_last: bool, optional
    Args:
        dataset (:class:`torch.utils.data.Dataset`): The Dataset for sampling.
        shuffle (bool, optional): Whether to shuffle data, defaults to False.
        seed (int, optional): The random seed used for sampling, defaults to 0.
        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
            is not divisible by the batch size. If False and the size of dataset is not divisible by
            the batch size, then the last batch will be smaller, defaults to False.
    """

    def __init__(self,
@ -104,8 +101,8 @@ class DataParallelSampler(Sampler):
        use a different random ordering for each epoch. Otherwise, the next iteration of this
        sampler will yield the same ordering.

        :param epoch: Epoch number.
        :type epoch: int
        Args:
            epoch (int): Epoch number.
        """
        self.epoch = epoch
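
A sketch of how the sampler is typically driven; the keyword names follow the docstring above, while `train_set` and `NUM_EPOCHS` are placeholders:

from torch.utils.data import DataLoader

sampler = DataParallelSampler(train_set, shuffle=True, seed=0, drop_last=True)
loader = DataLoader(train_set, batch_size=32, sampler=sampler)

for epoch in range(NUM_EPOCHS):
    sampler.set_epoch(epoch)    # re-seed so every epoch uses a different shuffle order
    for batch in loader:
        ...
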
@ -118,29 +115,27 @@ def get_dataloader(dataset,
                   pin_memory=False,
                   num_workers=0,
                   **kwargs):
    """Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
    r"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)

    .. note:: When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
        on the 1st stage and label on the last stage
    Note:
        When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
        on the 1st stage and label on the last stage.

    :param dataset: A :class:`torch.utils.data.Dataset` object
    :param shuffle: Whether to shuffle the dataset
    :param seed: Random worker seed, defaults to 1024
    :param add_sampler: Add DistributedDataParallelSampelr to the dataset
    :param drop_last: Drop the last incomplete batch of data
    :param pin_memory: Whether to pin memory address in CPU memory
    :param num_workers: Number of worker threads for this dataloader
    Args:
        dataset (:class:`torch.utils.data.Dataset`): The dataset to be loaded.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
        seed (int, optional): Random worker seed for sampling, defaults to 1024.
        add_sampler (bool, optional): Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
            is not divisible by the batch size. If False and the size of dataset is not divisible by
            the batch size, then the last batch will be smaller, defaults to False.
        pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
        num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
        kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
            `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.

    :type dataset: :class:`torch.utils.data.Dataset`
    :type shuffle: bool, optional. Default is False
    :type seed: int, optional. Default is 1024
    :type add_sampler: bool, optional. Default is True
    :type drop_last: bool, optional. Default is False
    :type pin_memory: bool, optional. Default is False
    :type num_workers: int, optional. Default is 0

    :return: A object of :class:`torch.utils.data.DataLoader`
    :rtype: :class:`torch.utils.data.DataLoader`
    Returns:
        :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
    """
    _kwargs = kwargs.copy()
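
A usage sketch built from the parameters documented above; `train_set` is a placeholder and the import path is assumed:

from colossalai.utils import get_dataloader   # assumed export location

train_loader = get_dataloader(train_set,
                              shuffle=True,       # keep False when pipeline parallelism is enabled
                              seed=1024,
                              add_sampler=True,   # attaches the distributed sampler shown earlier
                              drop_last=True,
                              pin_memory=True,
                              num_workers=4,
                              batch_size=32)      # forwarded to torch.utils.data.DataLoader via **kwargs
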
@ -13,19 +13,25 @@ def accumulate_gradient(model: nn.Module,
                        accumulate_size: int,
                        gradient_handlers: List[BaseGradientHandler] = None,
                        lr_scheduler: _LRScheduler = None):
    """
    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param dataloader: your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    :param gradient_handlers: list of gradient handler objects. Default is None
    :type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
    :param lr_scheduler: your lr scheduler object. Default is None
    :type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
    r"""Turn the model, optimizer and dataloader into the corresponding objects for gradient accumulation.

    Args:
        model (:class:`torch.nn.Module`): your model object for gradient accumulation.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object for gradient accumulation.
        dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
            your dataloader object, would be called like iter(dataloader)
        accumulate_size (int): the number of steps to accumulate gradients
        gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
            list of gradient handler objects. Default is None.
        lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
            your ``lr_scheduler`` object for gradient accumulation. Defaults to None.

    More details about `gradient_handlers` could be found in
    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.

    More details about `lr_scheduler` could be found in
    `lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_ and
    `how to adjust learning rate <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
    """
    optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
    dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
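
A hedged sketch of calling the helper above; the hunk shows the optimizer and dataloader being wrapped, and the return order assumed here (optimizer, dataloader, gradient handlers, lr_scheduler) is not visible in this diff:

# 'train_loader' and 'scheduler' are placeholders.
optimizer, train_loader, gradient_handlers, scheduler = accumulate_gradient(
    model=model,
    optimizer=optimizer,
    dataloader=train_loader,
    accumulate_size=4,          # parameters are updated once every 4 steps
    lr_scheduler=scheduler)
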
@ -15,15 +15,13 @@ from colossalai.engine import BaseGradientHandler

class GradAccumOptimizer(ColossalaiOptimizer):
    """A wrapper for the optimizer to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param optim: Your optimizer object
    :type optim: :class:`torch.optim.Optimizer`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    :param model: Your model object to check if it is DDP for special handling of no_sync() context
    :type model: :class:`torch.nn.Module`
    before accumulation size is reached.

    Args:
        optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
        model (:class:`torch.nn.Module`):
            Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
    """

    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
@ -76,18 +74,18 @@ class GradAccumOptimizer(ColossalaiOptimizer):


class GradAccumDataloader:
    """A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps.
    """A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.

    For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will
    be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
    Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
    (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.

    :param dataloader: Your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    Note:
        The dataloader would drop the last incomplete steps for gradient accumulation.
        For example, if a dataloader has 10 batches of data and accumulate size is 4. The model parameters will
        be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
        Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
        (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.

    Args:
        dataloader (``Iterable``): Your dataloader object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
    """

    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
@ -125,13 +123,12 @@ class GradAccumDataloader:

class GradAccumLrSchedulerByStep(_LRScheduler):
    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param lr_scheduler: Your lr scheduler object
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    before accumulation size is reached.

    Args:
        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
            Your ``lr_scheduler`` object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
    """

    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
@ -171,13 +168,16 @@ class GradAccumLrSchedulerByStep(_LRScheduler):


class GradAccumGradientHandler:
    """A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached
    r"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached.

    :param grad_handler: Your gradient handler object
    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    Args:
        grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
            Your ``gradient_handler`` object for gradient accumulation, would be called when reaching `accumulate_size`.
        accumulate_size (int): The number of steps to accumulate gradients.

    More details about ``gradient_handlers`` could be found in
    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.

    """

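
The wrappers can also be sketched directly, using the constructor signatures and docstrings visible in this hunk (`train_loader`, `scheduler` and `grad_handler` are placeholders):

opt = GradAccumOptimizer(optimizer, accumulate_size=4, model=model)
loader = GradAccumDataloader(train_loader, accumulate_size=4)
sched = GradAccumLrSchedulerByStep(scheduler, accumulate_size=4)
handler = GradAccumGradientHandler(grad_handler, accumulate_size=4)   # argument names follow the docstring above
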
@ -14,12 +14,13 @@ from typing import Optional


def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
    """
    Get the free memory info of device.
    :param device: a torch device instance or None
    :type device: Optional[torch.device]
    :return: current memory usage, sized by Byte
    :rtype: int
    """Get the free memory info of device.

    Args:
        device (Optional[``torch.device``]): a torch device instance or None. Defaults None.

    Returns:
        int: current memory usage, sized by Byte.
    """
    if device:
        assert device.type == 'cuda'
@ -34,7 +35,7 @@ def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:


def bytes_to_GB(val, decimal=2):
    """A byte-to-Gigabyte converter, defaultly using binary notation.
    """A byte-to-Gigabyte converter, using binary notation by default.

    :param val: X bytes to convert
    :return: X' GB
@ -43,7 +44,7 @@ def bytes_to_GB(val, decimal=2):


def bytes_to_MB(val, decimal=2):
    """A byte-to-Megabyte converter, defaultly using binary notation.
    """A byte-to-Megabyte converter, using binary notation by default.

    :param val: X bytes to convert
    :return: X' MB
@ -54,13 +55,13 @@ def bytes_to_MB(val, decimal=2):
def report_memory_usage(message, logger=None, report_cpu=False):
    """Calculate and print RAM usage (in GB)

    :param message: A prefix message to add in the log
    :type message: str
    :param logger: An instance of :class:`colossalai.logging.DistributedLogger`
    :type logger: :class:`colossalai.logging.DistributedLogger`, optional
    :param report_cpu: Whether to report CPU memory
    :type report_cpu: bool, optional
    :raises EnvironmentError: Raise error if no distributed environment has been initialized
    Args:
        message (str): A prefix message to add in the log.
        logger (:class:`colossalai.logging.DistributedLogger`): The logger used to record memory information.
        report_cpu (bool, optional): Whether to report CPU memory.

    Raises:
        EnvironmentError: Raise error if no distributed environment has been initialized.
    """
    if not gpc.is_initialized(ParallelMode.GLOBAL):
        raise EnvironmentError("No distributed environment is initialized")
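
A short sketch of the memory helpers above; the import path is assumed:

from colossalai.utils import colo_cuda_memory_used, report_memory_usage   # assumed export location

used_bytes = colo_cuda_memory_used()                      # CUDA memory currently used on the default device, in bytes
report_memory_usage('after forward', report_cpu=False)    # logs GPU usage with this message as the prefix
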
@ -12,8 +12,8 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
    size of every parameter. Since the parameters in data parallelism is replicated
    in each GPU, we set their ep_size to 1.

    :param model: A pyTorch nn.model from which we get dict
    :type model: torch.nn.Module
    Args:
        model (:class:`torch.nn.Module`): A pyTorch `nn.Module` from which we get dict.
    """
    epsize_param_dict = dict()
    for param in model.parameters():
@ -29,10 +29,10 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]


def sync_moe_model_param(model: nn.Module):
    """Make sure model parameters are consistent in MoE parallel context
    """Make sure model parameters are consistent in MoE parallel context.

    :param model: A pyTorch nn.model on whose parameters you check the consistency
    :type model: torch.nn.Module
    Args:
        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
    """
    if is_using_ddp():


@ -3,10 +3,10 @@


class MultiTensorApply(object):
    """
    Apply an operation to a list of tensors efficiently
    Apply an operation to a list of tensors efficiently.

    :param chunk_size: Size of a chunk
    :type chunk_size: int
    Args:
        chunk_size (int): Size of a chunk.
    """

    available = False
@ -9,6 +9,7 @@ from collections import defaultdict
LINE_WIDTH = 108
LINE = '-' * LINE_WIDTH + '\n'


class TensorDetector():
    def __init__(self,
                 show_info: bool = True,
@ -16,17 +17,14 @@ class TensorDetector():
                 include_cpu: bool = False,
                 module: Optional[nn.Module] = None
                 ):
        """This class is an detector to detect tensor on different devices.

        :param show_info: whether to print the info on screen, default True
        :type show_info: bool
        :param log: the file name to save the log
        :type log: str
        :param include_cpu: whether to detect tensor on cpu, default False
        :type include_cpu: bool
        :param module: when sending an `nn.Module` it, the detector can name the tensors detected better
        :type module: Optional[nn.Module]
        """This class is a detector to detect tensor on different devices.

        Args:
            show_info (bool, optional): whether to print the info on screen, default True.
            log (str, optional): the file name to save the log. Defaults to None.
            include_cpu (bool, optional): whether to detect tensor on cpu, default False.
            module (Optional[:class:`nn.Module`]): when sending an ``nn.Module`` object,
                the detector can name the tensors detected better.
        """
        self.show_info = show_info
        self.log = log
@ -48,7 +46,6 @@ class TensorDetector():
            self.tensor_info[id(param)].append(param.requires_grad)
            self.tensor_info[id(param)].append(param.dtype)
            self.tensor_info[id(param)].append(self.get_tensor_mem(param))

    def get_tensor_mem(self, tensor):
        # calculate the memory occupied by a tensor
@ -58,7 +55,6 @@ class TensorDetector():
            memory_size += grad_memory_size
        return self.mem_format(memory_size)

    def mem_format(self, real_memory_size):
        # format the tensor memory into a reasonable magnitude
        if real_memory_size >= 2 ** 30:
@ -68,7 +64,6 @@ class TensorDetector():
        if real_memory_size >= 2 ** 10:
            return str(real_memory_size / (2 ** 10)) + ' KB'
        return str(real_memory_size) + ' B'

    def collect_tensors_state(self):
        for obj in gc.get_objects():
@ -116,7 +111,6 @@ class TensorDetector():
            if obj.device not in self.devices:
                self.devices.append(obj.device)

    def print_tensors_state(self):
        template_format = '{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}'
        self.info += LINE
@ -173,7 +167,6 @@ class TensorDetector():
        if self.log is not None:
            with open(self.log + '.log', 'a') as f:
                f.write(self.info)

    def detect(self, include_cpu = False):
        self.include_cpu = include_cpu
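
A sketch of the detector's intended use, based on the constructor arguments and the `detect()` method shown in this hunk (`model` and `inputs` are placeholders):

detector = TensorDetector(show_info=True, log='tensor_report', include_cpu=False, module=model)
detector.detect()                 # snapshot live tensors and print/log the per-device table
loss = model(inputs).sum()        # run some work
loss.backward()
detector.detect()                 # call again to see which tensors appeared or changed
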
@ -25,7 +25,7 @@ class Timer:
        return time.time()

    def start(self):
        """Fisrtly synchronize cuda, reset the clock and then start the timer.
        """Firstly synchronize cuda, reset the clock and then start the timer.
        """
        self._elapsed = 0
        synchronize()
@ -40,10 +40,11 @@ class Timer:
    def stop(self, keep_in_history: bool = False):
        """Stop the timer and record the start-stop time interval.

        :param keep_in_history: Whether does it record into history each start-stop interval, defaults to False
        :type keep_in_history: bool, optional
        :return: Start-stop interval
        :rtype: int
        Args:
            keep_in_history (bool, optional): Whether to record this start-stop interval
                in the history, defaults to False.
        Returns:
            int: Start-stop interval.
        """
        synchronize()
        end_time = time.time()
@ -57,26 +58,27 @@ class Timer:
    def get_history_mean(self):
        """Mean of all history start-stop time intervals.

        :return: Mean of time intervals
        :rtype: int
        Returns:
            int: Mean of time intervals.
        """
        return sum(self._history) / len(self._history)

    def get_history_sum(self):
        """Add up all the start-stop time intervals.

        :return: Sum of time intervals
        :rtype: int
        Returns:
            int: Sum of time intervals.
        """
        return sum(self._history)

    def get_elapsed_time(self):
        """Return the last start-stop time interval.

        .. note:: Use it only when timer is not in progress
        Returns:
            int: The last time interval.

        :return: The last time interval
        :rtype: int
        Note:
            Use it only when timer is not in progress
        """
        assert not self._started, 'Timer is still in progress'
        return self._elapsed
@ -90,10 +92,10 @@ class Timer:


class MultiTimer:
    """An object contains multiple timers
    """An object that contains multiple timers.

    :param on: Whether the timer is enabled. Default is True
    :type on: bool, optional
    Args:
        on (bool, optional): Whether the timer is enabled. Default is True.
    """

    def __init__(self, on: bool = True):
@ -101,10 +103,10 @@ class MultiTimer:
        self._timers = dict()

    def start(self, name: str):
        """Start namely one of the timers
        """Start namely one of the timers.

        :param name: Timer's key
        :type name: str
        Args:
            name (str): Timer's key.
        """
        if self._on:
            if name not in self._timers:
@ -114,10 +116,9 @@ class MultiTimer:
    def stop(self, name: str, keep_in_history: bool):
        """Stop namely one of the timers.

        :param name: Timer's key
        :type name: str
        :param keep_in_history: Whether does it record into history each start-stop interval
        :type keep_in_history: bool
        Args:
            name (str): Timer's key.
            keep_in_history (bool): Whether to record this start-stop interval in the history.
        """
        if self._on:
            return self._timers[name].stop(keep_in_history)
@ -127,17 +128,19 @@ class MultiTimer:
    def get_timer(self, name):
        """Get timer by its name (from multitimer)

        :param name: Timer's key
        :return: Timer with the name you give correctly
        :rtype: Timer
        Args:
            name (str): Timer's key.
        Returns:
            :class:`colossalai.utils.Timer`: The timer with the given name.
        """
        return self._timers[name]

    def reset(self, name=None):
        """Reset timers.

        :param name: If name is designated, the named timer will be reset and others will not, defaults to None
        :type name: optional
        Args:
            name (str, optional): If name is designated, the named timer will be reset
                and others will not, defaults to None.
        """
        if self._on:
            if name is not None:
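
A usage sketch for the timers documented above, assuming `MultiTimer` is importable from `colossalai.utils` (`model` and `data` are placeholders):

from colossalai.utils import MultiTimer   # assumed export location

timer = MultiTimer(on=True)
timer.start('forward')
output = model(data)
timer.stop('forward', keep_in_history=True)

fwd_timer = timer.get_timer('forward')
print(fwd_timer.get_history_mean())       # mean of the recorded forward intervals
timer.reset('forward')
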
@ -1,74 +1,74 @@
from functools import partial
import pytest
import torch
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
import colossalai
from colossalai.utils import free_port, get_current_device
from colossalai.nn.layer.moe import Top1Router, UniformNoiseGenerator, MoeLayer, Experts
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.utils.moe import sync_moe_model_param
from colossalai.engine.gradient_handler import MoeGradientHandler
from colossalai.testing import assert_equal_in_group
from colossalai.testing import rerun_on_exception

BATCH_SIZE = 4
DIM = 16
CONFIG = dict()


def run_test(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    expert_module = nn.Linear
    expert_factor = dict(in_features=DIM, out_features=DIM, device=get_current_device())

    MOE_CONTEXT.setup(42)    # MOE initialization
    noisy_func = UniformNoiseGenerator()
    router = Top1Router(noisy_func=noisy_func)
    num_experts_list = [1, 2, 4]
    layer_list = []
    for num_experts in num_experts_list:
        exp = Experts(expert_module, num_experts, **expert_factor)
        moe_layer = MoeLayer(DIM, num_experts, router, exp)
        layer_list.append(moe_layer)

    model = nn.Sequential(*layer_list)
    model = model.to(get_current_device())
    sync_moe_model_param(model)

    dist_dict = MOE_CONTEXT.parallel_info_dict
    assert_equal_in_group(layer_list[0].experts.experts[0].weight.data, dist_dict[1].dp_group)
    assert_equal_in_group(layer_list[1].experts.experts[0].weight.data, dist_dict[2].dp_group)
    # MoE model synchronization passed

    grad_handler = MoeGradientHandler(model, 0)

    rank = dist.get_rank()
    torch.cuda.manual_seed(78 + rank)
    data = torch.randn(BATCH_SIZE, DIM, device=get_current_device())
    grad = torch.randn_like(data)

    MOE_CONTEXT.reset_loss()
    outputs = model(data)
    outputs.backward(grad)
    grad_handler.handle_gradient()

    assert_equal_in_group(layer_list[0].experts.experts[0].weight.grad, dist_dict[1].dp_group)
    assert_equal_in_group(layer_list[0].experts.experts[0].bias.grad, dist_dict[1].dp_group)

    assert_equal_in_group(layer_list[1].experts.experts[0].weight.grad, dist_dict[2].dp_group)
    assert_equal_in_group(layer_list[1].experts.experts[0].bias.grad, dist_dict[2].dp_group)
    # MoE grad handler test passed


@pytest.mark.dist
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_grad_handler():
    world_size = 4
    run_func = partial(run_test, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_grad_handler()
@ -1,104 +1,104 @@
from functools import partial
import pytest
import torch
import torch.nn as nn
import torch.multiprocessing as mp
import colossalai
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils import free_port, get_current_device
from colossalai.nn.layer.moe import Top1Router, Top2Router, MoeLayer, Experts
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.testing import rerun_on_exception

BATCH_SIZE = 16
NUM_EXPERTS = 4
CONFIG = dict()


def check_equal(tensor_a, tensor_b, atol=1e-06):
    assert torch.allclose(tensor_a, tensor_b, rtol=0, atol=atol) is True


def run_routing(rank, world_size, port, rs=2, hidden_size=128, data_type=torch.float32, router=Top2Router):
    # Here we do not need TF32, since it brings absolute error on results
    torch.backends.cuda.matmul.allow_tf32 = False

    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    local_rank = gpc.get_local_rank(ParallelMode.GLOBAL)

    MOE_CONTEXT.setup(42)    # MOE environment initialization
    MOE_CONTEXT.reset_loss()
    torch.manual_seed(rs + local_rank)    # set each process has different random seed

    # get randomized data
    tokens = torch.randn(BATCH_SIZE, hidden_size, dtype=data_type, device=get_current_device(), requires_grad=True)

    expert_module = nn.Linear
    expert_factor = dict(in_features=hidden_size, out_features=hidden_size, device=get_current_device())
    expert = Experts(expert_module, NUM_EXPERTS, **expert_factor)
    layer = MoeLayer(hidden_size, NUM_EXPERTS, router(capacity_factor_train=1.0), expert)
    if data_type == torch.float16:
        layer = layer.half()

    # use matrix multiplication instead of COL_MOE_KERNL in MOE dispatch and combine
    layer.use_kernel = False
    old_out = layer(tokens)
    ech = old_out.shape
    grad = torch.randn(ech, device=get_current_device())
    old_out.backward(grad)    # get gradient

    # save all results
    o_tk_grad = tokens.grad.data.clone()
    o_gt_grad = layer.gate.weight.grad.data.clone()

    # reset all gradients
    tokens.grad.zero_()
    layer.gate.weight.grad.zero_()

    layer.use_kernel = True
    new_out = layer(tokens)    # get outputs through colossal kernel

    if data_type == torch.float32:
        check_equal(old_out, new_out)
    else:
        check_equal(old_out, new_out, 1e-2)
    # forward function passed

    new_out.backward(grad)    # get new type gradient
    n_tk_grad = tokens.grad.data.clone()
    n_gt_grad = layer.gate.weight.grad.data.clone()

    if data_type == torch.float32:
        check_equal(o_tk_grad, n_tk_grad)
    else:
        check_equal(o_tk_grad, n_tk_grad, 1e-2)
    # tokens gradient is correct

    if data_type == torch.float32:
        check_equal(o_gt_grad, n_gt_grad, 5e-05)
    else:
        check_equal(o_gt_grad, n_gt_grad, 2e-01)
    # bias gradient is correct


@pytest.mark.dist
@pytest.mark.parametrize("rs", [131])
@pytest.mark.parametrize("hidden_size", [32, 144])
@pytest.mark.parametrize("data_type", [torch.float32, torch.float16])
@pytest.mark.parametrize("router", [Top1Router, Top2Router])
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_moe_kernel(rs, hidden_size, data_type, router):
    world_size = 4
    run_func = partial(run_routing,
                       world_size=world_size,
                       port=free_port(),
                       rs=rs,
                       hidden_size=hidden_size,
                       data_type=data_type,
                       router=router)
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_moe_kernel(2, 256, torch.float16, Top2Router)
@ -1,71 +1,71 @@
from functools import partial
import pytest
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
import colossalai
from colossalai.utils import free_port, get_current_device
from colossalai.nn.layer.moe import Experts
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.utils.moe import sync_moe_model_param
from colossalai.testing import assert_equal_in_group, rerun_on_exception

D_MODEL = 4
D_FF = 8
CONFIG = dict()


def run_test(rank, port):
    world_size = 4
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    expert_module = nn.Linear
    expert_factor = dict(in_features=D_MODEL, out_features=D_FF, device=get_current_device())

    MOE_CONTEXT.setup(42)    # MOE environment initialization
    exp0 = Experts(expert_module, 1, **expert_factor)
    exp1 = Experts(expert_module, 2, **expert_factor)
    exp2 = Experts(expert_module, 4, **expert_factor)
    exp3 = Experts(expert_module, 8, **expert_factor)

    assert exp0.num_local_experts == 1
    assert exp1.num_local_experts == 1
    assert exp2.num_local_experts == 1
    assert exp3.num_local_experts == 2
    # experts deployment passed

    parallel_info_dict = MOE_CONTEXT.parallel_info_dict
    rank = dist.get_rank()

    assert len(parallel_info_dict) == 3
    assert dist.get_rank(parallel_info_dict[4].ep_group) == rank
    assert dist.get_rank(parallel_info_dict[2].ep_group) == rank % 2
    assert dist.get_rank(parallel_info_dict[1].ep_group) == 0

    assert dist.get_rank(parallel_info_dict[4].dp_group) == 0
    assert dist.get_rank(parallel_info_dict[2].dp_group) == rank // 2
    assert dist.get_rank(parallel_info_dict[1].dp_group) == rank
    # group creation passed

    model = nn.ModuleList([exp0, exp1, exp2, exp3])
    model = model.to(get_current_device())
    sync_moe_model_param(model)

    assert_equal_in_group(exp0.experts[0].weight.data, parallel_info_dict[1].dp_group)
    assert_equal_in_group(exp0.experts[0].bias.data, parallel_info_dict[1].dp_group)
    # MOE experts layout success when ep_size = 1

    assert_equal_in_group(exp1.experts[0].weight.data, parallel_info_dict[2].dp_group)
    assert_equal_in_group(exp1.experts[0].bias.data, parallel_info_dict[2].dp_group)
    # MOE experts layout success when ep_size = 2


@pytest.mark.dist
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_moe_initialization():
    world_size = 4
    run_func = partial(run_test, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_moe_initialization()