Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-09 04:50:17 +00:00)
Refactored docstring to google style
@@ -114,10 +114,13 @@ class CheckpointFunction(torch.autograd.Function):
def checkpoint(function, activation_offload ,*args):
    """Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint
    """Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint.

    :param function: Describe the forward pass function. It should know how to handle the input tuples.
    :param args: Tuple containing the parameters of the function
    :return: Output of running function with provided args
    Args:
        function: Describe the forward pass function. It should know how to handle the input tuples.
        args (list): Tuple containing the parameters of the function

    Returns:
        Output of running function with provided args.
    """
    return CheckpointFunction.apply(function, activation_offload, *args)
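A minimal usage sketch of the checkpoint API shown in this hunk, assuming it is importable from colossalai.utils and that Block is a user-defined module (both names are illustrative):

import torch.nn as nn
from colossalai.utils import checkpoint   # assumed import path

class Block(nn.Module):
    def __init__(self, dim=256):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))

    def forward(self, x):
        # Recompute self.net's activations during backward instead of storing them;
        # the second argument is activation_offload (False keeps activations on GPU).
        return checkpoint(self.net, False, x)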
@@ -50,17 +50,17 @@ def _get_standard_checkpoint_filename(epoch: int, suffix: str = ''):
def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''):
    """This is a function to generate the checkpoint path from the (checkpoint_dir, epoch, suffix, gpu_parallel_rank) tuple.
    """This is a function to generate the checkpoint path from the tuple
    (checkpoint_dir, epoch, suffix, gpu_parallel_rank).
    This is useful during generation and recuperation of the checkpoint.

    :param checkpoint_dir: Set up a directory for saving checkpoints
    :type checkpoint_dir: str
    :param epoch: Epoch number (indicate how many epochs have you trained this model)
    :type epoch: int
    :param suffix: Additional notation to specify the model or checkpoint, defaults to ''
    :type suffix: str, optional
    :return: Checkpoint path to be generated
    :rtype: path
    Args:
        checkpoint_dir (str): Set up a directory for saving checkpoints.
        epoch (int): Epoch number (indicate how many epochs have you trained this model).
        suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''

    Returns:
        str: The checkpoint path to be generated.
    """
    ckpt_filename = _get_standard_checkpoint_filename(epoch, suffix)
    return os.path.join(checkpoint_dir, ckpt_filename)
@@ -74,12 +74,13 @@ def _ensure_directory_exists(filename: str):
def get_latest_checkpoint_pattern(suffix: str = ''):
    """Generate Regular expression of latest checkpoint's pattern
    """Generate Regular expression of the latest checkpoint's pattern.

    :param suffix: Additional notation to specify the model or checkpoint, defaults to ''
    :type suffix: str, optional
    :return: Checkpoint pattern
    :rtype: regular expression
    Args:
        suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.

    Returns:
        str: The regular expression of checkpoint pattern.
    """
    ranks_name = _get_ranks_name()
    pattern = r'epoch(\d+)-{}{}\.pt'.format(ranks_name, suffix)
@@ -88,16 +89,19 @@ def get_latest_checkpoint_pattern(suffix: str = ''):
def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''):
    """This is a function to retrieve the latest checkpoint path from the (checkpoint_dir, suffix, gpu_parallel_rank) tuple.
    """This is a function to retrieve the latest checkpoint path from the tuple
    (checkpoint_dir, suffix, gpu_parallel_rank).
    This is useful during recuperation of the checkpoint, especially when you do not know the epoch number.

    :param checkpoint_dir: Directory for saving checkpoints
    :type checkpoint_dir: str
    :param suffix: Additional notation to specify the model or checkpoint, defaults to ''
    :type suffix: str, optional
    :raises FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given
    :return: The latest checkpoint path to be retrieved
    :rtype: path
    Args:
        checkpoint_dir (str): Directory for saving checkpoints
        suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''

    Returns:
        str: The latest retrieved checkpoint path.

    Raises:
        FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given.
    """
    CKPT_NAME_PAT = get_latest_checkpoint_pattern(suffix=suffix)
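A quick usage sketch combining the two path helpers above (directory and suffix values are illustrative):

ckpt_path = get_checkpoint_path('./checkpoints', epoch=10, suffix='_vit')    # e.g. ./checkpoints/epoch10-<ranks>_vit.pt
latest_path = get_latest_checkpoint_path('./checkpoints', suffix='_vit')     # raises FileNotFoundError if nothing matches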
@@ -126,22 +130,19 @@ def save_checkpoint(checkpoint_path: str,
                    optimizer: torch.optim.Optimizer,
                    lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
                    **kwargs):
    """Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as model,
    optimizer, lr_scheduler and etc. into a checkpoint dictionary.
    """Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as
    model, optimizer, lr_scheduler etc. into a checkpoint dictionary.

    This method can be used for both colosalai nn.BaseModel and normal pytorch nn.Module.
    This method can be used for both :class:`colossalai.nn.BaseModel` and normal :class:`torch.nn.Module`.

    :param checkpoint_path: Set up a directory for saving checkpoints
    :type checkpoint_path: str
    :param epoch: Epoch number (indicate how many epochs have you trained this model)
    :type epoch: int
    :param model: Model to be registered
    :type model: torch.nn.Module
    :param optimizer: Optimizer to be registered
    :type optimizer: torch.optim.Optimizer
    :param lr_scheduler: lr_scheduler to be registered, defaults to None
    :type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
    Args:
        checkpoint_path (str): Set up a directory for saving checkpoints.
        epoch (int): Epoch number (indicate how many epochs have you trained this model).
        model (:class:`torch.nn.Module`): Model to be registered.
        optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to be registered.
        lr_scheduler (Union[:class:`torch.optim.lr_scheduler`,
            :class:`colossalai.nn.lr_scheduler`], optional): lr_scheduler to be registered, defaults to None.
        kwargs (dict): additional parameters to be saved.
    """
    # for compatibility with normal pytorch nn.Module
    if hasattr(model, 'state_dict_for_save_checkpoint'):
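A hypothetical end-of-epoch call to the function documented above; epoch, model, optimizer, lr_scheduler and best_acc are assumed to exist in the training script, and best_acc ends up in the checkpoint dictionary via **kwargs:

ckpt_path = get_checkpoint_path('./checkpoints', epoch=epoch)
save_checkpoint(ckpt_path, epoch, model, optimizer, lr_scheduler=lr_scheduler, best_acc=best_acc)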
@@ -165,31 +166,31 @@ def load_checkpoint(checkpoint_path: str,
                    finetune: bool = False,
                    strict: bool = True) -> Tuple:
    """Loads the checkpoint file.

    If finetune is False, then we intend to continue/resume the training process from the checkpoint given.
    So we copy parameters and buffers from state_dict into these modules(model, optimizer,lr_scheduler)
    and its descendants.
    If finetune is True, then only the weights and buffers of model should be reload.
    If strict is True, then the keys of state_dict must exactly match the keys returned by this module’s
    state_dict() function.
    and its descendants.

    :param checkpoint_path: The exact and matched checkpoint_path directory to retrieve appropriate state_dict
    :type checkpoint_path: str
    :param model: Model to reload parameters and buffers
    :type model: torch.nn.Module
    :param optimizer: Optimizer to recuperate
    :type optimizer: torch.optim.Optimizer
    :param lr_scheduler: lr_scheduler to recuperate, defaults to None
    :type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
    :param finetune: Whether to finetune the model with new dataset or continue the pre-training, defaults to False
    :type finetune: bool, optional
    :param strict: Whether to strictly enforce that the keys in
        :attr:`state_dict` of the checkpoint match the names of
        parameters and buffers in model., defaults to True
    :type strict: bool, optional
    :raises ValueError: Raise error if the model/optimizer cannot successfully be recuperated
    :return: (the epoch number of the checkpoint retrieved, the checkpoint retrieved)
    :rtype: Tuple
    If finetune is True, then only the weights and buffers of model should be reloaded.
    If strict is True, then the keys of state_dict must exactly match the keys returned
    by this module’s state_dict() function.

    Args:
        checkpoint_path (str): The exact and matched checkpoint_path directory to retrieve appropriate state_dict.
        model (:class:`torch.nn.Module`): Model to reload parameters and buffers.
        optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional):
            lr_scheduler to recuperate, defaults to None.
        finetune (bool, optional): Whether to finetune the model with new dataset or
            continue the pre-training, defaults to False.
        strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict`
            of the checkpoint match the names of parameters and buffers in model, defaults to True.

    Returns:
        Tuple(int, ``checkpoint``): The tuple (the epoch number of the checkpoint retrieved, the checkpoint retrieved).

    Raises:
        ValueError: Raise error if the model/optimizer cannot successfully be recuperated
    """
    # Load the checkpoint.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
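A sketch of resuming training with the function above; num_epochs and the per-epoch training step are assumptions made for illustration:

last_epoch, ckpt = load_checkpoint(get_latest_checkpoint_path('./checkpoints'),
                                   model, optimizer, lr_scheduler=lr_scheduler,
                                   finetune=False, strict=True)
for epoch in range(last_epoch + 1, num_epochs):
    train_one_epoch(model, optimizer)    # hypothetical training step defined elsewhere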
@@ -27,10 +27,10 @@ from .multi_tensor_apply import multi_tensor_applier
def print_rank_0(msg: str, logger=None):
    """Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.

    :param msg: A string message to output
    :type msg: str
    :param logger: Python logger object, defaults to None
    :type logger: optional
    Args:
        msg (str): A string message to output.
        logger (:class:`colossalai.logging.DistributedLogger`, optional):
            The logger to record the message, defaults to None.
    """
    if gpc.get_global_rank() == 0:
        if logger is None:
@@ -53,12 +53,15 @@ def free_port():
def sync_model_param(model, parallel_mode):
    """Make sure data parameters are consistent during Data Parallel Mode
    r"""Make sure data parameters are consistent during Data Parallel Mode.

    :param model: A pyTorch nn.model on whose parameters you check the consistency
    :param parallel_mode: Parallel mode to be checked
    :type model: torch.nn.Module
    :type parallel_mode: colossalai.context.ParallelMode
    Args:
        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked.

    Note:
        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
    """
    if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
        for param in model.parameters():
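A rough sketch of the idea behind the synchronization described above, under the assumption that consistency is achieved by broadcasting one rank's copy of each parameter across the process group (this is not the library's exact code):

import torch.distributed as dist

def _sync_params_sketch(model, group, src_rank=0):
    # src_rank is assumed to be the global rank that holds the reference copy.
    for param in model.parameters():
        dist.broadcast(param.data, src=src_rank, group=group)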
@@ -146,18 +149,19 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    """Clips gradient norm of an iterable of parameters whose gradients are in fp32.

    This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.
    added functionality to handle model parallel parameters.

    :param parameters: An iterable of Tensors or a single Tensor that will have gradients normalized
    :type parameters: (Iterable[Tensor] or Tensor)
    :param max_norm: Max norm of the gradients
    :type max_norm: float or int
    :param norm_type: Type of the used p-norm. Can be ``'inf'`` for infinity norm.
    :type norm_type: float or int
    Note:
        the gradients are modified in place.

    :return: Total norm of the parameters (viewed as a single vector).
    :rtype: float
    Args:
        parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`):
            An iterable of Tensors or a single Tensor that will have gradients normalized.
        max_norm (Union[float, int]): Max norm of the gradients.
        norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm.

    Returns:
        float: Total norm of the parameters.
    """

    if isinstance(parameters, torch.Tensor):
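For reference, a simplified sketch of gradient clipping by total 2-norm; the model-parallel reduction that the real function adds is omitted here:

import torch

def _clip_grad_norm_sketch(parameters, max_norm, norm_type=2.0):
    grads = [p.grad for p in parameters if p.grad is not None]
    # Total norm of all gradients viewed as a single vector.
    total_norm = torch.norm(torch.stack([torch.norm(g, norm_type) for g in grads]), norm_type)
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for g in grads:
            g.mul_(clip_coef)    # gradients are modified in place, as the Note above states
    return float(total_norm)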
@@ -19,18 +19,15 @@ T_co = TypeVar('T_co', covariant=True)
@DATA_SAMPLERS.register_module
class DataParallelSampler(Sampler):
    """A data sampler for distributed data parallelism
    """A data sampler for distributed data parallelism.

    :param dataset: A Dataset instance
    :type dataset: torch.utils.data.Dataset
    :param shuffle: Whether to shuffle data, defaults to False
    :type shuffle: bool, optional
    :param seed: The random seed, defaults to 0
    :type seed: int, optional
    :param drop_last: Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch
        size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller,
        defaults to False
    :type drop_last: bool, optional
    Args:
        dataset (:class:`torch.utils.data.Dataset`): The Dataset for sampling.
        shuffle (bool, optional): Whether to shuffle data, defaults to False.
        seed (int, optional): The random seed used for sampling, defaults to 0.
        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
            is not divisible by the batch size. If False and the size of dataset is not divisible by
            the batch size, then the last batch will be smaller, defaults to False.
    """

    def __init__(self,
@@ -104,8 +101,8 @@ class DataParallelSampler(Sampler):
        use a different random ordering for each epoch. Otherwise, the next iteration of this
        sampler will yield the same ordering.

        :param epoch: Epoch number.
        :type epoch: int
        Args:
            epoch (int): Epoch number.
        """
        self.epoch = epoch
@@ -118,29 +115,27 @@ def get_dataloader(dataset,
                   pin_memory=False,
                   num_workers=0,
                   **kwargs):
    """Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
    r"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)

    .. note:: When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
        on the 1st stage and label on the last stage
    Note:
        When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
        on the 1st stage and label on the last stage.

    :param dataset: A :class:`torch.utils.data.Dataset` object
    :param shuffle: Whether to shuffle the dataset
    :param seed: Random worker seed, defaults to 1024
    :param add_sampler: Add DistributedDataParallelSampelr to the dataset
    :param drop_last: Drop the last incomplete batch of data
    :param pin_memory: Whether to pin memory address in CPU memory
    :param num_workers: Number of worker threads for this dataloader
    Args:
        dataset (:class:`torch.utils.data.Dataset`): The dataset to be loaded.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
        seed (int, optional): Random worker seed for sampling, defaults to 1024.
        add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
            is not divisible by the batch size. If False and the size of dataset is not divisible by
            the batch size, then the last batch will be smaller, defaults to False.
        pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
        num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
        kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
            `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.

    :type dataset: :class:`torch.utils.data.Dataset`
    :type shuffle: bool, optional. Default is False
    :type seed: int, optional. Default is 1024
    :type add_sampler: bool, optional. Default is True
    :type drop_last: bool, optional. Default is False
    :type pin_memory: bool, optional. Default is False
    :type num_workers: int, optional. Default is 0

    :return: A object of :class:`torch.utils.data.DataLoader`
    :rtype: :class:`torch.utils.data.DataLoader`
    Returns:
        :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
    """
    _kwargs = kwargs.copy()
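A hypothetical call to the function above; the dataset and batch size are only illustrative, and batch_size is assumed to pass through **kwargs to torch.utils.data.DataLoader:

from torchvision import transforms
from torchvision.datasets import CIFAR10

train_set = CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())
train_loader = get_dataloader(train_set, shuffle=True, seed=1024, add_sampler=True,
                              drop_last=True, pin_memory=True, num_workers=4, batch_size=128)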
@@ -13,19 +13,25 @@ def accumulate_gradient(model: nn.Module,
                        accumulate_size: int,
                        gradient_handlers: List[BaseGradientHandler] = None,
                        lr_scheduler: _LRScheduler = None):
    """
    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param dataloader: your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    :param gradient_handlers: list of gradient handler objects. Default is None
    :type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
    :param lr_scheduler: your lr scheduler object. Default is None
    :type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
    r"""Turning model, optimizer, dataloader into corresponding object for gradient accumulation.

    Args:
        model (:class:`torch.nn.Module`): your model object for gradient accumulation.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object for gradient accumulation.
        dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
            your dataloader object, would be called like iter(dataloader)
        accumulate_size (int): the number of steps to accumulate gradients
        gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
            list of gradient handler objects. Default is None.
        lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
            your ``lr_scheduler`` object for gradient accumulation. Defaults to None.

    More details about `gradient_handlers` could be found in
    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.

    More details about `lr_scheduler` could be found
    `lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_. and
    `how to adjust learning rate <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
    """
    optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
    dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
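A sketch of how the wrappers created above might be used in a training loop; the return order, the criterion, and the pre-existing model/optimizer/train_loader are all assumptions:

optimizer, dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(
    model, optimizer, train_loader, accumulate_size=4, lr_scheduler=lr_scheduler)

for img, label in dataloader:            # the wrapper drops trailing batches that cannot form a full cycle
    loss = criterion(model(img), label)  # criterion assumed defined elsewhere
    loss.backward()
    optimizer.step()                     # the wrapper is assumed to defer the real update to every 4th call
    optimizer.zero_grad()                # likewise assumed to be deferred to the accumulation boundary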
@@ -15,15 +15,13 @@ from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer):
    """A wrapper for the optimizer to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param optim: Your optimizer object
    :type optim: :class:`torch.optim.Optimizer`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    :param model: Your model object to check if it is DDP for special handling of no_sync() context
    :type model: :class:`torch.nn.Module`
    before accumulation size is reached.

    Args:
        optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
        model (:class:`torch.nn.Module`):
            Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
    """

    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
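A minimal sketch of the skip-until-threshold behaviour this wrapper documents (not the library's actual implementation):

class _AccumOptimizerSketch:
    def __init__(self, optim, accumulate_size):
        self.optim = optim
        self.accumulate_size = accumulate_size
        self.count = 0

    def step(self):
        self.count += 1
        if self.count % self.accumulate_size == 0:
            self.optim.step()        # real parameter update at the accumulation boundary
            self.optim.zero_grad()   # clear the accumulated gradients
        # otherwise do nothing: gradients from backward() keep accumulating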
@@ -76,18 +74,18 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumDataloader:
    """A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps.
    """A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.

    For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will
    be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
    Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
    (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.

    :param dataloader: Your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    Note:
        The dataloader would drop the last incomplete steps for gradient accumulation.
        For example, if a dataloader has 10 batches of data and accumulate size is 4. The model parameters will
        be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
        Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
        (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.

    Args:
        optim (``Iterable``): Your dataloader object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
    """

    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
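The arithmetic behind the 10-batch example in the docstring above, shown explicitly:

num_batches, accumulate_size = 10, 4
usable_batches = num_batches - num_batches % accumulate_size   # 8 batches are yielded
num_updates = usable_batches // accumulate_size                # 2 parameter updates (steps 4 and 8)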
@@ -125,13 +123,12 @@ class GradAccumDataloader:
class GradAccumLrSchedulerByStep(_LRScheduler):
    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param lr_scheduler: Your lr scheduler object
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    before accumulation size is reached.

    Args:
        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
            Your ``lr_scheduler`` object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
    """

    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
@@ -171,13 +168,16 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class GradAccumGradientHandler:
    """A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached
    r"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached.

    :param grad_handler: Your gradient handler object
    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    Args:
        grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
            Your ``gradient_handler`` object for gradient accumulation, would be called when achieving `accumulate_size`.
        accumulate_size (int): The number of steps to accumulate gradients.

    More details about ``gradient_handlers`` could be found in
    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.

    """
@@ -14,12 +14,13 @@ from typing import Optional
def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
    """
    Get the free memory info of device.
    :param device: a torch device instance or None
    :type device: Optional[torch.device]
    :return: current memory usage, sized by Byte
    :rtype: int
    """Get the free memory info of device.

    Args:
        device (Optional[``torch.device``]): a torch device instance or None. Defaults None.

    Returns:
        int: current memory usage, sized by Byte.
    """
    if device:
        assert device.type == 'cuda'
@@ -34,7 +35,7 @@ def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
def bytes_to_GB(val, decimal=2):
    """A byte-to-Gigabyte converter, defaultly using binary notation.
    """A byte-to-Gigabyte converter, default using binary notation.

    :param val: X bytes to convert
    :return: X' GB
@@ -43,7 +44,7 @@ def bytes_to_GB(val, decimal=2):
def bytes_to_MB(val, decimal=2):
    """A byte-to-Megabyte converter, defaultly using binary notation.
    """A byte-to-Megabyte converter, default using binary notation.

    :param val: X bytes to convert
    :return: X' MB
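Binary notation here means powers of two, so the converters above presumably divide by 2**30 and 2**20 respectively; a small sketch of the GB case:

def _bytes_to_gb_sketch(val, decimal=2):
    # 1 GB = 2 ** 30 bytes in binary notation
    return round(val / 2 ** 30, decimal)

_bytes_to_gb_sketch(3 * 2 ** 30)    # -> 3.0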
@@ -54,13 +55,13 @@ def bytes_to_MB(val, decimal=2):
def report_memory_usage(message, logger=None, report_cpu=False):
    """Calculate and print RAM usage (in GB)

    :param message: A prefix message to add in the log
    :type message: str
    :param logger: An instance of :class:`colossalai.logging.DistributedLogger`
    :type logger: :class:`colossalai.logging.DistributedLogger`, optional
    :param report_cpu: Whether to report CPU memory
    :type report_cpu: bool, optional
    :raises EnvironmentError: Raise error if no distributed environment has been initialized
    Args:
        message (str): A prefix message to add in the log.
        logger (:class:`colossalai.logging.DistributedLogger`): The logger used to record memory information.
        report_cpu (bool, optional): Whether to report CPU memory.

    Raises:
        EnvironmentError: Raise error if no distributed environment has been initialized.
    """
    if not gpc.is_initialized(ParallelMode.GLOBAL):
        raise EnvironmentError("No distributed environment is initialized")
@@ -12,8 +12,8 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
    size of every parameter. Since the parameters in data parallelism is replicated
    in each GPU, we set their ep_size to 1.

    :param model: A pyTorch nn.model from which we get dict
    :type model: torch.nn.Module
    Args:
        model (:class:`torch.nn.Module`): A pyTorch `nn.Module` from which we get dict.
    """
    epsize_param_dict = dict()
    for param in model.parameters():
@@ -29,10 +29,10 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
def sync_moe_model_param(model: nn.Module):
    """Make sure model parameters are consistent in MoE parallel context
    """Make sure model parameters are consistent in MoE parallel context.

    :param model: A pyTorch nn.model on whose parameters you check the consistency
    :type model: torch.nn.Module
    Args:
        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
    """
    if is_using_ddp():
@@ -3,10 +3,10 @@
class MultiTensorApply(object):
    """
    Apply an operation to a list of tensors efficiently
    Apply an operation to a list of tensors efficiently.

    :param chunk_size: Size of a chunk
    :type chunk_size: int
    Args:
        chunk_size (int): Size of a chunk.
    """

    available = False
@@ -9,6 +9,7 @@ from collections import defaultdict
LINE_WIDTH = 108
LINE = '-' * LINE_WIDTH + '\n'


class TensorDetector():
    def __init__(self,
                 show_info: bool = True,
@@ -16,17 +17,14 @@ class TensorDetector():
                 include_cpu: bool = False,
                 module: Optional[nn.Module] = None
                 ):
        """This class is an detector to detect tensor on different devices.

        :param show_info: whether to print the info on screen, default True
        :type show_info: bool
        :param log: the file name to save the log
        :type log: str
        :param include_cpu: whether to detect tensor on cpu, default False
        :type include_cpu: bool
        :param module: when sending an `nn.Module` it, the detector can name the tensors detected better
        :type module: Optional[nn.Module]
        """This class is a detector to detect tensor on different devices.

        Args:
            show_info (bool, optional): whether to print the info on screen, default True.
            log (str, optional): the file name to save the log. Defaults to None.
            include_cpu (bool, optional): whether to detect tensor on cpu, default False.
            module (Optional[:class:`nn.Module`]): when sending an ``nn.Module`` object,
                the detector can name the tensors detected better.
        """
        self.show_info = show_info
        self.log = log
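A hypothetical usage of the detector configured above; the model is only illustrative, and detect() is the method shown later in this diff:

import torch.nn as nn

model = nn.Linear(1024, 1024).cuda()
detector = TensorDetector(show_info=True, log='tensor_log', include_cpu=False, module=model)
detector.detect()    # prints the live-tensor table and appends it to tensor_log.log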
@@ -48,7 +46,6 @@ class TensorDetector():
            self.tensor_info[id(param)].append(param.requires_grad)
            self.tensor_info[id(param)].append(param.dtype)
            self.tensor_info[id(param)].append(self.get_tensor_mem(param))

    def get_tensor_mem(self, tensor):
        # calculate the memory occupied by a tensor
@@ -58,7 +55,6 @@ class TensorDetector():
            memory_size += grad_memory_size
        return self.mem_format(memory_size)

    def mem_format(self, real_memory_size):
        # format the tensor memory into a reasonal magnitude
        if real_memory_size >= 2 ** 30:
@@ -68,7 +64,6 @@ class TensorDetector():
        if real_memory_size >= 2 ** 10:
            return str(real_memory_size / (2 ** 10)) + ' KB'
        return str(real_memory_size) + ' B'

    def collect_tensors_state(self):
        for obj in gc.get_objects():
@@ -116,7 +111,6 @@ class TensorDetector():
            if obj.device not in self.devices:
                self.devices.append(obj.device)

    def print_tensors_state(self):
        template_format = '{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}'
        self.info += LINE
@@ -173,7 +167,6 @@ class TensorDetector():
        if self.log is not None:
            with open(self.log + '.log', 'a') as f:
                f.write(self.info)

    def detect(self, include_cpu = False):
        self.include_cpu = include_cpu
@@ -25,7 +25,7 @@ class Timer:
        return time.time()

    def start(self):
        """Fisrtly synchronize cuda, reset the clock and then start the timer.
        """Firstly synchronize cuda, reset the clock and then start the timer.
        """
        self._elapsed = 0
        synchronize()
@@ -40,10 +40,11 @@ class Timer:
    def stop(self, keep_in_history: bool = False):
        """Stop the timer and record the start-stop time interval.

        :param keep_in_history: Whether does it record into history each start-stop interval, defaults to False
        :type keep_in_history: bool, optional
        :return: Start-stop interval
        :rtype: int
        Args:
            keep_in_history (bool, optional): Whether does it record into history
                each start-stop interval, defaults to False.

        Returns:
            int: Start-stop interval.
        """
        synchronize()
        end_time = time.time()
@@ -57,26 +58,27 @@ class Timer:
    def get_history_mean(self):
        """Mean of all history start-stop time intervals.

        :return: Mean of time intervals
        :rtype: int
        Returns:
            int: Mean of time intervals
        """
        return sum(self._history) / len(self._history)

    def get_history_sum(self):
        """Add up all the start-stop time intervals.

        :return: Sum of time intervals
        :rtype: int
        Returns:
            int: Sum of time intervals.
        """
        return sum(self._history)

    def get_elapsed_time(self):
        """Return the last start-stop time interval.

        .. note:: Use it only when timer is not in progress
        Returns:
            int: The last time interval.

        :return: The last time interval
        :rtype: int
        Note:
            Use it only when timer is not in progress
        """
        assert not self._started, 'Timer is still in progress'
        return self._elapsed
@@ -90,10 +92,10 @@ class Timer:
class MultiTimer:
    """An object contains multiple timers
    """An object contains multiple timers.

    :param on: Whether the timer is enabled. Default is True
    :type on: bool, optional
    Args:
        on (bool, optional): Whether the timer is enabled. Default is True.
    """

    def __init__(self, on: bool = True):
@@ -101,10 +103,10 @@ class MultiTimer:
        self._timers = dict()

    def start(self, name: str):
        """Start namely one of the timers
        """Start namely one of the timers.

        :param name: Timer's key
        :type name: str
        Args:
            name (str): Timer's key.
        """
        if self._on:
            if name not in self._timers:
@@ -114,10 +116,9 @@ class MultiTimer:
    def stop(self, name: str, keep_in_history: bool):
        """Stop namely one of the timers.

        :param name: Timer's key
        :type name: str
        :param keep_in_history: Whether does it record into history each start-stop interval
        :type keep_in_history: bool
        Args:
            name (str): Timer's key.
            keep_in_history (bool): Whether does it record into history each start-stop interval.
        """
        if self._on:
            return self._timers[name].stop(keep_in_history)
@@ -127,17 +128,19 @@ class MultiTimer:
    def get_timer(self, name):
        """Get timer by its name (from multitimer)

        :param name: Timer's key
        :return: Timer with the name you give correctly
        :rtype: Timer
        Args:
            name (str): Timer's key.

        Returns:
            :class:`colossalai.utils.Timer`: Timer with the name you give correctly.
        """
        return self._timers[name]

    def reset(self, name=None):
        """Reset timers.

        :param name: If name is designated, the named timer will be reset and others will not, defaults to None
        :type name: optional
        Args:
            name (str, optional): If name is designated, the named timer will be reset
                and others will not, defaults to None.
        """
        if self._on:
            if name is not None:
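A hypothetical usage of MultiTimer built from the methods documented above; model and data are assumed to exist elsewhere:

timer = MultiTimer(on=True)
timer.start('forward')
output = model(data)                                       # model/data assumed defined elsewhere
fwd_time = timer.stop('forward', keep_in_history=True)     # interval for this start-stop pair
mean_fwd = timer.get_timer('forward').get_history_mean()   # average over recorded intervals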