mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-15 22:19:38 +00:00

update examples and sphinx docs for the new api (#63)
@@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
        added functionality to handle model parallel parameters. Note that
        the gradients are modified in place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.
    :param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
    :type parameters: (Iterable[Tensor] or Tensor)
    :param max_norm: max norm of the gradients
    :type max_norm: float or int
    :param norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm.
    :type norm_type: float or int

    Returns:
        Total norm of the parameters (viewed as a single vector).
    :return: Total norm of the parameters (viewed as a single vector).
    :rtype: float
    """

    if isinstance(parameters, torch.Tensor):

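A minimal usage sketch for this function. The import path and the model/optimizer/criterion objects are assumptions supplied by the surrounding training script, not part of this diff:

    # Sketch only: clip gradients in fp32 before the optimizer step.
    from colossalai.utils import clip_grad_norm_fp32

    def train_step(model, optimizer, criterion, data, target, max_norm=1.0):
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        # Gradients are modified in place; the total norm is returned as a float.
        total_norm = clip_grad_norm_fp32(model.parameters(), max_norm=max_norm, norm_type=2)
        optimizer.step()
        return loss.item(), total_norm
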
@@ -123,12 +123,23 @@ def get_dataloader(dataset,
        stage and label on the last stage

    :param dataset: a :class:utils.data.dataset dataset
    :param shuffle: whether to shuffle the dataset
    :param seed: random worker seed, defaults to 1024
    :type seed: int, optional
    :param add_sampler_if_possible: [description], defaults to False
    :type add_sampler_if_possible: bool, optional
    :return: a :class:utils.data.dataset dataloader
    :rtype: torch.utils.data.dataset
    :param add_sampler: add DistributedDataParallelSampler to the dataset
    :param drop_last: drop the last incomplete batch of data
    :param pin_memory: whether to pin memory address in CPU memory
    :param num_workers: number of worker threads for this dataloader

    :type dataset: :class:`torch.utils.data.Dataset`
    :type shuffle: bool, optional. Default is False
    :type seed: int, optional. Default is 1024
    :type add_sampler: bool, optional. Default is True
    :type drop_last: bool, optional. Default is False
    :type pin_memory: bool, optional. Default is False
    :type num_workers: int, optional. Default is 0

    :return: an object of :class:`torch.utils.data.DataLoader`
    :rtype: :class:`torch.utils.data.DataLoader`
    '''
    _kwargs = kwargs.copy()

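A usage sketch, assuming get_dataloader is exposed from colossalai.utils and that extra keyword arguments (such as batch_size) are forwarded to torch.utils.data.DataLoader; the dataset below is a placeholder:

    from torchvision import datasets, transforms
    from colossalai.utils import get_dataloader

    train_dataset = datasets.CIFAR10(root='./data', train=True, download=True,
                                     transform=transforms.ToTensor())
    # add_sampler=True attaches a distributed sampler so each rank sees a distinct shard.
    train_dataloader = get_dataloader(train_dataset,
                                      shuffle=True,
                                      add_sampler=True,
                                      drop_last=True,
                                      pin_memory=True,
                                      num_workers=2,
                                      batch_size=64)
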
@@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module,
                        accumulate_size: int,
                        gradient_handlers: List[BaseGradientHandler] = None,
                        lr_scheduler: _LRScheduler = None):
    """
    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param dataloader: your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    :param gradient_handlers: list of gradient handler objects. Default is None
    :type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
    :param lr_scheduler: your lr scheduler object. Default is None
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    """
    optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
    dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)

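A sketch of how this helper might be wired into a training loop. The import path and the return signature (a tuple of the wrapped optimizer, dataloader, gradient handlers and LR scheduler) are assumptions, not confirmed by this diff:

    from colossalai.engine.gradient_accumulation import accumulate_gradient

    optimizer, dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(
        model=model,
        optimizer=optimizer,
        dataloader=train_dataloader,
        accumulate_size=4,
        lr_scheduler=lr_scheduler)

    for data, label in dataloader:       # the last incomplete cycle of batches is dropped
        loss = criterion(model(data), label)
        optimizer.backward(loss)         # assumed ColossalaiOptimizer-style backward
        optimizer.step()                 # parameters update only every 4th call
        lr_scheduler.step()              # LR advances only every 4th call
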
@@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler


class GradAccumOptimizer(ColossalaiOptimizer):
    """A wrapper for the optimizer to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param optim: your optimizer object
    :type optim: :class:`torch.optim.Optimizer`
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    :param model: your model object to check if it is DDP for special handling of no_sync() context
    :type model: :class:`torch.nn.Module`

    """

    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
        super().__init__(optim)

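The step-skipping pattern behind this wrapper, sketched in plain PyTorch as an illustration of the idea (not the actual GradAccumOptimizer implementation):

    class StepSkippingOptimizer:
        """Accumulate gradients and only apply them every `accumulate_size` calls."""

        def __init__(self, optim, accumulate_size):
            self.optim = optim
            self.accumulate_size = accumulate_size
            self.accumulate_step = 0

        def backward(self, loss):
            # Scale the loss so the accumulated gradient matches a large-batch gradient.
            (loss / self.accumulate_size).backward()

        def step(self):
            self.accumulate_step += 1
            if self.accumulate_step % self.accumulate_size == 0:
                self.optim.step()        # real parameter update
                self.optim.zero_grad()   # reset for the next accumulation cycle
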
@@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer):


class GradAccumDataloader():
    """A wrapper for the dataloader to enable gradient accumulation by dropping the last incomplete steps.

    For example, if a dataloader has 10 batches of data and the accumulate size is 4, the model parameters will
    be updated only twice, at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
    Thus, they will be automatically skipped by this class. If the dataloader is not a standard PyTorch dataloader
    (e.g. a DALI dataloader), this class will automatically consume (load and discard) the remaining 2 batches.

    :param dataloader: your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int

    """

    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
        self.dataloader = dataloader

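The number of usable batches is simply the largest multiple of accumulate_size that fits, as in this small sketch (an illustration of the arithmetic, not the class's actual code):

    def usable_batches(num_batches: int, accumulate_size: int) -> int:
        # Drop the trailing batches that cannot complete an accumulation cycle.
        return (num_batches // accumulate_size) * accumulate_size

    usable_batches(10, 4)  # -> 8: updates happen at steps 4 and 8, batches 9 and 10 are skipped
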
@@ -99,6 +123,15 @@ class GradAccumDataloader():


class GradAccumLrSchedulerByStep(_LRScheduler):
    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param lr_scheduler: your lr scheduler object
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int

    """

    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
        self.lr_scheduler = lr_scheduler

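A short usage sketch; the optimizer is a placeholder and the wrapper is assumed to forward step() to the inner scheduler only once per accumulation cycle, as the class description states:

    from torch.optim.lr_scheduler import CosineAnnealingLR

    base_scheduler = CosineAnnealingLR(optimizer, T_max=100)
    scheduler = GradAccumLrSchedulerByStep(base_scheduler, accumulate_size=4)

    for step in range(8):
        scheduler.step()  # the underlying learning rate only advances at calls 4 and 8
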
@@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler):


class GradAccumGradientHandler():
    """A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param grad_handler: your gradient handler object
    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int

    """

    def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
        assert isinstance(grad_handler, BaseGradientHandler), \

@@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False):

    :param message: a prefix message to add in the log
    :type message: str
    :param logger: an instance of :class:`colossalai.logging.DistributedLogger`
    :type logger: :class:`colossalai.logging.DistributedLogger`
    :param report_cpu: whether to report CPU memory
    :type report_cpu: bool
    :raises EnvironmentError: raised if no distributed environment has been initialized
    '''
    if not gpc.is_initialized(ParallelMode.GLOBAL):

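A minimal usage sketch, assuming report_memory_usage is importable from colossalai.utils and that a distributed environment has already been launched (otherwise it raises EnvironmentError):

    from colossalai.utils import report_memory_usage

    # Log GPU (and optionally CPU) memory usage after the model is built.
    report_memory_usage("after model initialization", report_cpu=True)
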
@@ -2,6 +2,13 @@


class MultiTensorApply(object):
    """
    Apply an operation to a list of tensors efficiently

    :param chunk_size: size of a chunk
    :type chunk_size: int
    """

    available = False
    warned = False

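The idea behind a multi-tensor apply is to process many tensors in fixed-size chunks instead of launching one operation per tensor. A rough illustration in plain Python (not the actual fused-kernel implementation used here):

    import torch

    def chunked_apply(op, tensors, chunk_size):
        """Apply `op` to `tensors` one chunk at a time."""
        for start in range(0, len(tensors), chunk_size):
            chunk = tensors[start:start + chunk_size]
            op(chunk)  # e.g. a fused kernel acting on the whole chunk at once

    # Example: scale a list of gradients in chunks of 4 tensors.
    grads = [torch.randn(10) for _ in range(10)]
    chunked_apply(lambda ts: [t.mul_(0.5) for t in ts], grads, chunk_size=4)
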
@@ -74,6 +74,9 @@ class Timer:


class MultiTimer:
    '''An object containing multiple timers

    :param on: whether the timer is enabled. Default is True
    :type on: bool
    '''

    def __init__(self, on: bool = True):

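A short usage sketch; the start/stop method names and their arguments are assumptions based on typical timer utilities, not confirmed by this diff:

    timer = MultiTimer(on=True)

    timer.start('forward')    # begin timing a named section
    output = model(data)      # placeholder for the code being timed
    timer.stop('forward', keep_in_history=True)
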