update examples and sphnix docs for the new api (#63)
@@ -16,6 +16,22 @@ def convert_to_amp(model: nn.Module,
                   criterion: _Loss,
                   mode: AMP_TYPE,
                   amp_config: Config = None):
    """A helper function to wrap training components with AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param criterion: your loss function object
    :type criterion: :class:`torch.nn.modules.loss._Loss`
    :param mode: amp mode
    :type mode: :class:`colossalai.amp.AMP_TYPE`
    :param amp_config: configuration for different amp modes
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer, criterion)
    :rtype: Tuple
    """
    assert isinstance(mode, AMP_TYPE), \
        f'expected the argument mode to be AMP_TYPE, but got {type(mode)}'
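For context, a minimal usage sketch of the dispatcher above. It assumes convert_to_amp and AMP_TYPE are importable from colossalai.amp (as the docstring cross-references suggest), that AMP_TYPE.TORCH is a valid enum member, and that a CUDA device is available.

import torch
import torch.nn as nn
from colossalai.amp import convert_to_amp, AMP_TYPE  # import path assumed

model = nn.Linear(32, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# mode must be an AMP_TYPE member, otherwise the assertion above fires
model, optimizer, criterion = convert_to_amp(model, optimizer, criterion,
                                             mode=AMP_TYPE.TORCH,
                                             amp_config=None)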
@@ -7,6 +7,18 @@ import apex.amp as apex_amp
def convert_to_apex_amp(model: nn.Module,
                        optimizer: Optimizer,
                        amp_config):
    """A helper function to wrap training components with Apex AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param amp_config: configuration for nvidia apex
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer)
    :rtype: Tuple
    """
    model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)
    optimizer = ApexAMPOptimizer(optimizer)
    return model, optimizer
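A sketch of how this helper might be called. Since amp_config is unpacked into apex.amp.initialize, the standard apex keywords such as opt_level and loss_scale apply; the import path of convert_to_apex_amp is an assumption.

import torch
import torch.nn as nn
from colossalai.amp.apex_amp import convert_to_apex_amp  # import path assumed

model = nn.Linear(32, 10).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# forwarded verbatim to apex.amp.initialize via **amp_config
amp_config = dict(opt_level='O2', loss_scale='dynamic')
model, optimizer = convert_to_apex_amp(model, optimizer, amp_config)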
@@ -13,11 +13,24 @@ from colossalai.utils import clip_grad_norm_fp32


class ApexAMPOptimizer(ColossalaiOptimizer):
    ''' A wrapper class for the Apex optimizer. It implements the apex-specific backward and
    clip_grad_norm methods.
    '''

    def backward(self, loss: Tensor):
        """
        :param loss: loss computed by a loss function
        :type loss: torch.Tensor
        """
        with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """
        :param model: your model object
        :type model: torch.nn.Module
        :param max_norm: the max norm value for gradient clipping
        :type max_norm: float
        """
        if max_norm > 0:
            clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)
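With the wrapper above, one training step could look roughly like the sketch below (continuing from the previous sketch). It assumes the base ColossalaiOptimizer forwards zero_grad() and step() to the wrapped apex optimizer; batch shapes are illustrative.

inputs = torch.randn(8, 32).cuda()
targets = torch.randint(0, 10, (8,)).cuda()

optimizer.zero_grad()                           # assumed to be forwarded to the inner optimizer
loss = nn.functional.cross_entropy(model(inputs), targets)
optimizer.backward(loss)                        # apex_amp.scale_loss + scaled_loss.backward()
optimizer.clip_grad_norm(model, max_norm=1.0)   # clips apex master params in fp32
optimizer.step()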
@@ -8,6 +8,18 @@ from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
def convert_to_naive_amp(model: nn.Module,
                         optimizer: Optimizer,
                         amp_config):
    """A helper function to wrap training components with naive AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param amp_config: configuration for naive mode amp
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer)
    :rtype: Tuple
    """
    if is_no_pp_or_last_stage():
        model = NaiveAMPModel(model, output_to_fp32=True)
    else:
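The tail of this hunk is truncated, so it does not show how amp_config is consumed. Assuming it is forwarded to NaiveAMPOptimizer and, through it, to FP16Optimizer (documented in the next hunk), a call might look like the sketch below; the import path and config keys are assumptions.

import torch
import torch.nn as nn
from colossalai.amp.naive_amp import convert_to_naive_amp  # import path assumed

model = nn.Linear(32, 10).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
amp_config = dict(initial_scale=2 ** 16)    # assumed to reach the gradient scaler
model, optimizer = convert_to_naive_amp(model, optimizer, amp_config)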
@@ -146,26 +146,22 @@ class DynamicGradScaler:
class FP16Optimizer(Optimizer):
    """Float16 optimizer for fp16 and bf16 data types.

    Arguments:
        optimizer: base optimizer such as Adam or SGD
        clip_grad: clip gradients with this global L2 norm. Note
            that clipping is ignored if clip_grad == 0
        log_num_zeros_in_grad: return the number of zeros in the gradients.
        params_have_main_grad: flag indicating if parameters have
            a `main_grad` field. If this is set, we are assuming
            that the model parameters are stored in the `main_grad`
            field instead of the typical `grad` field. This happens
            for the DDP cases where there is a contiguous buffer
            holding the gradients. For example, for bfloat16 we want
            to do gradient accumulation and all-reduces in float32,
            and as a result we store those gradients in the main_grad.
            Note that the main grad is not necessarily in float32.
        bf16: if true, the model is running in bfloat16.
        grad_scaler: used for scaling gradients. Note that this can be
            None. This case happens when `bf16 = True` and we don't
            use any loss scale. Note that for `bf16 = True`, we can have
            a constant gradient scaler. Also, for `bf16 = False`, we
            always require a grad scaler.
    :param optimizer: base optimizer such as Adam or SGD
    :type optimizer: torch.optim.Optimizer
    :param clip_grad: clip gradients with this global L2 norm. Note that clipping is ignored if clip_grad == 0
    :type clip_grad: float
    :param log_num_zeros_in_grad: return the number of zeros in the gradients.
    :type log_num_zeros_in_grad: bool
    :param initial_scale: initial scale of the gradient scaler
    :type initial_scale: int
    :param growth_factor: the growth rate of the loss scale
    :type growth_factor: int
    :param backoff_factor: the decrease rate of the loss scale
    :type backoff_factor: float
    :param hysteresis: delay shift in dynamic loss scaling
    :type hysteresis: int
    :param max_scale: maximum loss scale allowed
    :type max_scale: int
    """

    def __init__(self,
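To make the scaler parameters above concrete, here is a toy update rule (not the library's DynamicGradScaler): the scale is multiplied by growth_factor on overflow-free steps up to max_scale, and only backs off by backoff_factor once `hysteresis` consecutive overflowing steps have been seen.

def toy_update_scale(scale, found_overflow, overflow_streak,
                     growth_factor=2.0, backoff_factor=0.5,
                     hysteresis=2, max_scale=2 ** 24):
    """Return (new_scale, new_overflow_streak) after one step. Illustrative only."""
    if found_overflow:
        overflow_streak += 1
        if overflow_streak >= hysteresis:   # delay the backoff by `hysteresis` steps
            scale = max(scale * backoff_factor, 1.0)
            overflow_streak = 0
    else:
        overflow_streak = 0
        scale = min(scale * growth_factor, max_scale)
    return scale, overflow_streak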
@@ -13,12 +13,21 @@ from ._fp16_optimizer import FP16Optimizer


class NaiveAMPOptimizer(ColossalaiOptimizer):
    """A wrapper class for an optimizer to cast all parameters to fp16

    :param optim: a normal optimizer like Adam or SGD
    :type optim: torch.optim.Optimizer
    """

    def __init__(self, optim: Optimizer, *args, **kwargs):
        optim = FP16Optimizer(optimizer=optim, *args, **kwargs)
        super().__init__(optim)

    def backward(self, loss: Tensor):
        """backward with gradient scaler

        :param loss: loss computed by a loss function
        :type loss: torch.Tensor
        """
        loss = self.optim.scale_loss(loss)
        loss.backward()
@@ -30,6 +39,9 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):


class NaiveAMPModel(nn.Module):
    """A wrapper class for model to cast the model into fp16 and
    automatically cast the input and output
    """

    def __init__(self,
                 model: nn.Module,
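Putting the two naive-AMP wrappers together, a forward/backward pass could look like the sketch below. It assumes NaiveAMPModel and NaiveAMPOptimizer are importable from colossalai.amp.naive_amp, that FP16Optimizer's remaining constructor arguments have workable defaults, and that step() is forwarded to the wrapped FP16Optimizer.

import torch
import torch.nn as nn
from colossalai.amp.naive_amp import NaiveAMPModel, NaiveAMPOptimizer  # import path assumed

model = NaiveAMPModel(nn.Linear(32, 10).cuda(), output_to_fp32=True)
optim = NaiveAMPOptimizer(torch.optim.Adam(model.parameters(), lr=1e-3))

out = model(torch.randn(8, 32).cuda())      # fp16 forward, output cast back to fp32
loss = out.mean()
optim.backward(loss)                        # scale_loss + backward, as shown above
optim.step()                                # assumed to be forwarded to FP16Optimizer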
@@ -9,6 +9,20 @@ def convert_to_torch_amp(model: nn.Module,
                         optimizer: Optimizer,
                         criterion: _Loss,
                         amp_config: Config):
    """A helper function to wrap training components with Torch AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param criterion: your loss function object
    :type criterion: :class:`torch.nn.modules.loss._Loss`
    :param amp_config: configuration for different amp modes
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer, criterion)
    :rtype: Tuple
    """
    model = TorchAMPModel(model)
    optimizer = TorchAMPOptimizer(optimizer, **amp_config)
    criterion = TorchAMPLoss(criterion)
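A sketch of calling this torch-AMP dispatcher. Since amp_config is unpacked into TorchAMPOptimizer and, per the next hunk, into torch.cuda.amp.GradScaler, the standard GradScaler keywords apply; the import path is an assumption, and the return statement is truncated in this hunk, so the tuple return follows the docstring.

import torch
import torch.nn as nn
from colossalai.amp.torch_amp import convert_to_torch_amp  # import path assumed

model = nn.Linear(32, 10).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
amp_config = dict(init_scale=2 ** 16, growth_interval=2000)   # GradScaler kwargs
model, optimizer, criterion = convert_to_torch_amp(model, optimizer, criterion, amp_config)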
@@ -14,25 +14,45 @@ from colossalai.utils import clip_grad_norm_fp32


class TorchAMPOptimizer(ColossalaiOptimizer):
    """A wrapper class which integrates PyTorch AMP with an optimizer

    :param optim: a normal optimizer like Adam or SGD
    :type optim: torch.optim.Optimizer
    """

    def __init__(self, optim: Optimizer, *args, **kwargs):
        super().__init__(optim)
        self.scaler = GradScaler(*args, **kwargs)

    def backward(self, loss: Tensor):
        """backward with torch amp gradient scaler

        :param loss: loss computed by a loss function
        :type loss: torch.Tensor
        """
        self.scaler.scale(loss).backward()

    def step(self):
        """update the parameters of the model
        """
        self.scaler.step(self.optim)
        self.scaler.update()

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """apply gradient clipping to the model parameters

        :param model: your model object
        :type model: torch.nn.Module
        :param max_norm: max norm value for gradient clipping
        :type max_norm: float
        """
        if max_norm > 0.0:
            self.scaler.unscale_(self.optim)
            clip_grad_norm_fp32(model.parameters(), max_norm)


class TorchAMPModel(nn.Module):
    """A wrapper class for a model object which executes forward with values automatically
    cast to fp16
    """

    def __init__(self, model: nn.Module) -> None:
        super().__init__()
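Continuing from the previous sketch, a single training step with these wrappers would follow the order backward, then clip_grad_norm, then step, so that gradients are unscaled exactly once before the scaler steps the optimizer; zero_grad() is assumed to be forwarded by the base ColossalaiOptimizer.

inputs = torch.randn(8, 32).cuda()
targets = torch.randint(0, 10, (8,)).cuda()

optimizer.zero_grad()                           # assumed to be forwarded to the inner optimizer
output = model(inputs)                          # TorchAMPModel forward (autocast to fp16)
loss = criterion(output, targets)               # computed through TorchAMPLoss
optimizer.backward(loss)                        # scaler.scale(loss).backward()
optimizer.clip_grad_norm(model, max_norm=1.0)   # unscale_ then clip in fp32
optimizer.step()                                # scaler.step(optim) + scaler.update()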
@@ -44,7 +64,10 @@ class TorchAMPModel(nn.Module):


class TorchAMPLoss(nn.Module):
    """A wrapper class for a criterion object which computes the loss in a mixed-precision context

    :param loss: a loss function object
    :type loss: torch.nn.modules.loss._Loss
    """
    def __init__(self, loss: _Loss):
        super().__init__()
        self.loss = loss