update examples and sphinx docs for the new api (#63)

Frank Lee
2021-12-13 22:07:01 +08:00
committed by GitHub
parent 7d3711058f
commit 35813ed3c4
124 changed files with 1251 additions and 1462 deletions


@@ -16,6 +16,22 @@ def convert_to_amp(model: nn.Module,
criterion: _Loss,
mode: AMP_TYPE,
amp_config: Config = None):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param criterion: your loss function object
:type criterion: :class:`torch.nn.modules.loss._Loss`
:param mode: amp mode
:type mode: :class:`colossalai.amp.AMP_TYPE`
:param amp_config: configuration for different amp modes
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer, criterion)
:rtype: Tuple
"""
assert isinstance(mode, AMP_TYPE), \
f'expected the argument mode to be AMP_TYPE, but got {type(mode)}'
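
A minimal usage sketch of convert_to_amp, assuming both the function and the AMP_TYPE enum are importable from colossalai.amp and using a toy model; amp_config is optional, as the signature above shows.

import torch
import torch.nn as nn
from colossalai.amp import AMP_TYPE, convert_to_amp  # import path assumed

# toy training components
model = nn.Linear(32, 8).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# wrap all three components for the chosen AMP mode; amp_config defaults to None
model, optimizer, criterion = convert_to_amp(model, optimizer, criterion, AMP_TYPE.TORCH)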


@@ -7,6 +7,18 @@ import apex.amp as apex_amp
def convert_to_apex_amp(model: nn.Module,
optimizer: Optimizer,
amp_config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param amp_config: configuration for NVIDIA Apex
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer)
:rtype: Tuple
"""
model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)
optimizer = ApexAMPOptimizer(optimizer)
return model, optimizer
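
A hedged sketch of convert_to_apex_amp; since the config is unpacked into apex_amp.initialize, any keyword that apex.amp.initialize accepts (opt_level, loss_scale, ...) is a valid entry. The import path is assumed.

from colossalai.amp.apex_amp import convert_to_apex_amp  # import path assumed

# keys are passed straight through to apex.amp.initialize(**amp_config)
amp_config = dict(opt_level='O2', loss_scale='dynamic')
model, optimizer = convert_to_apex_amp(model, optimizer, amp_config)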


@@ -13,11 +13,24 @@ from colossalai.utils import clip_grad_norm_fp32
class ApexAMPOptimizer(ColossalaiOptimizer):
''' A wrapper class for an Apex optimizer that implements Apex-specific backward and clip_grad_norm
methods
'''
def backward(self, loss: Tensor):
"""
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
scaled_loss.backward()
def clip_grad_norm(self, model: nn.Module, max_norm: float):
"""
:param model: your model object
:type model: torch.nn.Module
:param max_norm: the max norm value for gradient clipping
:type max_norm: float
"""
if max_norm > 0:
clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)
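
A minimal training-step sketch with the Apex-wrapped optimizer returned above; data, labels and the 1.0 clipping norm are illustrative placeholders.

# one iteration with the Apex AMP wrapper
optimizer.zero_grad()
outputs = model(data)
loss = criterion(outputs, labels)
optimizer.backward(loss)                       # scales the loss via apex_amp.scale_loss
optimizer.clip_grad_norm(model, max_norm=1.0)  # no-op when max_norm == 0
optimizer.step()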


@@ -8,6 +8,18 @@ from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
def convert_to_naive_amp(model: nn.Module,
optimizer: Optimizer,
amp_config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param amp_config: configuration for naive mode amp
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer)
:rtype: Tuple
"""
if is_no_pp_or_last_stage():
model = NaiveAMPModel(model, output_to_fp32=True)
else:
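
A hedged sketch of the naive-mode entry point; the amp_config contents shown here are illustrative and assumed to be forwarded to the wrapped fp16 optimizer. The import path is assumed.

from colossalai.amp.naive_amp import convert_to_naive_amp  # import path assumed

# assumed: the config keys reach the internal fp16 optimizer
amp_config = dict(initial_scale=2 ** 15)
model, optimizer = convert_to_naive_amp(model, optimizer, amp_config)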


@@ -146,26 +146,22 @@ class DynamicGradScaler:
class FP16Optimizer(Optimizer):
"""Float16 optimizer for fp16 and bf16 data types.
Arguments:
optimizer: base optimizer such as Adam or SGD
clip_grad: clip gradients with this global L2 norm. Note
that clipping is ignored if clip_grad == 0
log_num_zeros_in_grad: return number of zeros in the gradients.
params_have_main_grad: flag indicating if parameters have
a `main_grad` field. If this is set, we are assuming
that the model parameters are stored in the `main_grad`
field instead of the typical `grad` field. This happens
for the DDP cases where there is a contiguous buffer
holding the gradients. For example for bfloat16, we want
to do gradient accumulation and all-reduces in float32
and as a result we store those gradients in the main_grad.
Note that main grad is not necessarily in float32.
bf16: if true, the model is running in bfloat16.
grad_scaler: used for scaling gradients. Note that this can be
None. This case happens when `bf16 = True` and we don't
use any loss scale. Note that for `bf16 = True`, we can have
a constant gradient scaler. Also for `bf16 = False`, we
always require a grad scaler.
:param optimizer: base optimizer such as Adam or SGD
:type optimizer: torch.optim.Optimizer
:param clip_grad: clip gradients with this global L2 norm. Note that clipping is ignored if clip_grad == 0
:type clip_grad: float
:param log_num_zeros_in_grad: return number of zeros in the gradients.
:type log_num_zeros_in_grad: bool
:param initial_scale: initial scale of gradient scaler
:type initial_scale: int
:param growth_factor: the growth rate of loss scale
:type growth_factor: int
:param backoff_factor: the decrease rate of loss scale
:type backoff_factor: float
:param hysterisis: delay shift in dynamic loss scaling
:type hysterisis: int
:param max_scale: maximum loss scale allowed
:type max_scale: int
"""
def __init__(self,


@@ -13,12 +13,21 @@ from ._fp16_optimizer import FP16Optimizer
class NaiveAMPOptimizer(ColossalaiOptimizer):
"""A wrapper class for optimizer to cast all parameters to fp16
:param optim: a normal optimizer like Adam or SGD
:type optim: torch.optim.Optimizer
"""
def __init__(self, optim: Optimizer, *args, **kwargs):
optim = FP16Optimizer(optimizer=optim, *args, **kwargs)
super().__init__(optim)
def backward(self, loss: Tensor):
"""backward with gradient scaler
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
loss = self.optim.scale_loss(loss)
loss.backward()
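
A short sketch of the backward path through NaiveAMPOptimizer, assuming model and optimizer were wrapped by convert_to_naive_amp above; data and labels are placeholders.

# forward in fp16; loss scaling is handled by the wrapper
outputs = model(data)
loss = criterion(outputs, labels)
optimizer.backward(loss)   # scales the loss, then calls loss.backward()
optimizer.step()
optimizer.zero_grad()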
@@ -30,6 +39,9 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):
class NaiveAMPModel(nn.Module):
"""A wrapper class for model to cast the model into fp16 and
automatically cast the input and output
"""
def __init__(self,
model: nn.Module,


@@ -9,6 +9,20 @@ def convert_to_torch_amp(model: nn.Module,
optimizer: Optimizer,
criterion: _Loss,
amp_config: Config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param criterion: your loss function object
:type criterion: :class:`torch.nn.modules.loss._Loss`
:param amp_config: configuration for Torch AMP
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer, criterion)
:rtype: Tuple
"""
model = TorchAMPModel(model)
optimizer = TorchAMPOptimizer(optimizer, **amp_config)
criterion = TorchAMPLoss(criterion)
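
A hedged usage sketch of convert_to_torch_amp; amp_config is forwarded to torch.cuda.amp.GradScaler through TorchAMPOptimizer, so GradScaler keywords such as init_scale are valid keys. The import path is assumed.

from colossalai.amp.torch_amp import convert_to_torch_amp  # import path assumed

# the config keys become GradScaler(**amp_config) arguments
amp_config = dict(init_scale=2.0 ** 16, growth_interval=2000)
model, optimizer, criterion = convert_to_torch_amp(model, optimizer, criterion, amp_config)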


@@ -14,25 +14,45 @@ from colossalai.utils import clip_grad_norm_fp32
class TorchAMPOptimizer(ColossalaiOptimizer):
"""A wrapper class which integrate pytorch amp with an optimizer
:param optim: a normal optimizer like Adam or SGD
:type optim: torch.optim.Optimizer
"""
def __init__(self, optim: Optimizer, *args, **kwargs):
super().__init__(optim)
self.scaler = GradScaler(*args, **kwargs)
def backward(self, loss: Tensor):
"""backward with torch amp gradient scaler
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
self.scaler.scale(loss).backward()
def step(self):
"""update the parameters of the model
"""
self.scaler.step(self.optim)
self.scaler.update()
def clip_grad_norm(self, model: nn.Module, max_norm: float):
"""apply gradient clipping to the model parameters
:param model: your model object
:type model: torch.nn.Module
:param max_norm: max norm value for gradient clipping
:type max_norm: float
"""
if max_norm > 0.0:
self.scaler.unscale_(self.optim)
clip_grad_norm_fp32(model.parameters(), max_norm)
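
A minimal training-step sketch with the Torch AMP wrappers; data, labels and the 1.0 max norm are illustrative placeholders.

# one iteration: scaled backward, optional clipping, scaler-aware step
optimizer.zero_grad()
outputs = model(data)                          # forward through TorchAMPModel
loss = criterion(outputs, labels)              # loss through TorchAMPLoss
optimizer.backward(loss)                       # scaler.scale(loss).backward()
optimizer.clip_grad_norm(model, max_norm=1.0)  # unscales gradients, then clips
optimizer.step()                               # scaler.step followed by scaler.update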
class TorchAMPModel(nn.Module):
"""A wrapper class for a model object which executes forward with values automatically
cast to fp16
"""
def __init__(self, model: nn.Module) -> None:
super().__init__()
@@ -44,7 +64,10 @@ class TorchAMPModel(nn.Module):
class TorchAMPLoss(nn.Module):
"""A wrapper class for a criterion object which computes the loss in mixed-precision context
:param loss: a loss function object
:type loss: torch.nn.modules.loss._Loss
"""
def __init__(self, loss: _Loss):
super().__init__()
self.loss = loss
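
A small sketch of wrapping a criterion directly with TorchAMPLoss; outputs and labels are placeholders, and the wrapped criterion is assumed to be called exactly like the original loss function.

import torch.nn as nn

criterion = TorchAMPLoss(nn.CrossEntropyLoss())
loss = criterion(outputs, labels)  # computed in the mixed-precision context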