update examples and sphnix docs for the new api (#63)
@@ -16,6 +16,22 @@ def convert_to_amp(model: nn.Module,
                   criterion: _Loss,
                   mode: AMP_TYPE,
                   amp_config: Config = None):
    """A helper function to wrap training components with AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param criterion: your loss function object
    :type criterion: :class:`torch.nn.modules.loss._Loss`
    :param mode: amp mode
    :type mode: :class:`colossalai.amp.AMP_TYPE`
    :param amp_config: configuration for different amp modes
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer, criterion)
    :rtype: Tuple
    """
    assert isinstance(mode, AMP_TYPE), \
        f'expected the argument mode to be AMP_TYPE, but got {type(mode)}'
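For context, a minimal usage sketch of the dispatcher above. It assumes convert_to_amp and AMP_TYPE are importable from colossalai.amp (as the docstring cross-references suggest), that AMP_TYPE.TORCH is a valid enum member, and that a CUDA device is available.

import torch
import torch.nn as nn
from colossalai.amp import convert_to_amp, AMP_TYPE  # import path assumed

model = nn.Linear(32, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# mode must be an AMP_TYPE member, otherwise the assertion above fires
model, optimizer, criterion = convert_to_amp(model, optimizer, criterion,
                                             mode=AMP_TYPE.TORCH,
                                             amp_config=None)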
@@ -7,6 +7,18 @@ import apex.amp as apex_amp
def convert_to_apex_amp(model: nn.Module,
                        optimizer: Optimizer,
                        amp_config):
    """A helper function to wrap training components with Apex AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param amp_config: configuration for nvidia apex
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer)
    :rtype: Tuple
    """
    model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)
    optimizer = ApexAMPOptimizer(optimizer)
    return model, optimizer
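A sketch of how this helper might be called. Since amp_config is unpacked into apex.amp.initialize, the standard apex keywords such as opt_level and loss_scale apply; the import path of convert_to_apex_amp is an assumption.

import torch
import torch.nn as nn
from colossalai.amp.apex_amp import convert_to_apex_amp  # import path assumed

model = nn.Linear(32, 10).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# forwarded verbatim to apex.amp.initialize via **amp_config
amp_config = dict(opt_level='O2', loss_scale='dynamic')
model, optimizer = convert_to_apex_amp(model, optimizer, amp_config)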
@@ -13,11 +13,24 @@ from colossalai.utils import clip_grad_norm_fp32


class ApexAMPOptimizer(ColossalaiOptimizer):
    ''' A wrapper class for the Apex optimizer. It implements the apex-specific backward and
    clip_grad_norm methods.
    '''

    def backward(self, loss: Tensor):
        """
        :param loss: loss computed by a loss function
        :type loss: torch.Tensor
        """
        with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """
        :param model: your model object
        :type model: torch.nn.Module
        :param max_norm: the max norm value for gradient clipping
        :type max_norm: float
        """
        if max_norm > 0:
            clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)
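With the wrapper above, one training step could look roughly like the sketch below (continuing from the previous sketch). It assumes the base ColossalaiOptimizer forwards zero_grad() and step() to the wrapped apex optimizer; batch shapes are illustrative.

inputs = torch.randn(8, 32).cuda()
targets = torch.randint(0, 10, (8,)).cuda()

optimizer.zero_grad()                           # assumed to be forwarded to the inner optimizer
loss = nn.functional.cross_entropy(model(inputs), targets)
optimizer.backward(loss)                        # apex_amp.scale_loss + scaled_loss.backward()
optimizer.clip_grad_norm(model, max_norm=1.0)   # clips apex master params in fp32
optimizer.step()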
@@ -8,6 +8,18 @@ from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
def convert_to_naive_amp(model: nn.Module,
                         optimizer: Optimizer,
                         amp_config):
    """A helper function to wrap training components with naive AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param amp_config: configuration for naive mode amp
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer)
    :rtype: Tuple
    """
    if is_no_pp_or_last_stage():
        model = NaiveAMPModel(model, output_to_fp32=True)
    else:
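The tail of this hunk is truncated, so it does not show how amp_config is consumed. Assuming it is forwarded to NaiveAMPOptimizer and, through it, to FP16Optimizer (documented in the next hunk), a call might look like the sketch below; the import path and config keys are assumptions.

import torch
import torch.nn as nn
from colossalai.amp.naive_amp import convert_to_naive_amp  # import path assumed

model = nn.Linear(32, 10).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
amp_config = dict(initial_scale=2 ** 16)    # assumed to reach the gradient scaler
model, optimizer = convert_to_naive_amp(model, optimizer, amp_config)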
@@ -146,26 +146,22 @@ class DynamicGradScaler:
class FP16Optimizer(Optimizer):
    """Float16 optimizer for fp16 and bf16 data types.

    Arguments:
        optimizer: base optimizer such as Adam or SGD
        clip_grad: clip gradients with this global L2 norm. Note
            that clipping is ignored if clip_grad == 0
        log_num_zeros_in_grad: return the number of zeros in the gradients.
        params_have_main_grad: flag indicating if parameters have
            a `main_grad` field. If this is set, we are assuming
            that the model parameters are stored in the `main_grad`
            field instead of the typical `grad` field. This happens
            for the DDP cases where there is a contiguous buffer
            holding the gradients. For example, for bfloat16 we want
            to do gradient accumulation and all-reduces in float32,
            and as a result we store those gradients in the main_grad.
            Note that the main grad is not necessarily in float32.
        bf16: if true, the model is running in bfloat16.
        grad_scaler: used for scaling gradients. Note that this can be
            None. This case happens when `bf16 = True` and we don't
            use any loss scale. Note that for `bf16 = True`, we can have
            a constant gradient scaler. Also, for `bf16 = False`, we
            always require a grad scaler.
    :param optimizer: base optimizer such as Adam or SGD
    :type optimizer: torch.optim.Optimizer
    :param clip_grad: clip gradients with this global L2 norm. Note that clipping is ignored if clip_grad == 0
    :type clip_grad: float
    :param log_num_zeros_in_grad: return the number of zeros in the gradients.
    :type log_num_zeros_in_grad: bool
    :param initial_scale: initial scale of the gradient scaler
    :type initial_scale: int
    :param growth_factor: the growth rate of the loss scale
    :type growth_factor: int
    :param backoff_factor: the decrease rate of the loss scale
    :type backoff_factor: float
    :param hysteresis: delay shift in dynamic loss scaling
    :type hysteresis: int
    :param max_scale: maximum loss scale allowed
    :type max_scale: int
    """

    def __init__(self,
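To make the scaler parameters above concrete, here is a toy update rule (not the library's DynamicGradScaler): the scale is multiplied by growth_factor on overflow-free steps up to max_scale, and only backs off by backoff_factor once `hysteresis` consecutive overflowing steps have been seen.

def toy_update_scale(scale, found_overflow, overflow_streak,
                     growth_factor=2.0, backoff_factor=0.5,
                     hysteresis=2, max_scale=2 ** 24):
    """Return (new_scale, new_overflow_streak) after one step. Illustrative only."""
    if found_overflow:
        overflow_streak += 1
        if overflow_streak >= hysteresis:   # delay the backoff by `hysteresis` steps
            scale = max(scale * backoff_factor, 1.0)
            overflow_streak = 0
    else:
        overflow_streak = 0
        scale = min(scale * growth_factor, max_scale)
    return scale, overflow_streak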
@@ -13,12 +13,21 @@ from ._fp16_optimizer import FP16Optimizer


class NaiveAMPOptimizer(ColossalaiOptimizer):
    """A wrapper class for an optimizer to cast all parameters to fp16

    :param optim: a normal optimizer like Adam or SGD
    :type optim: torch.optim.Optimizer
    """

    def __init__(self, optim: Optimizer, *args, **kwargs):
        optim = FP16Optimizer(optimizer=optim, *args, **kwargs)
        super().__init__(optim)

    def backward(self, loss: Tensor):
        """backward with gradient scaler

        :param loss: loss computed by a loss function
        :type loss: torch.Tensor
        """
        loss = self.optim.scale_loss(loss)
        loss.backward()
@@ -30,6 +39,9 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):


class NaiveAMPModel(nn.Module):
    """A wrapper class for model to cast the model into fp16 and
    automatically cast the input and output
    """

    def __init__(self,
                 model: nn.Module,
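Putting the two naive-AMP wrappers together, a forward/backward pass could look like the sketch below. It assumes NaiveAMPModel and NaiveAMPOptimizer are importable from colossalai.amp.naive_amp, that FP16Optimizer's remaining constructor arguments have workable defaults, and that step() is forwarded to the wrapped FP16Optimizer.

import torch
import torch.nn as nn
from colossalai.amp.naive_amp import NaiveAMPModel, NaiveAMPOptimizer  # import path assumed

model = NaiveAMPModel(nn.Linear(32, 10).cuda(), output_to_fp32=True)
optim = NaiveAMPOptimizer(torch.optim.Adam(model.parameters(), lr=1e-3))

out = model(torch.randn(8, 32).cuda())      # fp16 forward, output cast back to fp32
loss = out.mean()
optim.backward(loss)                        # scale_loss + backward, as shown above
optim.step()                                # assumed to be forwarded to FP16Optimizer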
@@ -9,6 +9,20 @@ def convert_to_torch_amp(model: nn.Module,
                         optimizer: Optimizer,
                         criterion: _Loss,
                         amp_config: Config):
    """A helper function to wrap training components with Torch AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param criterion: your loss function object
    :type criterion: :class:`torch.nn.modules.loss._Loss`
    :param amp_config: configuration for different amp modes
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer, criterion)
    :rtype: Tuple
    """
    model = TorchAMPModel(model)
    optimizer = TorchAMPOptimizer(optimizer, **amp_config)
    criterion = TorchAMPLoss(criterion)
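A sketch of calling this torch-AMP dispatcher. Since amp_config is unpacked into TorchAMPOptimizer and, per the next hunk, into torch.cuda.amp.GradScaler, the standard GradScaler keywords apply; the import path is an assumption, and the return statement is truncated in this hunk, so the tuple return follows the docstring.

import torch
import torch.nn as nn
from colossalai.amp.torch_amp import convert_to_torch_amp  # import path assumed

model = nn.Linear(32, 10).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
amp_config = dict(init_scale=2 ** 16, growth_interval=2000)   # GradScaler kwargs
model, optimizer, criterion = convert_to_torch_amp(model, optimizer, criterion, amp_config)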
@@ -14,25 +14,45 @@ from colossalai.utils import clip_grad_norm_fp32


class TorchAMPOptimizer(ColossalaiOptimizer):
    """A wrapper class which integrates PyTorch AMP with an optimizer

    :param optim: a normal optimizer like Adam or SGD
    :type optim: torch.optim.Optimizer
    """

    def __init__(self, optim: Optimizer, *args, **kwargs):
        super().__init__(optim)
        self.scaler = GradScaler(*args, **kwargs)

    def backward(self, loss: Tensor):
        """backward with torch amp gradient scaler

        :param loss: loss computed by a loss function
        :type loss: torch.Tensor
        """
        self.scaler.scale(loss).backward()

    def step(self):
        """update the parameters of the model
        """
        self.scaler.step(self.optim)
        self.scaler.update()

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """apply gradient clipping to the model parameters

        :param model: your model object
        :type model: torch.nn.Module
        :param max_norm: max norm value for gradient clipping
        :type max_norm: float
        """
        if max_norm > 0.0:
            self.scaler.unscale_(self.optim)
            clip_grad_norm_fp32(model.parameters(), max_norm)


class TorchAMPModel(nn.Module):
    """A wrapper class for a model object which executes forward with values automatically
    cast to fp16
    """

    def __init__(self, model: nn.Module) -> None:
        super().__init__()
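Continuing from the previous sketch, a single training step with these wrappers would follow the order backward, then clip_grad_norm, then step, so that gradients are unscaled exactly once before the scaler steps the optimizer; zero_grad() is assumed to be forwarded by the base ColossalaiOptimizer.

inputs = torch.randn(8, 32).cuda()
targets = torch.randint(0, 10, (8,)).cuda()

optimizer.zero_grad()                           # assumed to be forwarded to the inner optimizer
output = model(inputs)                          # TorchAMPModel forward (autocast to fp16)
loss = criterion(output, targets)               # computed through TorchAMPLoss
optimizer.backward(loss)                        # scaler.scale(loss).backward()
optimizer.clip_grad_norm(model, max_norm=1.0)   # unscale_ then clip in fp32
optimizer.step()                                # scaler.step(optim) + scaler.update()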
@@ -44,7 +64,10 @@ class TorchAMPModel(nn.Module):


class TorchAMPLoss(nn.Module):
    """A wrapper class for a criterion object which computes the loss in a mixed-precision context

    :param loss: a loss function object
    :type loss: torch.nn.modules.loss._Loss
    """
    def __init__(self, loss: _Loss):
        super().__init__()
        self.loss = loss