[misc] add verbose arg for zero and op builder (#3552)

* [misc] add print verbose

* [gemini] add print verbose

* [zero] add print verbose for low level

* [misc] add print verbose for op builder
Author: Hongxin Liu
Date: 2023-04-17 11:25:35 +08:00 (committed by GitHub)
Parent: 4341f5e8e6
Commit: 173dad0562
8 changed files with 55 additions and 28 deletions

@@ -54,6 +54,7 @@ class ZeroOptimizer(ColossalaiOptimizer):
         clipping_norm (float, optional): The norm value used to clip gradient. Defaults to 0.0.
         norm_type (float, optional): The type of norm used for gradient clipping. Currently, only L2-norm (norm_type=2.0)
             is supported in ZeroOptimizer. Defaults to 2.0.
+        verbose (bool, optional): Whether to print verbose information, including grad overflow info. Defaults to False.
     """
 
     def __init__(self,
@@ -69,6 +70,7 @@ class ZeroOptimizer(ColossalaiOptimizer):
                  max_scale: float = 2**32,
                  clipping_norm: float = 0.0,
                  norm_type: float = 2.0,
+                 verbose: bool = False,
                  **defaults: Any):
         super().__init__(optim)
         assert isinstance(module, ZeroDDP)
@@ -83,6 +85,7 @@ class ZeroOptimizer(ColossalaiOptimizer):
         self.chunk16_set: Set[Chunk] = set()
         self.clipping_flag = clipping_norm > 0.0
         self.max_norm = clipping_norm
+        self.verbose = verbose
 
         if self.clipping_flag:
             assert norm_type == 2.0, "ZeroOptimizer only supports L2 norm now"
@@ -221,7 +224,8 @@ class ZeroOptimizer(ColossalaiOptimizer):
         if found_inf:
             self.optim_state = OptimState.UNSCALED    # no need to unscale grad
             self.grad_scaler.update(found_inf)    # update gradient scaler
-            self._logger.info(f'Found overflow. Skip step')
+            if self.verbose:
+                self._logger.info(f'Found overflow. Skip step')
             self._clear_global_norm()    # clear recorded norm
             self.zero_grad()    # reset all gradients
             self._update_fp16_params()
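
For reference, a minimal usage sketch (not part of this diff) of where the new `verbose` flag lands. The model/optimizer wiring and import paths below are assumptions based on typical ColossalAI Gemini/ZeRO usage around this version and may differ between releases.

```python
# Sketch only: assumed import paths and setup, not code from this commit.
from colossalai.nn.optimizer import HybridAdam        # assumed import path
from colossalai.zero import ZeroDDP, ZeroOptimizer    # assumed import path

model: ZeroDDP = ...   # a model already wrapped by ZeroDDP (setup elided)
optim = HybridAdam(model.parameters(), lr=1e-3)

# With the new default verbose=False, the "Found overflow. Skip step" message is
# suppressed; pass verbose=True to keep the previous logging behaviour.
zero_optim = ZeroOptimizer(optim, model, initial_scale=2**16, verbose=True)
```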