Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2026-05-07 04:28:58 +00:00
[misc] add verbose arg for zero and op builder (#3552)
* [misc] add print verbose
* [gemini] add print verbose
* [zero] add print verbose for low level
* [misc] add print verbose for op builder
@@ -20,6 +20,7 @@ def safe_div(a, b):
 def init_chunk_manager(model: nn.Module,
                        init_device: Optional[torch.device] = None,
                        hidden_dim: Optional[int] = None,
+                       verbose: bool = False,
                        **kwargs) -> ChunkManager:
     if hidden_dim:
         search_interval_byte = hidden_dim
@@ -39,7 +40,7 @@ def init_chunk_manager(model: nn.Module,
     total_size /= mb_size
     wasted_size /= mb_size

-    if dist.get_rank() == 0:
+    if verbose and dist.get_rank() == 0:
         print("searching chunk configuration is completed in {:.2f} s.\n".format(span_s),
               "used number: {:.2f} MB, wasted number: {:.2f} MB\n".format(total_size, wasted_size),
               "total wasted percentage is {:.2f}%".format(100 * safe_div(wasted_size, total_size + wasted_size)),
@@ -567,7 +567,8 @@ class GeminiDDP(ZeroDDP):
                  search_range_mb: int = 32,
                  hidden_dim: Optional[int] = None,
                  min_chunk_size_mb: float = 32,
-                 memstats: Optional[MemStats] = None) -> None:
+                 memstats: Optional[MemStats] = None,
+                 verbose: bool = False) -> None:
         """
         A torch.Module warpper using ZeRO-DP and Genimi.
         ZeRO is for parallel. Gemini is for memory management.
@@ -604,6 +605,7 @@ class GeminiDDP(ZeroDDP):
                                            hidden_dim=hidden_dim,
                                            search_range_mb=search_range_mb,
                                            min_chunk_size_mb=min_chunk_size_mb,
-                                           strict_ddp_flag=strict_ddp_mode)
+                                           strict_ddp_flag=strict_ddp_mode,
+                                           verbose=verbose)
         gemini_manager = GeminiManager(placement_policy, chunk_manager, memstats)
         super().__init__(module, gemini_manager, pin_memory, force_outputs_fp32, strict_ddp_mode)
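
The user-facing effect of the new constructor argument, as a sketch rather than a definitive recipe: every argument except verbose (the device value, placement_policy='auto', and the colossalai.zero.gemini import path, which the wrapper hunk below relies on) is an assumption about this version's API, and a launched CUDA/distributed environment is required.

import torch
import torch.nn as nn

from colossalai.zero.gemini import GeminiDDP   # assumed re-export; see `from .gemini import GeminiDDP` below

model = nn.Sequential(nn.Linear(512, 512), nn.GELU(), nn.Linear(512, 512))
# verbose=True is forwarded to init_chunk_manager, so rank 0 still prints the
# chunk-search summary; the new default (False) keeps construction silent.
model = GeminiDDP(model, device=torch.device('cuda'), placement_policy='auto', verbose=True)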
@@ -54,6 +54,7 @@ class ZeroOptimizer(ColossalaiOptimizer):
         clipping_norm (float, optional): The norm value used to clip gradient. Defaults to 0.0.
         norm_type (float, optional): The type of norm used for gradient clipping. Currently, only L2-norm (norm_type=2.0)
             is supported in ZeroOptimizer. Defaults to 2.0.
+        verbose (bool, optional): Whether to print verbose information, including grad overflow info. Defaults to False.
     """

     def __init__(self,
@@ -69,6 +70,7 @@ class ZeroOptimizer(ColossalaiOptimizer):
                  max_scale: float = 2**32,
                  clipping_norm: float = 0.0,
                  norm_type: float = 2.0,
+                 verbose: bool = False,
                  **defaults: Any):
         super().__init__(optim)
         assert isinstance(module, ZeroDDP)
@@ -83,6 +85,7 @@ class ZeroOptimizer(ColossalaiOptimizer):
         self.chunk16_set: Set[Chunk] = set()
         self.clipping_flag = clipping_norm > 0.0
         self.max_norm = clipping_norm
+        self.verbose = verbose

         if self.clipping_flag:
             assert norm_type == 2.0, "ZeroOptimizer only supports L2 norm now"
@@ -221,7 +224,8 @@ class ZeroOptimizer(ColossalaiOptimizer):
         if found_inf:
             self.optim_state = OptimState.UNSCALED    # no need to unscale grad
             self.grad_scaler.update(found_inf)    # update gradient scaler
-            self._logger.info(f'Found overflow. Skip step')
+            if self.verbose:
+                self._logger.info(f'Found overflow. Skip step')
             self._clear_global_norm()    # clear recorded norm
             self.zero_grad()    # reset all gradients
             self._update_fp16_params()
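
A sketch of opting back into the overflow log on the Gemini optimizer. The HybridAdam pairing and the GeminiDDP construction arguments are assumptions; the ZeroOptimizer import path is the one used in the wrapper hunk below.

import torch
import torch.nn as nn

from colossalai.nn.optimizer import HybridAdam                      # assumed optimizer choice
from colossalai.zero.gemini import GeminiDDP                        # assumed import path
from colossalai.zero.gemini.gemini_optimizer import ZeroOptimizer   # path taken from the wrapper hunk below

model = GeminiDDP(nn.Linear(512, 512), device=torch.device('cuda'))
optim = HybridAdam(model.parameters(), lr=1e-3)
# verbose=True re-enables the "Found overflow. Skip step" message that this
# commit hides by default; the step is skipped either way.
optim = ZeroOptimizer(optim, model, verbose=True)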
@@ -440,6 +440,8 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
         # update loss scale if overflow occurs
         if found_inf:
             self._grad_store.reset_all_average_gradients()
+            if self._verbose:
+                self._logger.info(f'Found overflow. Skip step')
             self.zero_grad()
             return
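
The same flag on the stage-1/2 optimizer, again only a sketch: the import path and the partition_grad key come from the wrapper hunk below, while the rest of the setup (plain Adam, a CUDA module, an initialized process group) is assumed.

import torch.nn as nn
from torch.optim import Adam

from colossalai.zero.low_level import LowLevelZeroOptimizer   # path taken from the hunk below

model = nn.Linear(256, 256).cuda()
optim = Adam(model.parameters(), lr=1e-3)
# partition_grad=True corresponds to ZeRO stage 2 (see config_dict['partition_grad'] below);
# verbose=True keeps the "Found overflow. Skip step" log when the scaler finds inf/nan.
optim = LowLevelZeroOptimizer(optim, partition_grad=True, verbose=True)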
@@ -7,7 +7,10 @@ import torch.nn as nn
 from .gemini import GeminiDDP


-def zero_model_wrapper(model: nn.Module, zero_stage: int = 1, gemini_config: Optional[Dict] = None):
+def zero_model_wrapper(model: nn.Module,
+                       zero_stage: int = 1,
+                       gemini_config: Optional[Dict] = None,
+                       verbose: bool = False):
     """This wrapper function is used to wrap your training model for ZeRO DDP.

     Example:
@@ -40,7 +43,7 @@ def zero_model_wrapper(model: nn.Module, zero_stage: int = 1, gemini_config: Opt
     if zero_stage in [1, 2]:
         wrapped_model = model
     else:
-        wrapped_model = GeminiDDP(model, **gemini_config)
+        wrapped_model = GeminiDDP(model, **gemini_config, verbose=verbose)

     setattr(wrapped_model, "_colo_zero_stage", zero_stage)

@@ -58,7 +61,8 @@ def zero_optim_wrapper(model: nn.Module,
                        max_scale: float = 2**32,
                        max_norm: float = 0.0,
                        norm_type: float = 2.0,
-                       optim_config: Optional[Dict] = None):
+                       optim_config: Optional[Dict] = None,
+                       verbose: bool = False):
     """This wrapper function is used to wrap your training optimizer for ZeRO DDP.

     Args:
@@ -79,6 +83,7 @@ def zero_optim_wrapper(model: nn.Module,

         >>> zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
         >>> optim = zero_optim_wrapper(model, optim, optim_config=zero2_config)
+        verbose (bool, optional): Whether to print the verbose info.
     """
     assert hasattr(model, "_colo_zero_stage"), "You should use `zero_ddp_wrapper` first"
     zero_stage = getattr(model, "_colo_zero_stage")
@@ -102,8 +107,8 @@ def zero_optim_wrapper(model: nn.Module,
         from colossalai.zero.low_level import LowLevelZeroOptimizer
         config_dict['partition_grad'] = zero_stage == 2
         config_dict['clip_grad_norm'] = max_norm
-        return LowLevelZeroOptimizer(optimizer, **config_dict)
+        return LowLevelZeroOptimizer(optimizer, **config_dict, verbose=verbose)
     else:
         from colossalai.zero.gemini.gemini_optimizer import ZeroOptimizer
         config_dict['clipping_norm'] = max_norm
-        return ZeroOptimizer(optimizer, model, **config_dict)
+        return ZeroOptimizer(optimizer, model, **config_dict, verbose=verbose)
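
Putting the two wrappers together. Apart from the verbose flag, this follows the zero2_config docstring example quoted above; the colossalai.zero import path for the wrappers and the toy model/optimizer are assumptions.

import torch.nn as nn
from torch.optim import Adam

from colossalai.zero import zero_model_wrapper, zero_optim_wrapper   # assumed package-level exports

model = nn.Linear(128, 128).cuda()
# With zero_stage=3 the flag would be forwarded to GeminiDDP; for stages 1 and 2
# it only matters on the optimizer wrapper, which forwards it to LowLevelZeroOptimizer.
model = zero_model_wrapper(model, zero_stage=2, verbose=True)

optim = Adam(model.parameters(), lr=1e-3)
zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
optim = zero_optim_wrapper(model, optim, optim_config=zero2_config, verbose=True)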