[misc] add verbose arg for zero and op builder (#3552)

* [misc] add print verbose

* [gemini] add print verbose

* [zero] add print verbose for low level

* [misc] add print verbose for op builder
Author: Hongxin Liu
Date: 2023-04-17 11:25:35 +08:00
Committed by: GitHub
Parent: 4341f5e8e6
Commit: 173dad0562
8 changed files with 55 additions and 28 deletions
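For orientation, a minimal usage sketch of the new flag on the Gemini (stage-3) path, mirroring the wrapper docstring changed further down. The export path of the wrappers, the HybridAdam import, and the already-launched distributed environment are assumptions, not part of this diff.

import torch
import torch.nn as nn

from colossalai.nn.optimizer import HybridAdam            # assumed import path
from colossalai.zero import zero_model_wrapper, zero_optim_wrapper    # assumed export path

# Assumes colossalai.launch(...) has already set up the distributed environment.
model = nn.Linear(16, 16).cuda()
gemini_config = dict(device=torch.device("cuda"), placement_policy="auto", pin_memory=True)

# verbose=True is forwarded to GeminiDDP -> init_chunk_manager (chunk-search summary on rank 0)
# and to ZeroOptimizer (gradient-overflow logging), as the hunks below show.
model = zero_model_wrapper(model, zero_stage=3, gemini_config=gemini_config, verbose=True)
optim = zero_optim_wrapper(model, HybridAdam(model.parameters(), lr=1e-3), verbose=True)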


@@ -20,6 +20,7 @@ def safe_div(a, b):
 def init_chunk_manager(model: nn.Module,
                        init_device: Optional[torch.device] = None,
                        hidden_dim: Optional[int] = None,
+                       verbose: bool = False,
                        **kwargs) -> ChunkManager:
     if hidden_dim:
         search_interval_byte = hidden_dim
@@ -39,7 +40,7 @@ def init_chunk_manager(model: nn.Module,
     total_size /= mb_size
     wasted_size /= mb_size
-    if dist.get_rank() == 0:
+    if verbose and dist.get_rank() == 0:
         print("searching chunk configuration is completed in {:.2f} s.\n".format(span_s),
               "used number: {:.2f} MB, wasted number: {:.2f} MB\n".format(total_size, wasted_size),
               "total wasted percentage is {:.2f}%".format(100 * safe_div(wasted_size, total_size + wasted_size)),


@@ -567,7 +567,8 @@ class GeminiDDP(ZeroDDP):
                  search_range_mb: int = 32,
                  hidden_dim: Optional[int] = None,
                  min_chunk_size_mb: float = 32,
-                 memstats: Optional[MemStats] = None) -> None:
+                 memstats: Optional[MemStats] = None,
+                 verbose: bool = False) -> None:
         """
         A torch.Module wrapper using ZeRO-DP and Gemini.
         ZeRO is for parallelism. Gemini is for memory management.
@@ -604,6 +605,7 @@ class GeminiDDP(ZeroDDP):
                                                   hidden_dim=hidden_dim,
                                                   search_range_mb=search_range_mb,
                                                   min_chunk_size_mb=min_chunk_size_mb,
-                                                  strict_ddp_flag=strict_ddp_mode)
+                                                  strict_ddp_flag=strict_ddp_mode,
+                                                  verbose=verbose)
         gemini_manager = GeminiManager(placement_policy, chunk_manager, memstats)
         super().__init__(module, gemini_manager, pin_memory, force_outputs_fp32, strict_ddp_mode)
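A minimal construction sketch for the new GeminiDDP argument. It assumes a launched distributed environment; the import path is inferred from the relative import in the wrapper diff below and is an assumption.

import torch
import torch.nn as nn
from colossalai.zero.gemini import GeminiDDP    # assumed path, matching `from .gemini import GeminiDDP` below

# Assumes colossalai.launch(...) has been called.
module = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 2))
# verbose=True is forwarded to init_chunk_manager, so the chunk-search summary prints on rank 0.
model = GeminiDDP(module, device=torch.device("cuda"), placement_policy="auto", verbose=True)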


@@ -54,6 +54,7 @@ class ZeroOptimizer(ColossalaiOptimizer):
         clipping_norm (float, optional): The norm value used to clip gradient. Defaults to 0.0.
         norm_type (float, optional): The type of norm used for gradient clipping. Currently, only L2-norm (norm_type=2.0)
             is supported in ZeroOptimizer. Defaults to 2.0.
+        verbose (bool, optional): Whether to print verbose information, including grad overflow info. Defaults to False.
     """

     def __init__(self,
@@ -69,6 +70,7 @@ class ZeroOptimizer(ColossalaiOptimizer):
                  max_scale: float = 2**32,
                  clipping_norm: float = 0.0,
                  norm_type: float = 2.0,
+                 verbose: bool = False,
                  **defaults: Any):
         super().__init__(optim)
         assert isinstance(module, ZeroDDP)
@@ -83,6 +85,7 @@ class ZeroOptimizer(ColossalaiOptimizer):
         self.chunk16_set: Set[Chunk] = set()
         self.clipping_flag = clipping_norm > 0.0
         self.max_norm = clipping_norm
+        self.verbose = verbose

         if self.clipping_flag:
             assert norm_type == 2.0, "ZeroOptimizer only supports L2 norm now"
@@ -221,7 +224,8 @@ class ZeroOptimizer(ColossalaiOptimizer):
         if found_inf:
             self.optim_state = OptimState.UNSCALED    # no need to unscale grad
             self.grad_scaler.update(found_inf)    # update gradient scaler
-            self._logger.info(f'Found overflow. Skip step')
+            if self.verbose:
+                self._logger.info(f'Found overflow. Skip step')
             self._clear_global_norm()    # clear recorded norm
             self.zero_grad()    # reset all gradients
             self._update_fp16_params()
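A hedged sketch of the optimizer side. The HybridAdam import, the GeminiDDP import path, and the launched distributed environment are assumptions; the ZeroOptimizer import is the one used in the wrapper diff below.

import torch
import torch.nn as nn
from colossalai.nn.optimizer import HybridAdam    # assumed import path
from colossalai.zero.gemini import GeminiDDP    # assumed import path
from colossalai.zero.gemini.gemini_optimizer import ZeroOptimizer    # path as used in the wrapper diff below

# Assumes colossalai.launch(...) has been called.
model = GeminiDDP(nn.Linear(32, 32), device=torch.device("cuda"), placement_policy="auto", verbose=True)
optim = ZeroOptimizer(HybridAdam(model.parameters(), lr=1e-3), model, initial_scale=2**14, verbose=True)
# On a gradient overflow during optim.step(), the scaler update and the skipped step are unchanged;
# the "Found overflow. Skip step" log line is now emitted only when verbose=True.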


@@ -440,6 +440,8 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
         # update loss scale if overflow occurs
         if found_inf:
             self._grad_store.reset_all_average_gradients()
+            if self._verbose:
+                self._logger.info(f'Found overflow. Skip step')
             self.zero_grad()
             return
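For the stage-1/2 path, a hedged sketch built only from constructor arguments that the wrapper diff below forwards to LowLevelZeroOptimizer (its verbose flag backs the self._verbose check above). Plain Adam and a launched distributed environment are assumptions.

import torch.nn as nn
from torch.optim import Adam
from colossalai.zero.low_level import LowLevelZeroOptimizer    # import path as used in the wrapper diff below

# Assumes colossalai.launch(...) has been called.
model = nn.Linear(8, 8).cuda()
optim = LowLevelZeroOptimizer(Adam(model.parameters(), lr=1e-3),
                              reduce_bucket_size=12 * 1024 * 1024,
                              overlap_communication=True,
                              partition_grad=True,    # ZeRO stage 2
                              verbose=True)           # gates the overflow log shown above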


@@ -7,7 +7,10 @@ import torch.nn as nn
 from .gemini import GeminiDDP


-def zero_model_wrapper(model: nn.Module, zero_stage: int = 1, gemini_config: Optional[Dict] = None):
+def zero_model_wrapper(model: nn.Module,
+                       zero_stage: int = 1,
+                       gemini_config: Optional[Dict] = None,
+                       verbose: bool = False):
     """This wrapper function is used to wrap your training model for ZeRO DDP.

     Example:
@@ -40,7 +43,7 @@ def zero_model_wrapper(model: nn.Module, zero_stage: int = 1, gemini_config: Opt
     if zero_stage in [1, 2]:
         wrapped_model = model
     else:
-        wrapped_model = GeminiDDP(model, **gemini_config)
+        wrapped_model = GeminiDDP(model, **gemini_config, verbose=verbose)

     setattr(wrapped_model, "_colo_zero_stage", zero_stage)
@@ -58,7 +61,8 @@ def zero_optim_wrapper(model: nn.Module,
                        max_scale: float = 2**32,
                        max_norm: float = 0.0,
                        norm_type: float = 2.0,
-                       optim_config: Optional[Dict] = None):
+                       optim_config: Optional[Dict] = None,
+                       verbose: bool = False):
     """This wrapper function is used to wrap your training optimizer for ZeRO DDP.

     Args:
@@ -79,6 +83,7 @@ def zero_optim_wrapper(model: nn.Module,
             >>> zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
             >>> optim = zero_optim_wrapper(model, optim, optim_config=zero2_config)
+        verbose (bool, optional): Whether to print the verbose info.
     """
     assert hasattr(model, "_colo_zero_stage"), "You should use `zero_ddp_wrapper` first"
     zero_stage = getattr(model, "_colo_zero_stage")
@@ -102,8 +107,8 @@ def zero_optim_wrapper(model: nn.Module,
         from colossalai.zero.low_level import LowLevelZeroOptimizer
         config_dict['partition_grad'] = zero_stage == 2
         config_dict['clip_grad_norm'] = max_norm
-        return LowLevelZeroOptimizer(optimizer, **config_dict)
+        return LowLevelZeroOptimizer(optimizer, **config_dict, verbose=verbose)
     else:
         from colossalai.zero.gemini.gemini_optimizer import ZeroOptimizer
         config_dict['clipping_norm'] = max_norm
-        return ZeroOptimizer(optimizer, model, **config_dict)
+        return ZeroOptimizer(optimizer, model, **config_dict, verbose=verbose)
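Finally, a hedged end-to-end sketch of the stage-2 path through these wrappers, following the docstring example in the hunk above; the export path of the wrappers and the launched distributed environment are assumptions.

import torch.nn as nn
from torch.optim import Adam
from colossalai.zero import zero_model_wrapper, zero_optim_wrapper    # assumed export path

# Assumes colossalai.launch(...) has been called.
model = zero_model_wrapper(nn.Linear(8, 8).cuda(), zero_stage=2)    # stages 1/2 keep the module unwrapped
zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
# verbose=True is forwarded to LowLevelZeroOptimizer here (or to ZeroOptimizer for stage 3).
optim = zero_optim_wrapper(model, Adam(model.parameters(), lr=1e-3),
                           max_norm=1.0, optim_config=zero2_config, verbose=True)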