diff --git a/colossalai/amp/naive_amp/grad_scaler/base_grad_scaler.py b/colossalai/amp/naive_amp/grad_scaler/base_grad_scaler.py
index 2d3e3700d..64710440b 100644
--- a/colossalai/amp/naive_amp/grad_scaler/base_grad_scaler.py
+++ b/colossalai/amp/naive_amp/grad_scaler/base_grad_scaler.py
@@ -12,7 +12,7 @@ __all__ = ['BaseGradScaler']
 
 class BaseGradScaler(ABC):
 
-    def __init__(self, initial_scale: int, verbose: bool):
+    def __init__(self, initial_scale: float, verbose: bool):
         assert initial_scale > 0
         self._scale = torch.cuda.FloatTensor([initial_scale])
         self._verbose = verbose
@@ -31,6 +31,7 @@ class BaseGradScaler(ABC):
     def state_dict(self) -> Dict:
         state_dict = dict()
         state_dict['scale'] = self.scale
+        return state_dict
 
     def load_state_dict(self, state_dict: Dict) -> None:
         self._scale = state_dict['scale']
diff --git a/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py b/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py
index 49f155f06..865500c31 100644
--- a/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py
+++ b/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py
@@ -3,6 +3,7 @@
 
 import torch
 from .base_grad_scaler import BaseGradScaler
+from typing import Optional
 
 __all__ = ['DynamicGradScaler']
 
@@ -10,12 +11,12 @@ __all__ = ['DynamicGradScaler']
 class DynamicGradScaler(BaseGradScaler):
 
     def __init__(self,
-                 initial_scale: int = 2**16,
-                 growth_factor: int = 2,
+                 initial_scale: float = 2**16,
+                 growth_factor: float = 2,
                  backoff_factor: float = 0.5,
                  growth_interval: int = 1000,
-                 min_scale: int = None,
-                 max_scale: int = None,
+                 min_scale: Optional[float] = None,
+                 max_scale: Optional[float] = None,
                  hysteresis: int = 2,
                  verbose: bool = False):
         super().__init__(initial_scale, verbose)
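
The grad-scaler changes above widen the scale arguments to float/Optional[float] and make BaseGradScaler.state_dict() actually return the dict it builds (previously the method fell through and returned None). A minimal round-trip sketch, assuming DynamicGradScaler is re-exported from colossalai.amp.naive_amp.grad_scaler and a CUDA device is available (the base class stores the scale as a torch.cuda.FloatTensor):

    import torch
    from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler

    # requires CUDA: BaseGradScaler keeps the scale in a torch.cuda.FloatTensor
    scaler = DynamicGradScaler(initial_scale=2**16, min_scale=1.0, max_scale=2**32)

    state = scaler.state_dict()      # with the fix this returns {'scale': tensor([65536.], device='cuda:0')}
    scaler.load_state_dict(state)    # restores the loss scale from the saved dict
    print(state['scale'])
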
diff --git a/colossalai/zero/sharded_model/sharded_model_v2.py b/colossalai/zero/sharded_model/sharded_model_v2.py
index 1db81991c..0ac114f58 100644
--- a/colossalai/zero/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/sharded_model/sharded_model_v2.py
@@ -358,8 +358,8 @@ class ShardedModelV2(nn.Module):
                 assert param.colo_attr.saved_grad.is_null(
                 ), 'Gradien accumulation is not supported when reuse_fp16_shard=True'
 
-                param.colo_attr.reset_grad_payload(grad)
-                param.colo_attr.reset_data_payload(grad)    # release the memory of param
+                param.colo_attr.reset_grad_payload(grad.data)
+                param.colo_attr.reset_data_payload(grad.data)    # release the memory of param
 
                 if param.colo_attr.is_replicated:
                     param.colo_attr.sharded_data_tensor.is_sharded = True
diff --git a/colossalai/zero/sharded_optim/sharded_optim_v2.py b/colossalai/zero/sharded_optim/sharded_optim_v2.py
index c4fbf1b7c..ac6c88d1d 100644
--- a/colossalai/zero/sharded_optim/sharded_optim_v2.py
+++ b/colossalai/zero/sharded_optim/sharded_optim_v2.py
@@ -83,11 +83,12 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
                  min_scale: float = 1,
                  growth_factor: float = 2,
                  backoff_factor: float = 0.5,
-                 growth_interval: float = 1000,
-                 hysteresis: float = 2,
-                 max_scale: int = 2**32,
+                 growth_interval: int = 1000,
+                 hysteresis: int = 2,
+                 max_scale: float = 2**32,
                  dp_process_group: Optional[ProcessGroup] = None,
-                 mp_process_group: Optional[ProcessGroup] = None) -> None:
+                 mp_process_group: Optional[ProcessGroup] = None,
+                 verbose: bool = False) -> None:
         assert isinstance(sharded_model, ShardedModelV2), 'model must be wrapped with ShardedModel'
 
         super().__init__(optimizer)
@@ -115,14 +116,17 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
                                              max_scale=max_scale)
         self._found_overflow: Tensor = torch.IntTensor([0]).to(torch.cuda.current_device())
         self._logger = get_dist_logger("ShardedOptimizerV2")
+        self._verbose = verbose
 
         # Store fp32 param shards
         self._register_master_weight()
         if self.gpu_margin_mem_ratio != 0.0 and not isinstance(sharded_model._tensor_placement_policy,
                                                                AutoTensorPlacementPolicy):
             self._logger.warning(f'gpu_margin_mem_ratio is meaningless when tensor_placement_policy is not "auto"')
-        self._logger.debug(f"After init ShardedOptimizerV2 consumes {self.get_memory_usage()[0] / 1e6} MB CUDA Memory!",
-                           ranks=[0])
+
+        if self._verbose:
+            self._logger.debug(
+                f"After init ShardedOptimizerV2 consumes {self.get_memory_usage()[0] / 1e6} MB CUDA Memory!", ranks=[0])
 
         self._use_memory_tracer = self.model.use_memory_tracer
         if self._use_memory_tracer:
@@ -193,15 +197,20 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
 
         self._point_param_fp16_to_master_param()
 
-        self._logger.debug(
-            f"Before step ShardedOptimizerV2 consumes {self.get_memory_usage()[0] / 1e6} MB CUDA Memory, {self.get_memory_usage()[1] / 1e6} MB CUDA Memory!",
-            ranks=[0])
+        if self._verbose:
+            gpu_mem, cpu_mem = self.get_memory_usage()
+            self._logger.debug(
+                f"Before step ShardedOptimizerV2 consumes {gpu_mem / 1e6} MB CUDA Memory, {cpu_mem / 1e6} MB CUDA Memory!",
+                ranks=[0])
 
         ret = self.optim.step(*args, **kwargs)
 
-        self._logger.debug(
-            f"After step ShardedOptimizerV2 consumes {self.get_memory_usage()[0] / 1e6} MB CUDA Memory, {self.get_memory_usage()[1] / 1e6} MB CUDA Memory!",
-            ranks=[0])
+        if self._verbose:
+            gpu_mem, cpu_mem = self.get_memory_usage()
+            self._logger.debug(
+                f"After step ShardedOptimizerV2 consumes {gpu_mem / 1e6} MB CUDA Memory, {cpu_mem / 1e6} MB CUDA Memory!",
+                ranks=[0])
+
         self._copy_master_model_to_model_fp16()
 
         return ret
diff --git a/colossalai/zero/sharded_param/sharded_param.py b/colossalai/zero/sharded_param/sharded_param.py
index 7992a7f4a..72b88ec2f 100644
--- a/colossalai/zero/sharded_param/sharded_param.py
+++ b/colossalai/zero/sharded_param/sharded_param.py
@@ -5,18 +5,13 @@ from colossalai.zero.sharded_param.tensor_utils import colo_tensor_mem_usage
 from .tensorful_state import StatefulTensor, TensorState
 from typing import List
 
-# use this tensor as empty data point for parameters
-# we do not want users use param.data when its torch payload is removed
-# empty tensor is expected to raise error when get used
-FAKE_EMPTY_TENSOR = torch.BoolTensor([], device='cpu')
-
 EMPTY_TENSOR_DICT = {}
 
 
 def get_empty_tensor(device: torch.device, dtype: torch.dtype):
     key = (device, dtype)
     if key not in EMPTY_TENSOR_DICT:
-        EMPTY_TENSOR_DICT[key] = FAKE_EMPTY_TENSOR.to(device, dtype)
+        EMPTY_TENSOR_DICT[key] = torch.empty(0, dtype=dtype, device=device)
     return EMPTY_TENSOR_DICT[key]
 
 
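
In sharded_param.py, the fake empty payload is now built with torch.empty(0, dtype=dtype, device=device), so the zero-element placeholder is allocated directly with the requested device and dtype instead of being converted from a module-level CPU bool tensor. A standalone sketch of the same caching pattern (names here are illustrative, not ColossalAI API):

    import torch

    _EMPTY_TENSOR_CACHE = {}    # mirrors EMPTY_TENSOR_DICT: one placeholder per (device, dtype)

    def get_empty_placeholder(device: torch.device, dtype: torch.dtype) -> torch.Tensor:
        # return a cached zero-element tensor used as a stand-in for a released payload
        key = (device, dtype)
        if key not in _EMPTY_TENSOR_CACHE:
            _EMPTY_TENSOR_CACHE[key] = torch.empty(0, dtype=dtype, device=device)
        return _EMPTY_TENSOR_CACHE[key]

    print(get_empty_placeholder(torch.device('cpu'), torch.float16).numel())    # prints 0
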
diff --git a/tests/test_zero/test_stateful_tensor_mgr.py b/tests/test_zero/test_stateful_tensor_mgr.py
index 93ef3af8e..d672fb549 100644
--- a/tests/test_zero/test_stateful_tensor_mgr.py
+++ b/tests/test_zero/test_stateful_tensor_mgr.py
@@ -72,23 +72,13 @@ def run_stm():
 
     # warmup done
     # only 2 params can be on CUDA
-    limit_cuda_memory(0.26)
+    limit_cuda_memory(0.26 / tensor_placement_policy._steady_cuda_cap_ratio)
     # use OPT-like eviction strategy
     apply_adjust(model, model.p0, [model.p0, model.p1], stateful_tensor_mgr)
-    mem_collector.sample_model_data()
-    mem_collector.sample_overall_data()
     apply_adjust(model, model.p1, [model.p0, model.p1], stateful_tensor_mgr)
-    mem_collector.sample_model_data()
-    mem_collector.sample_overall_data()
     apply_adjust(model, model.p2, [model.p0, model.p2], stateful_tensor_mgr)
-    mem_collector.sample_model_data()
-    mem_collector.sample_overall_data()
     apply_adjust(model, model.p0, [model.p0, model.p2], stateful_tensor_mgr)
-    mem_collector.sample_model_data()
-    mem_collector.sample_overall_data()
     apply_adjust(model, model.p1, [model.p1, model.p2], stateful_tensor_mgr)
-    mem_collector.sample_model_data()
-    mem_collector.finish_collection()
 
 
 def apply_adjust(model: torch.nn.Module, compute_param: Parameter, cuda_param_after_adjust: List[Parameter],