Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-02 09:38:05 +00:00
[zero] improve adaptability for not-shard parameters (#708)

* adapt post grad hooks for not-shard parameters
* adapt optimizer for not-shard parameters
* offload gradients for not-replicated parameters
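The common thread in the hunks below is that the replication flag moves from the parameter itself (`param.is_replicated`) onto its `colo_attr`, and the grad hooks, optimizer, and tests learn to handle parameters that are neither sharded nor replicated (MoE expert weights). As a rough mental model — a minimal sketch, not ColossalAI's implementation — a post-grad handler only all-reduces gradients of replicated parameters:

```python
import torch
import torch.distributed as dist

def reduce_replicated_grads(model: torch.nn.Module):
    """Illustrative only: sync grads of replicated params, skip the rest."""
    for param in model.parameters():
        if param.grad is None:
            continue
        # Stand-in for the `colo_attr.is_replicated` flag this commit adds;
        # ordinary parameters default to replicated.
        if getattr(param, 'is_replicated', True):
            dist.all_reduce(param.grad)
            param.grad /= dist.get_world_size()
        # not-replicated parameters (e.g. MoE experts) keep their local grad
```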
@@ -71,9 +71,9 @@ def run_moe_zero_init(init_device_type, shard_strategy_class):
 
         # the parameters in moe experts is not replicated
         if 'experts' in name:
-            assert not param.is_replicated
+            assert not param.colo_attr.is_replicated
         else:
-            assert param.is_replicated
+            assert param.colo_attr.is_replicated
 
         if param.colo_attr.param_is_sharded:
             assert param.colo_attr.sharded_data_tensor.payload.device.type == init_device.type, \
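Expert weights differ across ranks by design (each rank owns its own experts), so they must be excluded from replication checks and collectives; the test identifies them by name. A hedged sketch of that tagging convention, with `ColoAttrStub` a hypothetical stand-in for the real `colo_attr` object:

```python
import torch.nn as nn

class ColoAttrStub:
    """Hypothetical stand-in for ColossalAI's per-parameter colo_attr."""
    def __init__(self, is_replicated: bool):
        self.is_replicated = is_replicated

def tag_moe_params(model: nn.Module):
    for name, param in model.named_parameters():
        # expert weights stay rank-local; everything else is replicated
        param.colo_attr = ColoAttrStub(is_replicated='experts' not in name)
```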
@@ -36,7 +36,7 @@ def run_model_test(enable_autocast, shard_strategy_class):
 
     # check whether parameters are identical in ddp
     for name, p in zero_model.named_parameters():
-        if not p.colo_attr.param_is_sharded and p.is_replicated:
+        if not p.colo_attr.param_is_sharded and p.colo_attr.is_replicated:
             assert_equal_in_group(p.colo_attr.sharded_data_tensor.payload)
 
     model = MoeModel().half()
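`assert_equal_in_group` verifies a tensor is bit-identical on every rank, which should hold only for replicated, unsharded parameters — hence the extra `colo_attr.is_replicated` guard. A minimal sketch of such a check (the real helper lives in ColossalAI's test utilities; this version is an assumption for illustration):

```python
import torch
import torch.distributed as dist

def assert_equal_in_group_sketch(tensor: torch.Tensor, group=None):
    """Gather the tensor from every rank and compare against our copy."""
    world_size = dist.get_world_size(group)
    gathered = [torch.empty_like(tensor) for _ in range(world_size)]
    dist.all_gather(gathered, tensor, group=group)
    for rank, other in enumerate(gathered):
        assert torch.equal(tensor, other), f'mismatch against rank {rank}'
```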
@@ -48,8 +48,13 @@ def _run_step(model, optimizer, data, label, criterion, grad_handler):
 
 @parameterize("cpu_offload", [True])
 @parameterize("use_cpuadam", [True])    # We do not use Hybrid Adam right now, since it has a little bug
+@parameterize("reuse_fp16_shard", [True, False])
 @parameterize("shard_strategy_class", [TensorShardStrategy, BucketTensorShardStrategy])
-def _run_test_sharded_optim_v2(cpu_offload, shard_strategy_class, use_cpuadam, gpu_margin_mem_ratio=0.0):
+def _run_test_sharded_optim_v2(cpu_offload,
+                               shard_strategy_class,
+                               use_cpuadam,
+                               reuse_fp16_shard,
+                               gpu_margin_mem_ratio=0.0):
     shard_strategy = shard_strategy_class()
     if use_cpuadam and cpu_offload is False:
         return
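Stacked `@parameterize` decorators run the test body over the Cartesian product of their value lists, so the new `reuse_fp16_shard` line doubles the matrix to cover both the reuse and non-reuse paths. A rough sketch of the mechanism (ColossalAI ships its own decorator in its testing utilities; this reimplementation is illustrative only):

```python
from functools import wraps

def parameterize_sketch(name, values):
    """Each stacked decorator multiplies the runs by len(values)."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(**kwargs):
            for value in values:
                fn(**{**kwargs, name: value})
        return wrapper
    return decorator

@parameterize_sketch("reuse_fp16_shard", [True, False])
@parameterize_sketch("cpu_offload", [True])
def fake_test(reuse_fp16_shard, cpu_offload):
    print(reuse_fp16_shard, cpu_offload)

fake_test()    # runs 2 * 1 = 2 configurations
```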
@@ -63,17 +68,15 @@ def _run_test_sharded_optim_v2(cpu_offload, shard_strategy_class, use_cpuadam, g
                          shard_param=True):
         zero_model = MoeModel()
 
-    zero_model = ShardedModelV2(
-        zero_model,
-        shard_strategy,
-        offload_config=dict(device='cpu') if cpu_offload else None,
-        use_memory_tracer=gpu_margin_mem_ratio > 0.0,
-        reuse_fp16_shard=use_cpuadam,
-    )
+    zero_model = ShardedModelV2(zero_model,
+                                shard_strategy,
+                                offload_config=dict(device='cpu') if cpu_offload else None,
+                                use_memory_tracer=gpu_margin_mem_ratio > 0.0,
+                                reuse_fp16_shard=reuse_fp16_shard)
 
     # check whether parameters are identical in ddp
     for name, p in zero_model.named_parameters():
-        if not p.colo_attr.param_is_sharded and p.is_replicated:
+        if not p.colo_attr.param_is_sharded and p.colo_attr.is_replicated:
             assert_equal_in_group(p.colo_attr.sharded_data_tensor.payload.to(get_current_device()))
 
     model = MoeModel().half()
@@ -88,8 +91,7 @@ def _run_test_sharded_optim_v2(cpu_offload, shard_strategy_class, use_cpuadam, g
                                        sharded_optim,
                                        cpu_offload=cpu_offload,
                                        initial_scale=2**5,
-                                       gpu_margin_mem_ratio=gpu_margin_mem_ratio,
-                                       keep_unsharded=True)
+                                       gpu_margin_mem_ratio=gpu_margin_mem_ratio)
 
     amp_config = dict(opt_level='O2', keep_batchnorm_fp32=False)
     apex_model, apex_optimizer = convert_to_apex_amp(model, optim, amp_config)
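Previously the fp16 shard was reused exactly when CPUAdam was in play (`reuse_fp16_shard=use_cpuadam`) and the optimizer wrapper needed a `keep_unsharded=True` escape hatch; after this change `reuse_fp16_shard` is an independent knob and the flag is gone. A quick sanity check of the configurations the parameterized test now exercises:

```python
import itertools

cpu_offload = [True]
use_cpuadam = [True]
reuse_fp16_shard = [True, False]
shard_strategy = ['TensorShardStrategy', 'BucketTensorShardStrategy']

combos = list(itertools.product(cpu_offload, use_cpuadam, reuse_fp16_shard, shard_strategy))
assert len(combos) == 4    # 1 * 1 * 2 * 2 test configurations
```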
@@ -93,7 +93,7 @@ def check_grads_padding(model, zero_model, loose=False):
     rank = dist.get_rank()
     for (name, p), (zero_name, zero_p) in zip(model.named_parameters(), zero_model.named_parameters()):
         # zero_grad = zero_p.grad.clone().to(p.device)
-        if zero_p.colo_attr.param_is_sharded:
+        if zero_p.colo_attr.is_replicated:
             zero_grad = zero_p.colo_attr.saved_grad.payload.clone().to(p.device)
             chunks = torch.flatten(p.grad).chunk(dist.get_world_size())
             if rank >= len(chunks):
@@ -102,8 +102,9 @@ def check_grads_padding(model, zero_model, loose=False):
             if zero_grad.size(0) > grad.size(0):
                 zero_grad = zero_grad[:grad.size(0)]
         else:
-            grad = p.grad
             zero_grad = zero_p.colo_attr.saved_grad.payload
+            grad = p.grad.to(zero_grad.dtype)
 
         assert grad.dtype == zero_grad.dtype
         assert allclose(grad, zero_grad, loose=loose), f'diff: {grad - zero_grad}'
@@ -134,7 +135,7 @@ def check_sharded_model_params(model, zero_model, loose=False, reuse_fp16_shard=
             if zero_p.size(0) > p.size(0):
                 zero_p = zero_p[:p.size(0)]
         else:
-            zero_p = zero_p.colo_attr.sharded_data_tensor.payload
+            zero_p = zero_p.colo_attr.sharded_data_tensor.payload.to(p.device)
 
         assert p.dtype == zero_p.dtype
         assert allclose(p, zero_p, loose=loose), f'{p} vs {zero_p}'
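The trailing `.to(p.device)` matters once payloads are offloaded: with `cpu_offload=True` the zero model's tensor lives on the CPU while the reference parameter sits on the GPU, and torch refuses to compare tensors across devices. A minimal demonstration (requires a CUDA device):

```python
import torch

if torch.cuda.is_available():
    p = torch.randn(4, device='cuda')
    zero_p = p.detach().cpu()                    # offloaded copy
    # torch.allclose(p, zero_p) would raise a device-mismatch error
    assert torch.allclose(p, zero_p.to(p.device))
```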