mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-11-01 06:19:48 +00:00
[zero] fix init bugs in zero context (#686)
* adapt model weight initialization for methods in PyTorch nn.init
This commit is contained in:
@@ -28,7 +28,6 @@ def run_model_test(init_device_type, shard_strategy_class):
|
||||
|
||||
for get_components_func in non_distributed_component_funcs:
|
||||
model_builder, _, _, _, _ = get_components_func()
|
||||
model_numel_tensor = torch.zeros(1, dtype=torch.int)
|
||||
if init_device_type == 'cuda':
|
||||
init_device = torch.device(f"cuda:{get_current_device()}")
|
||||
elif init_device_type == 'cpu':
|
||||
@@ -40,8 +39,7 @@ def run_model_test(init_device_type, shard_strategy_class):
|
||||
with ZeroInitContext(target_device=init_device,
|
||||
shard_strategy=shard_strategy_class(),
|
||||
shard_param=True,
|
||||
model_numel_tensor=model_numel_tensor,
|
||||
rm_torch_payload_on_the_fly=False):
|
||||
model_numel_tensor=model_numel_tensor):
|
||||
model = model_builder(checkpoint=True)
|
||||
|
||||
for param in model.parameters():
|
||||
|
||||
@@ -29,12 +29,9 @@ def run_model_test(enable_autocast, shard_strategy_class):
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, _, _, criterion = get_components_func()
|
||||
|
||||
rm_torch_payload_on_the_fly = False
|
||||
|
||||
with ZeroInitContext(target_device=torch.cuda.current_device(),
|
||||
with ZeroInitContext(target_device=torch.device('cuda', torch.cuda.current_device()),
|
||||
shard_strategy=shard_strategy,
|
||||
shard_param=True,
|
||||
rm_torch_payload_on_the_fly=rm_torch_payload_on_the_fly):
|
||||
shard_param=True):
|
||||
zero_model = model_builder(checkpoint=True)
|
||||
zero_model = ShardedModelV2(zero_model, shard_strategy, use_memory_tracer=True)
|
||||
|
||||
|
||||
@@ -60,8 +60,7 @@ def _run_test_sharded_optim_v2(cpu_offload, shard_strategy_class, use_cpuadam, g
|
||||
with ZeroInitContext(
|
||||
target_device=torch.device(f'cpu:0') if cpu_offload else torch.device(f'cuda:{get_current_device()}'),
|
||||
shard_strategy=shard_strategy,
|
||||
shard_param=True,
|
||||
rm_torch_payload_on_the_fly=False):
|
||||
shard_param=True):
|
||||
zero_model = model_builder(checkpoint=True)
|
||||
zero_model = ShardedModelV2(
|
||||
zero_model,
|
||||
|
||||
@@ -27,10 +27,9 @@ def run_zero_state_dict(shard_strategy_class):
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, test_dataloader, optimizer, criterion = get_components_func()
|
||||
|
||||
with ZeroInitContext(target_device=torch.cuda.current_device(),
|
||||
with ZeroInitContext(target_device=torch.device('cuda', torch.cuda.current_device()),
|
||||
shard_strategy=shard_strategy,
|
||||
shard_param=True,
|
||||
rm_torch_payload_on_the_fly=False):
|
||||
shard_param=True):
|
||||
zero_model = model_builder(checkpoint=True)
|
||||
zero_model = ShardedModelV2(zero_model, shard_strategy)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user