Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-05 11:02:05 +00:00
[shardformer] support sharded optimizer checkpointIO of HybridParallelPlugin (#4540)
* implement sharded optimizer saving
* add more param info
* finish implementation of sharded optimizer saving
* fix bugs in optimizer sharded saving
* add pp+zero test
* param group loading
* greedy loading of optimizer
* fix bug when loading
* implement optimizer sharded saving
* add optimizer test & arrange checkpointIO utils
* fix gemini sharding state_dict
* add verbose option
* add loading of master params
* fix typehint
* fix master/working mapping in fp16 amp
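As context for the diff below, here is a minimal sketch of the sharded optimizer save/load round trip this commit enables, using the same Booster calls the test exercises (booster.save_optimizer / booster.load_optimizer with shard=True). The GPT-2 toy model, the ckpt/optimizer path, and the particular plugin arguments are illustrative assumptions rather than code from this commit, and the snippet presumes a distributed environment has already been launched with enough GPUs for the chosen parallel sizes.

    from torch.optim import Adam
    from transformers import GPT2Config, GPT2LMHeadModel

    from colossalai.booster import Booster
    from colossalai.booster.plugin import HybridParallelPlugin

    # One of the parallel configurations exercised by the test below:
    # 2-way tensor parallelism + ZeRO stage 2, fp16 mixed precision.
    # (Assumes colossalai.launch_from_torch has already set up the process group.)
    plugin = HybridParallelPlugin(tp_size=2, pp_size=1, zero_stage=2,
                                  precision='fp16', initial_scale=1)
    booster = Booster(plugin=plugin)

    model = GPT2LMHeadModel(GPT2Config(n_layer=2)).cuda()   # toy GPT-2 for illustration
    optimizer = Adam(model.parameters(), lr=1e-3)
    model, optimizer, *_ = booster.boost(model, optimizer)

    # ... run forward/backward and optimizer.step() here ...

    # Save the optimizer state as a sharded checkpoint: multiple shard files of
    # roughly `size_per_shard` MB plus an index file under the given directory.
    booster.save_optimizer(optimizer, 'ckpt/optimizer', shard=True, size_per_shard=32)

    # Restore into a freshly boosted model/optimizer pair.
    new_model = GPT2LMHeadModel(GPT2Config(n_layer=2)).cuda()
    new_optimizer = Adam(new_model.parameters(), lr=1e-3)
    new_model, new_optimizer, *_ = booster.boost(new_model, new_optimizer)
    booster.load_optimizer(new_optimizer, 'ckpt/optimizer')

In the test below, the optimizer checkpoint path and the booster.save_optimizer call were previously commented out; this commit enables them, verifies with check_state_dict_equal that the reloaded optimizer state matches the original, then runs one more training step on both models and compares the updated weights with assert_close_loose.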
@@ -10,6 +10,7 @@ from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.shardformer.layer.utils import Randomizer
from colossalai.tensor.d_tensor.api import clear_layout_converter
from colossalai.testing import (
    assert_close_loose,
    check_state_dict_equal,
    clear_cache_before_run,
    parameterize,
@@ -19,34 +20,34 @@ from colossalai.testing import (
from tests.kit.model_zoo import model_zoo


# TODO (Baizhou): Add test cases for shard=False
@clear_cache_before_run()
@parameterize('shard', [True])
@parameterize('model_name', ['transformers_gpt'])
@parameterize('size_per_shard', [32])
@parameterize('test_config', [{
    'tp_size': 2,
    'pp_size': 2,
    'num_microbatches': 4,
    'precision': 'fp32',
}, {
    'tp_size': 1,
    'pp_size': 2,
    'num_microbatches': 4,
    'precision': 'fp32',
}, {
    'tp_size': 4,
    'pp_size': 1,
    'precision': 'fp32',
}, {
    'tp_size': 2,
    'pp_size': 1,
    'precision': 'fp32',
    'pp_size': 2,
    'num_microbatches': 4,
    'precision': 'fp16',
    'initial_scale': 1
}, {
    'tp_size': 2,
    'pp_size': 1,
    'zero_stage': 2,
    'precision': 'fp16',
    'initial_scale': 1
}, {
    'tp_size': 1,
    'pp_size': 2,
    'num_microbatches': 4,
    'zero_stage': 1,
    'precision': 'fp16',
    'initial_scale': 1
}])
def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict):
@@ -61,46 +62,91 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
        loss = criterion(outputs)
        return loss

    def _preprocess_data(data):
        if booster.plugin.stage_manager is not None:
            for k, v in data.items():
                if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__:
                    new_shape = [1] * v.dim()
                    new_shape[0] = 4
                    data[k] = v.to('cuda').repeat(*new_shape)
            return iter([data])
        else:
            return {k: v.cuda() for k, v in data.items()}

    model = model_fn().cuda()
    optimizer = Adam(model.parameters(), lr=1e-3)
    model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

    new_model = model_fn().cuda()
    new_optimizer = Adam(new_model.parameters(), lr=1e-3)
    new_model, new_optimizer, criterion, _, _ = booster.boost(new_model, new_optimizer, criterion)

    data = data_gen_fn()
    model.train()
    if booster.plugin.stage_manager is not None:
        for k, v in data.items():
            if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__:
                new_shape = [1] * v.dim()
                new_shape[0] = 4
                data[k] = v.to('cuda').repeat(*new_shape)
        data_iter = iter([data])
        output = booster.execute_pipeline(data_iter,
                                          model,
                                          _criterion,
                                          optimizer,
                                          return_loss=True,
                                          return_outputs=False)
        booster.execute_pipeline(_preprocess_data(data),
                                 model,
                                 _criterion,
                                 optimizer,
                                 return_loss=True,
                                 return_outputs=False)
    else:
        data = {k: v.cuda() for k, v in data.items()}
        output = model(**data)
        output = model(**_preprocess_data(data))
        loss = criterion(output)
        optimizer.backward(loss)

    optimizer.step()

    with shared_tempdir() as tempdir:

        model_ckpt_path = f"{tempdir}/model"
        # optimizer_ckpt_path = f"{tempdir}/optimizer"
        optimizer_ckpt_path = f"{tempdir}/optimizer"
        booster.save_model(model, model_ckpt_path, shard=shard, size_per_shard=size_per_shard)
        # booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard, size_per_shard=size_per_shard)
        booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard, size_per_shard=size_per_shard)
        dist.barrier()

        new_model = model_fn().cuda()
        new_optimizer = Adam(new_model.parameters(), lr=1e-3)
        new_model, new_optimizer, criterion, _, _ = booster.boost(new_model, new_optimizer, criterion)

        booster.load_model(new_model, model_ckpt_path)
        check_state_dict_equal(model.unwrap().state_dict(), new_model.unwrap().state_dict(), False)
        booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
        check_state_dict_equal(optimizer.unwrap().state_dict(), new_optimizer.unwrap().state_dict(), False)
        dist.barrier()

        # Check whether the loaded model & optimizer works smoothly.
        model.train()
        new_model.train()
        if booster.plugin.stage_manager is not None:
            booster.execute_pipeline(_preprocess_data(data),
                                     model,
                                     _criterion,
                                     optimizer,
                                     return_loss=True,
                                     return_outputs=False)
            booster.execute_pipeline(_preprocess_data(data),
                                     new_model,
                                     _criterion,
                                     new_optimizer,
                                     return_loss=True,
                                     return_outputs=False)
        else:
            old_model_loss = criterion(model(**_preprocess_data(data)))
            optimizer.backward(old_model_loss)
            new_model_loss = criterion(new_model(**_preprocess_data(data)))
            new_optimizer.backward(new_model_loss)

        optimizer.step()
        new_optimizer.step()

        # Check updated weights.
        stage_manager = booster.plugin.stage_manager

        if stage_manager is None or stage_manager.is_first_stage():
            assert_close_loose(model.unwrap().wte.weight.data, new_model.unwrap().wte.weight.data, atol=5e-3, rtol=5e-3)
            assert_close_loose(model.unwrap().h[0].mlp.c_fc.weight.data,
                               new_model.unwrap().h[0].mlp.c_fc.weight.data,
                               atol=5e-3,
                               rtol=5e-3)

    dist.barrier()
    Randomizer.reset_index()
    clear_layout_converter()