Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-03 18:19:58 +00:00)
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
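The hooks referenced above are normally applied repository-wide with `pre-commit run --all-files`. As a minimal sketch of that workflow (an assumed illustration, not part of this commit), the whole-tree run can be driven from Python like so:

import subprocess

def run_precommit_on_all_files() -> int:
    # Install the git hook, then apply every configured hook (formatters,
    # linters, etc.) to the entire working tree rather than only staged files.
    subprocess.run(["pre-commit", "install"], check=True)
    result = subprocess.run(["pre-commit", "run", "--all-files"])
    return result.returncode

if __name__ == "__main__":
    raise SystemExit(run_precommit_on_all_files())

The formatting-only changes in the diff below (single quotes becoming double quotes, multi-line calls collapsed) are what such a run produces once a Black-style formatter is among the configured hooks.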
@@ -19,50 +19,30 @@ from colossalai.testing import (
from tests.kit.model_zoo import model_zoo
MODEL_PLACEMENT_CONFIGS = [
{
'placement_policy': 'static',
'shard_param_frac': 0.0
}, # zero2
{
'placement_policy': 'static',
'shard_param_frac': 1.0
}, # zero3
{
'placement_policy': 'static',
'shard_param_frac': 0.5
}, # zero3-half
{"placement_policy": "static", "shard_param_frac": 0.0}, # zero2
{"placement_policy": "static", "shard_param_frac": 1.0}, # zero3
{"placement_policy": "static", "shard_param_frac": 0.5}, # zero3-half
]
OPTIM_PLACEMENT_CONFIGS = [
{
'placement_policy': 'static',
'shard_param_frac': 0.0,
'offload_optim_frac': 0.0
}, # zero2
{
'placement_policy': 'static',
'shard_param_frac': 0.0,
'offload_optim_frac': 1.0
}, # zero2-offload
{
'placement_policy': 'static',
'shard_param_frac': 0.0,
'offload_optim_frac': 0.5
}, # zero2-offload-half
{"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 0.0}, # zero2
{"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 1.0}, # zero2-offload
{"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 0.5}, # zero2-offload-half
]
@clear_cache_before_run()
@parameterize('placement_config', MODEL_PLACEMENT_CONFIGS)
@parameterize('model_name', ['transformers_bert_for_sequence_classification'])
@parameterize('use_safetensors', [False, True])
@parameterize("placement_config", MODEL_PLACEMENT_CONFIGS)
@parameterize("model_name", ["transformers_bert_for_sequence_classification"])
@parameterize("use_safetensors", [False, True])
def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: bool):
from transformers import BertForSequenceClassification
(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
bert_model = model_fn()
with shared_tempdir() as tempdir:
pretrained_path = os.path.join(tempdir, 'pretrained')
pretrained_path = os.path.join(tempdir, "pretrained")
bert_model.config.save_pretrained(save_directory=pretrained_path)
plugin = GeminiPlugin(**placement_config)

@@ -70,24 +50,22 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
bert_model, _, _, _, _ = booster.boost(bert_model)
model_size = sum(p.numel() * p.element_size() for p in bert_model.parameters()) / 1024**2
booster.save_model(bert_model,
pretrained_path,
True,
True,
'', (model_size / 3),
use_safetensors=use_safetensors)
booster.save_model(
bert_model, pretrained_path, True, True, "", (model_size / 3), use_safetensors=use_safetensors
)
dist.barrier()
new_bert_model = BertForSequenceClassification.from_pretrained(pretrained_path)
check_state_dict_equal(bert_model.state_dict(only_rank_0=False, dtype=torch.float32),
new_bert_model.state_dict(), False)
check_state_dict_equal(
bert_model.state_dict(only_rank_0=False, dtype=torch.float32), new_bert_model.state_dict(), False
)
@clear_cache_before_run()
@parameterize('placement_config', OPTIM_PLACEMENT_CONFIGS)
@parameterize('shard', [False, True])
@parameterize('model_name', ['transformers_gpt'])
@parameterize('size_per_shard', [32])
@parameterize("placement_config", OPTIM_PLACEMENT_CONFIGS)
@parameterize("shard", [False, True])
@parameterize("model_name", ["transformers_gpt"])
@parameterize("size_per_shard", [32])
def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_shard: int):
(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
criterion = lambda x: x.mean()

@@ -102,7 +80,7 @@ def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_sha
new_model, new_optimizer, criterion, _, _ = booster.boost(new_model, new_optimizer, criterion)
data = data_gen_fn()
data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}
data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
output = model(**data)
output = output_transform_fn(output)
output_key = list(output.keys())[0]

@@ -123,13 +101,14 @@ def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_sha
check_state_dict_equal(model.state_dict(only_rank_0=False), new_model.state_dict(only_rank_0=False), False)
booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(only_rank_0=False),
False)
check_state_dict_equal(
optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(only_rank_0=False), False
)
# Check the new model/optimizer can successfully run.
data = data_gen_fn()
data = {
k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()
k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()
}
output = new_model(**data)
output = output_transform_fn(output)

@@ -143,13 +122,13 @@ def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_sha
def run_dist(rank, world_size, port):
config = {}
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
exam_state_dict()
exam_state_dict_with_origin()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@pytest.mark.parametrize("world_size", [2])
@rerun_if_address_is_in_use()
def test_gemini_ckpIO(world_size):
spawn(run_dist, world_size)
@@ -19,10 +19,9 @@ from tests.kit.model_zoo import model_zoo
@clear_cache_before_run()
@parameterize('shard', [False, True])
@parameterize('model_name', ['transformers_gpt'])
@parameterize("shard", [False, True])
@parameterize("model_name", ["transformers_gpt"])
def exam_torch_load_from_gemini(shard: bool, model_name: str):
(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
criterion = lambda x: x.mean()
plugin = GeminiPlugin(precision="fp16", initial_scale=(2**14))

@@ -33,7 +32,7 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
data = data_gen_fn()
data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}
data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
output = model(**data)
output = output_transform_fn(output)
output_key = list(output.keys())[0]

@@ -60,8 +59,11 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
new_booster.load_model(new_model, model_ckpt_path, strict=True)
# Add prefix to get aligned with pytorch parameter names.
check_state_dict_equal(model.state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),
new_model.state_dict(), False)
check_state_dict_equal(
model.state_dict(only_rank_0=False, prefix="module.module.", dtype=torch.float32),
new_model.state_dict(),
False,
)
new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(), False)

@@ -69,7 +71,7 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
# Check the new model/optimizer can successfully run.
data = data_gen_fn()
data = {
k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()
k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()
}
output = new_model(**data)
output = output_transform_fn(output)

@@ -82,10 +84,9 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
@clear_cache_before_run()
@parameterize('shard', [False, True])
@parameterize('model_name', ['transformers_gpt'])
@parameterize("shard", [False, True])
@parameterize("model_name", ["transformers_gpt"])
def exam_gemini_load_from_torch(shard: bool, model_name: str):
(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
criterion = lambda x: x.mean()
plugin = TorchDDPPlugin()

@@ -96,7 +97,7 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
data = data_gen_fn()
data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}
data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
output = model(**data)
output = output_transform_fn(output)
output_key = list(output.keys())[0]

@@ -123,8 +124,11 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
new_booster.load_model(new_model, model_ckpt_path, strict=True)
# Add prefix to get aligned with pytorch parameter names.
check_state_dict_equal(new_model.state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),
model.state_dict(), False)
check_state_dict_equal(
new_model.state_dict(only_rank_0=False, prefix="module.module.", dtype=torch.float32),
model.state_dict(),
False,
)
new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
old_state_dict = optimizer.state_dict()

@@ -132,18 +136,19 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
# Comparison of param_groups needs special care here,
# since not all hyperparameters in Adam are used by HybridAdam
hyperparameters_to_examine = ['params', 'lr', 'betas', 'eps', 'weight_decay']
for old_group, new_group in zip(old_state_dict['param_groups'], new_state_dict['param_groups']):
hyperparameters_to_examine = ["params", "lr", "betas", "eps", "weight_decay"]
for old_group, new_group in zip(old_state_dict["param_groups"], new_state_dict["param_groups"]):
for k in hyperparameters_to_examine:
assert k in old_group and k in new_group, \
f"Old group's keys: {list(old_group.keys())}, New group's keys: {list(new_group.keys())}"
assert (
k in old_group and k in new_group
), f"Old group's keys: {list(old_group.keys())}, New group's keys: {list(new_group.keys())}"
assert old_group[k] == new_group[k]
check_state_dict_equal(old_state_dict['state'], new_state_dict['state'], False)
check_state_dict_equal(old_state_dict["state"], new_state_dict["state"], False)
# Check the new model/optimizer can successfully run.
data = data_gen_fn()
data = {
k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()
k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()
}
output = new_model(**data)
output = output_transform_fn(output)

@@ -157,13 +162,13 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
def run_dist(rank, world_size, port):
config = {}
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
exam_torch_load_from_gemini()
exam_gemini_load_from_torch()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@pytest.mark.parametrize("world_size", [2])
@rerun_if_address_is_in_use()
def test_gemini_ckpIO(world_size):
spawn(run_dist, world_size)
@@ -5,7 +5,6 @@ import torch
from torch.optim import Adam
from torchvision.models import resnet18
from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO
from colossalai.checkpoint_io import GeneralCheckpointIO
from colossalai.testing import check_state_dict_equal, clear_cache_before_run, parameterize

@@ -18,7 +17,7 @@ from colossalai.testing import check_state_dict_equal, clear_cache_before_run, p
@clear_cache_before_run()
@parameterize('use_safetensors', [True, False])
@parameterize("use_safetensors", [True, False])
def test_unsharded_checkpoint(use_safetensors: bool):
# create a model and optimizer
model = resnet18()

@@ -59,7 +58,7 @@ def test_unsharded_checkpoint(use_safetensors: bool):
check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
@pytest.mark.parametrize('use_safetensors', [True, False])
@pytest.mark.parametrize("use_safetensors", [True, False])
def test_sharded_model_checkpoint(use_safetensors: bool):
# create a model and optimizer
model = resnet18()

@@ -75,11 +74,9 @@ def test_sharded_model_checkpoint(use_safetensors: bool):
# create a temp file for checkpoint
if use_safetensors:
suffix = ".safetensors"
SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
pass
else:
suffix = ".bin"
WEIGHTS_INDEX_NAME = "model.bin.index.json"
pass
model_ckpt_dir = tempfile.TemporaryDirectory()
optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile()

@@ -103,7 +100,6 @@ def test_sharded_model_checkpoint(use_safetensors: bool):
def test_sharded_optimizer_checkpoint():
# create a model and optimizer
model = resnet18()
optimizer = Adam(model.parameters(), lr=0.001)

@@ -162,16 +158,11 @@ def test_sharded_optimizer_checkpoint():
def test_sharded_optimizer_multiple_param_groups():
# create a model and optimizer
model = resnet18()
optimizer = Adam([{
'params': model.layer1.parameters()
}, {
'params': model.layer2.parameters(),
'lr': 0.002
}],
lr=0.001)
optimizer = Adam(
[{"params": model.layer1.parameters()}, {"params": model.layer2.parameters(), "lr": 0.002}], lr=0.001
)
# create test data sample
x = torch.randn(1, 3, 224, 224)

@@ -194,13 +185,9 @@ def test_sharded_optimizer_multiple_param_groups():
# create new model
new_model = resnet18()
new_optimizer = Adam([{
'params': new_model.layer1.parameters()
}, {
'params': new_model.layer2.parameters(),
'lr': 0.002
}],
lr=0.001)
new_optimizer = Adam(
[{"params": new_model.layer1.parameters()}, {"params": new_model.layer2.parameters(), "lr": 0.002}], lr=0.001
)
ckpt_io.load_model(new_model, str(model_ckpt_dir.name), strict=True)
ckpt_io.load_optimizer(new_optimizer, str(optimizer_ckpt_dir.name))
@@ -22,37 +22,26 @@ from tests.kit.model_zoo import model_zoo
# TODO (Baizhou): Add test cases for shard=False
@clear_cache_before_run()
@parameterize('shard', [True])
@parameterize('model_name', ['transformers_gpt'])
@parameterize('size_per_shard', [32])
@parameterize('test_config', [{
'tp_size': 4,
'pp_size': 1,
'precision': 'fp32',
}, {
'tp_size': 2,
'pp_size': 2,
'num_microbatches': 4,
'precision': 'fp16',
'initial_scale': 1
}, {
'tp_size': 2,
'pp_size': 1,
'zero_stage': 2,
'precision': 'fp16',
'initial_scale': 1
}, {
'tp_size': 1,
'pp_size': 2,
'num_microbatches': 4,
'zero_stage': 1,
'precision': 'fp16',
'initial_scale': 1
}])
@parameterize("shard", [True])
@parameterize("model_name", ["transformers_gpt"])
@parameterize("size_per_shard", [32])
@parameterize(
"test_config",
[
{
"tp_size": 4,
"pp_size": 1,
"precision": "fp32",
},
{"tp_size": 2, "pp_size": 2, "num_microbatches": 4, "precision": "fp16", "initial_scale": 1},
{"tp_size": 2, "pp_size": 1, "zero_stage": 2, "precision": "fp16", "initial_scale": 1},
{"tp_size": 1, "pp_size": 2, "num_microbatches": 4, "zero_stage": 1, "precision": "fp16", "initial_scale": 1},
],
)
def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict):
(model_fn, data_gen_fn, output_transform_fn, loss_fn,
_) = next(iter(model_zoo.get_sub_registry(model_name).values()))
(model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
iter(model_zoo.get_sub_registry(model_name).values())
)
criterion = loss_fn
plugin = HybridParallelPlugin(**test_config)
booster = Booster(plugin=plugin)

@@ -65,10 +54,10 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
def _preprocess_data(data):
if booster.plugin.stage_manager is not None:
for k, v in data.items():
if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__:
if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
new_shape = [1] * v.dim()
new_shape[0] = 4
data[k] = v.to('cuda').repeat(*new_shape)
data[k] = v.to("cuda").repeat(*new_shape)
return iter([data])
else:
return {k: v.cuda() for k, v in data.items()}

@@ -80,12 +69,9 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
data = data_gen_fn()
model.train()
if booster.plugin.stage_manager is not None:
booster.execute_pipeline(_preprocess_data(data),
model,
_criterion,
optimizer,
return_loss=True,
return_outputs=False)
booster.execute_pipeline(
_preprocess_data(data), model, _criterion, optimizer, return_loss=True, return_outputs=False
)
else:
output = model(**_preprocess_data(data))
loss = criterion(output)

@@ -94,7 +80,6 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
optimizer.step()
with shared_tempdir() as tempdir:
model_ckpt_path = f"{tempdir}/model"
optimizer_ckpt_path = f"{tempdir}/optimizer"
booster.save_model(model, model_ckpt_path, shard=shard, size_per_shard=size_per_shard)

@@ -115,18 +100,12 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
model.train()
new_model.train()
if booster.plugin.stage_manager is not None:
booster.execute_pipeline(_preprocess_data(data),
model,
_criterion,
optimizer,
return_loss=True,
return_outputs=False)
booster.execute_pipeline(_preprocess_data(data),
new_model,
_criterion,
new_optimizer,
return_loss=True,
return_outputs=False)
booster.execute_pipeline(
_preprocess_data(data), model, _criterion, optimizer, return_loss=True, return_outputs=False
)
booster.execute_pipeline(
_preprocess_data(data), new_model, _criterion, new_optimizer, return_loss=True, return_outputs=False
)
else:
old_model_loss = criterion(model(**_preprocess_data(data)))
optimizer.backward(old_model_loss)

@@ -141,10 +120,9 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
if stage_manager is None or stage_manager.is_first_stage():
assert_close_loose(model.unwrap().wte.weight.data, new_model.unwrap().wte.weight.data, atol=5e-3, rtol=5e-3)
assert_close_loose(model.unwrap().h[0].mlp.c_fc.weight.data,
new_model.unwrap().h[0].mlp.c_fc.weight.data,
atol=5e-3,
rtol=5e-3)
assert_close_loose(
model.unwrap().h[0].mlp.c_fc.weight.data, new_model.unwrap().h[0].mlp.c_fc.weight.data, atol=5e-3, rtol=5e-3
)
dist.barrier()
Randomizer.reset_index()

@@ -153,12 +131,12 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
def run_dist(rank, world_size, port):
config = {}
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
exam_state_dict()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [4])
@pytest.mark.parametrize("world_size", [4])
@rerun_if_address_is_in_use()
def test_hybrid_ckpIO(world_size):
spawn(run_dist, world_size)
@@ -20,9 +20,9 @@ from colossalai.zero import LowLevelZeroOptimizer
# stage 1 and 2 process the optimizer/mode the same way
# only test 2 is fine
@clear_cache_before_run()
@parameterize('stage', [2])
@parameterize('shard', [True, False])
@parameterize('offload', [False, True])
@parameterize("stage", [2])
@parameterize("shard", [True, False])
@parameterize("offload", [False, True])
def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=32, cpu_offload=offload)
booster = Booster(plugin=plugin)

@@ -31,7 +31,7 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
optimizer = HybridAdam((model.parameters()), lr=0.001)
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
x = torch.randn(1, 3, 224, 224, device='cuda')
x = torch.randn(1, 3, 224, 224, device="cuda")
output = model(x)
loss = criterion(output)
booster.backward(loss, optimizer)

@@ -60,15 +60,16 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
padding = new_optimizer._param_store.get_param_padding_size(working_param)
padded_param = torch.nn.functional.pad(working_param.data.view(-1), (0, padding))
working_shard = padded_param.chunk(dist.get_world_size())[dist.get_rank()]
assert torch.equal(working_shard,
master_param.data.view(-1).to(dtype=padded_param.dtype, device=padded_param.device))
assert torch.equal(
working_shard, master_param.data.view(-1).to(dtype=padded_param.dtype, device=padded_param.device)
)
booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False)
def run_dist(rank, world_size, port):
colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host='localhost')
colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost")
check_low_level_zero_checkpointIO()
torch.cuda.empty_cache()
@@ -1,5 +1,3 @@
import os
import pytest
import torch
import torch.distributed as dist

@@ -20,18 +18,19 @@ from tests.kit.model_zoo import model_zoo
@clear_cache_before_run()
@parameterize('model_name', ['transformers_gpt'])
@parameterize('plugin_type', ['ddp', 'zero', 'gemini'])
@parameterize("model_name", ["transformers_gpt"])
@parameterize("plugin_type", ["ddp", "zero", "gemini"])
def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32):
(model_fn, data_gen_fn, output_transform_fn, loss_fn,
_) = next(iter(model_zoo.get_sub_registry(model_name).values()))
(model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
iter(model_zoo.get_sub_registry(model_name).values())
)
criterion = loss_fn
if plugin_type == 'ddp':
if plugin_type == "ddp":
plugin = TorchDDPPlugin()
elif plugin_type == 'zero':
elif plugin_type == "zero":
plugin = LowLevelZeroPlugin(stage=2, max_norm=1.0, initial_scale=32)
elif plugin_type == 'gemini':
elif plugin_type == "gemini":
plugin = GeminiPlugin(precision="fp16", initial_scale=32)
else:
raise ValueError(f"Plugin with type {plugin_type} is invalid, please check your argument.")

@@ -44,7 +43,7 @@ def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
data = data_gen_fn()
data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}
data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
output = model(**data)
loss = criterion(output)

@@ -52,7 +51,6 @@ def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per
optimizer.step()
with shared_tempdir() as tempdir:
model_ckpt_path = f"{tempdir}/model"
booster.save_model(model, model_ckpt_path, shard=shard, size_per_shard=size_per_shard)
dist.barrier()

@@ -62,9 +60,10 @@ def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per
new_optimizer = HybridAdam(new_model.parameters(), lr=0.001)
new_model, new_optimizer, criterion, _, _ = booster.boost(new_model, new_optimizer, criterion)
if plugin_type == 'gemini':
check_state_dict_equal(model.unwrap().state_dict(only_rank_0=False),
new_model.unwrap().state_dict(only_rank_0=False), False)
if plugin_type == "gemini":
check_state_dict_equal(
model.unwrap().state_dict(only_rank_0=False), new_model.unwrap().state_dict(only_rank_0=False), False
)
else:
check_state_dict_equal(model.unwrap().state_dict(), new_model.unwrap().state_dict(), False)
dist.barrier()

@@ -72,12 +71,12 @@ def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per
def run_dist(rank, world_size, port):
config = {}
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
exam_from_pretrained()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@pytest.mark.parametrize("world_size", [2])
@rerun_if_address_is_in_use()
def test_huggingface_compatibility(world_size):
spawn(run_dist, world_size)
@@ -12,8 +12,8 @@ from colossalai.interface import OptimizerWrapper
from colossalai.testing import check_state_dict_equal, parameterize, rerun_if_address_is_in_use, spawn
@parameterize('shard', [True, False])
@parameterize('size_per_shard', [16, 128])
@parameterize("shard", [True, False])
@parameterize("size_per_shard", [16, 128])
def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
plugin = TorchDDPPlugin()
booster = Booster(plugin=plugin)

@@ -27,7 +27,7 @@ def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
assert isinstance(optimizer, OptimizerWrapper)
x = torch.randn(4, 3, 224, 224)
x = x.to('cuda')
x = x.to("cuda")
output = model(x)
loss = criterion(output)
booster.backward(loss, optimizer)

@@ -47,9 +47,9 @@ def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
new_model = resnet18()
new_optimizer = SGD((new_model.parameters()), lr=0.001)
new_scheduler = torch.optim.lr_scheduler.StepLR(new_optimizer, step_size=1, gamma=0.1)
new_model, new_optimizer, _, _, new_scheduler = booster.boost(new_model,
new_optimizer,
lr_scheduler=new_scheduler)
new_model, new_optimizer, _, _, new_scheduler = booster.boost(
new_model, new_optimizer, lr_scheduler=new_scheduler
)
booster.load_model(new_model, model_ckpt_path)
check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)

@@ -61,7 +61,7 @@ def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
def run_dist(rank, world_size, port):
colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host='localhost')
colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost")
check_torch_ddp_checkpointIO()
@@ -1,7 +1,6 @@
import pytest
import torch
from packaging import version
from torch import nn
from torch.optim import SGD
from torchvision.models import resnet18
from utils import shared_tempdir

@@ -9,11 +8,10 @@ from utils import shared_tempdir
import colossalai
from colossalai.booster import Booster
if version.parse(torch.__version__) >= version.parse('1.12.0'):
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
if version.parse(torch.__version__) >= version.parse("1.12.0"):
from colossalai.booster.plugin import TorchFSDPPlugin
from colossalai.testing import rerun_if_address_is_in_use, spawn, check_state_dict_equal
from colossalai.testing import rerun_if_address_is_in_use, spawn
def compare_nested_dict(dict1, dict2):

@@ -72,15 +70,16 @@ def check_torch_fsdp_ckpt():
booster.save_optimizer(optimizer, optim_ckpt_path, shard=False)
full_msd = fsdp_model.state_dict()
#full_osd = FSDP.full_optim_state_dict(fsdp_model, optimizer)
# full_osd = FSDP.full_optim_state_dict(fsdp_model, optimizer)
sharded_osd = optimizer.state_dict()
import copy
sharded_osd = copy.deepcopy(sharded_osd)
run_model()
full_msd_updated = fsdp_model.state_dict()
#full_osd_updated = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
# full_osd_updated = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
sharded_osd_updated = optimizer.state_dict()
assert not compare_nested_dict(sharded_osd, sharded_osd_updated)

@@ -92,9 +91,9 @@ def check_torch_fsdp_ckpt():
booster.load_optimizer(optimizer, optim_ckpt_path)
full_msd_restore = fsdp_model.state_dict()
#full_osd_restore = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
# full_osd_restore = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
sharded_osd_restore = optimizer.state_dict()
assert compare_nested_dict(sharded_osd, sharded_osd_restore)
assert compare_nested_dict(full_msd_restore, full_msd)
outputs_sec = fsdp_model(inputs)

@@ -103,11 +102,11 @@ def check_torch_fsdp_ckpt():
def run_dist(rank, world_size, port):
# init dist env
colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
check_torch_fsdp_ckpt()
@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason="requires torch1.12 or higher")
@pytest.mark.skipif(version.parse(torch.__version__) < version.parse("1.12.0"), reason="requires torch1.12 or higher")
@rerun_if_address_is_in_use()
def test_torch_fsdp_ckpt():
spawn(run_dist, 2)
@@ -15,7 +15,7 @@ def shared_tempdir() -> Iterator[str]:
try:
obj = [tempdir]
dist.broadcast_object_list(obj, src=0)
tempdir = obj[0] # use the same directory on all ranks
tempdir = obj[0]  # use the same directory on all ranks
yield tempdir
finally:
dist.barrier()