[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit

* [misc] run pre-commit

* [misc] remove useless configuration files

* [misc] ignore cuda for clang-format
This commit is contained in:
Hongxin Liu
2023-09-19 14:20:26 +08:00
committed by GitHub
parent 3c6b831c26
commit 079bf3cb26
1268 changed files with 50037 additions and 38444 deletions

View File

@@ -19,50 +19,30 @@ from colossalai.testing import (
from tests.kit.model_zoo import model_zoo
MODEL_PLACEMENT_CONFIGS = [
{
'placement_policy': 'static',
'shard_param_frac': 0.0
}, # zero2
{
'placement_policy': 'static',
'shard_param_frac': 1.0
}, # zero3
{
'placement_policy': 'static',
'shard_param_frac': 0.5
}, # zero3-half
{"placement_policy": "static", "shard_param_frac": 0.0}, # zero2
{"placement_policy": "static", "shard_param_frac": 1.0}, # zero3
{"placement_policy": "static", "shard_param_frac": 0.5}, # zero3-half
]
OPTIM_PLACEMENT_CONFIGS = [
{
'placement_policy': 'static',
'shard_param_frac': 0.0,
'offload_optim_frac': 0.0
}, # zero2
{
'placement_policy': 'static',
'shard_param_frac': 0.0,
'offload_optim_frac': 1.0
}, # zero2-offload
{
'placement_policy': 'static',
'shard_param_frac': 0.0,
'offload_optim_frac': 0.5
}, # zero2-offload-half
{"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 0.0}, # zero2
{"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 1.0}, # zero2-offload
{"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 0.5}, # zero2-offload-half
]
@clear_cache_before_run()
@parameterize('placement_config', MODEL_PLACEMENT_CONFIGS)
@parameterize('model_name', ['transformers_bert_for_sequence_classification'])
@parameterize('use_safetensors', [False, True])
@parameterize("placement_config", MODEL_PLACEMENT_CONFIGS)
@parameterize("model_name", ["transformers_bert_for_sequence_classification"])
@parameterize("use_safetensors", [False, True])
def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: bool):
from transformers import BertForSequenceClassification
(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
bert_model = model_fn()
with shared_tempdir() as tempdir:
pretrained_path = os.path.join(tempdir, 'pretrained')
pretrained_path = os.path.join(tempdir, "pretrained")
bert_model.config.save_pretrained(save_directory=pretrained_path)
plugin = GeminiPlugin(**placement_config)
@@ -70,24 +50,22 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
bert_model, _, _, _, _ = booster.boost(bert_model)
model_size = sum(p.numel() * p.element_size() for p in bert_model.parameters()) / 1024**2
booster.save_model(bert_model,
pretrained_path,
True,
True,
'', (model_size / 3),
use_safetensors=use_safetensors)
booster.save_model(
bert_model, pretrained_path, True, True, "", (model_size / 3), use_safetensors=use_safetensors
)
dist.barrier()
new_bert_model = BertForSequenceClassification.from_pretrained(pretrained_path)
check_state_dict_equal(bert_model.state_dict(only_rank_0=False, dtype=torch.float32),
new_bert_model.state_dict(), False)
check_state_dict_equal(
bert_model.state_dict(only_rank_0=False, dtype=torch.float32), new_bert_model.state_dict(), False
)
@clear_cache_before_run()
@parameterize('placement_config', OPTIM_PLACEMENT_CONFIGS)
@parameterize('shard', [False, True])
@parameterize('model_name', ['transformers_gpt'])
@parameterize('size_per_shard', [32])
@parameterize("placement_config", OPTIM_PLACEMENT_CONFIGS)
@parameterize("shard", [False, True])
@parameterize("model_name", ["transformers_gpt"])
@parameterize("size_per_shard", [32])
def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_shard: int):
(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
criterion = lambda x: x.mean()
@@ -102,7 +80,7 @@ def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_sha
new_model, new_optimizer, criterion, _, _ = booster.boost(new_model, new_optimizer, criterion)
data = data_gen_fn()
data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}
data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
output = model(**data)
output = output_transform_fn(output)
output_key = list(output.keys())[0]
@@ -123,13 +101,14 @@ def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_sha
check_state_dict_equal(model.state_dict(only_rank_0=False), new_model.state_dict(only_rank_0=False), False)
booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(only_rank_0=False),
False)
check_state_dict_equal(
optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(only_rank_0=False), False
)
# Check the new model/optimizer can successfully run.
data = data_gen_fn()
data = {
k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()
k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()
}
output = new_model(**data)
output = output_transform_fn(output)
@@ -143,13 +122,13 @@ def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_sha
def run_dist(rank, world_size, port):
config = {}
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
exam_state_dict()
exam_state_dict_with_origin()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@pytest.mark.parametrize("world_size", [2])
@rerun_if_address_is_in_use()
def test_gemini_ckpIO(world_size):
spawn(run_dist, world_size)

View File

@@ -19,10 +19,9 @@ from tests.kit.model_zoo import model_zoo
@clear_cache_before_run()
@parameterize('shard', [False, True])
@parameterize('model_name', ['transformers_gpt'])
@parameterize("shard", [False, True])
@parameterize("model_name", ["transformers_gpt"])
def exam_torch_load_from_gemini(shard: bool, model_name: str):
(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
criterion = lambda x: x.mean()
plugin = GeminiPlugin(precision="fp16", initial_scale=(2**14))
@@ -33,7 +32,7 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
data = data_gen_fn()
data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}
data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
output = model(**data)
output = output_transform_fn(output)
output_key = list(output.keys())[0]
@@ -60,8 +59,11 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
new_booster.load_model(new_model, model_ckpt_path, strict=True)
# Add prefix to get aligned with pytorch parameter names.
check_state_dict_equal(model.state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),
new_model.state_dict(), False)
check_state_dict_equal(
model.state_dict(only_rank_0=False, prefix="module.module.", dtype=torch.float32),
new_model.state_dict(),
False,
)
new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(), False)
@@ -69,7 +71,7 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
# Check the new model/optimizer can successfully run.
data = data_gen_fn()
data = {
k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()
k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()
}
output = new_model(**data)
output = output_transform_fn(output)
@@ -82,10 +84,9 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
@clear_cache_before_run()
@parameterize('shard', [False, True])
@parameterize('model_name', ['transformers_gpt'])
@parameterize("shard", [False, True])
@parameterize("model_name", ["transformers_gpt"])
def exam_gemini_load_from_torch(shard: bool, model_name: str):
(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
criterion = lambda x: x.mean()
plugin = TorchDDPPlugin()
@@ -96,7 +97,7 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
data = data_gen_fn()
data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}
data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
output = model(**data)
output = output_transform_fn(output)
output_key = list(output.keys())[0]
@@ -123,8 +124,11 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
new_booster.load_model(new_model, model_ckpt_path, strict=True)
# Add prefix to get aligned with pytorch parameter names.
check_state_dict_equal(new_model.state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),
model.state_dict(), False)
check_state_dict_equal(
new_model.state_dict(only_rank_0=False, prefix="module.module.", dtype=torch.float32),
model.state_dict(),
False,
)
new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
old_state_dict = optimizer.state_dict()
@@ -132,18 +136,19 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
# Comparison of param_groups needs special care here,
# since not all hyperparameters in Adam are used by HybridAdam
hyperparameters_to_examine = ['params', 'lr', 'betas', 'eps', 'weight_decay']
for old_group, new_group in zip(old_state_dict['param_groups'], new_state_dict['param_groups']):
hyperparameters_to_examine = ["params", "lr", "betas", "eps", "weight_decay"]
for old_group, new_group in zip(old_state_dict["param_groups"], new_state_dict["param_groups"]):
for k in hyperparameters_to_examine:
assert k in old_group and k in new_group, \
f"Old group's keys: {list(old_group.keys())}, New group's keys: {list(new_group.keys())}"
assert (
k in old_group and k in new_group
), f"Old group's keys: {list(old_group.keys())}, New group's keys: {list(new_group.keys())}"
assert old_group[k] == new_group[k]
check_state_dict_equal(old_state_dict['state'], new_state_dict['state'], False)
check_state_dict_equal(old_state_dict["state"], new_state_dict["state"], False)
# Check the new model/optimizer can successfully run.
data = data_gen_fn()
data = {
k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()
k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()
}
output = new_model(**data)
output = output_transform_fn(output)
@@ -157,13 +162,13 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
def run_dist(rank, world_size, port):
config = {}
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
exam_torch_load_from_gemini()
exam_gemini_load_from_torch()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@pytest.mark.parametrize("world_size", [2])
@rerun_if_address_is_in_use()
def test_gemini_ckpIO(world_size):
spawn(run_dist, world_size)

View File

@@ -5,7 +5,6 @@ import torch
from torch.optim import Adam
from torchvision.models import resnet18
from colossalai.booster.plugin.gemini_plugin import GeminiCheckpointIO
from colossalai.checkpoint_io import GeneralCheckpointIO
from colossalai.testing import check_state_dict_equal, clear_cache_before_run, parameterize
@@ -18,7 +17,7 @@ from colossalai.testing import check_state_dict_equal, clear_cache_before_run, p
@clear_cache_before_run()
@parameterize('use_safetensors', [True, False])
@parameterize("use_safetensors", [True, False])
def test_unsharded_checkpoint(use_safetensors: bool):
# create a model and optimizer
model = resnet18()
@@ -59,7 +58,7 @@ def test_unsharded_checkpoint(use_safetensors: bool):
check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
@pytest.mark.parametrize('use_safetensors', [True, False])
@pytest.mark.parametrize("use_safetensors", [True, False])
def test_sharded_model_checkpoint(use_safetensors: bool):
# create a model and optimizer
model = resnet18()
@@ -75,11 +74,9 @@ def test_sharded_model_checkpoint(use_safetensors: bool):
# create a temp file for checkpoint
if use_safetensors:
suffix = ".safetensors"
SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
pass
else:
suffix = ".bin"
WEIGHTS_INDEX_NAME = "model.bin.index.json"
pass
model_ckpt_dir = tempfile.TemporaryDirectory()
optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile()
@@ -103,7 +100,6 @@ def test_sharded_model_checkpoint(use_safetensors: bool):
def test_sharded_optimizer_checkpoint():
# create a model and optimizer
model = resnet18()
optimizer = Adam(model.parameters(), lr=0.001)
@@ -162,16 +158,11 @@ def test_sharded_optimizer_checkpoint():
def test_sharded_optimizer_multiple_param_groups():
# create a model and optimizer
model = resnet18()
optimizer = Adam([{
'params': model.layer1.parameters()
}, {
'params': model.layer2.parameters(),
'lr': 0.002
}],
lr=0.001)
optimizer = Adam(
[{"params": model.layer1.parameters()}, {"params": model.layer2.parameters(), "lr": 0.002}], lr=0.001
)
# create test data sample
x = torch.randn(1, 3, 224, 224)
@@ -194,13 +185,9 @@ def test_sharded_optimizer_multiple_param_groups():
# create new model
new_model = resnet18()
new_optimizer = Adam([{
'params': new_model.layer1.parameters()
}, {
'params': new_model.layer2.parameters(),
'lr': 0.002
}],
lr=0.001)
new_optimizer = Adam(
[{"params": new_model.layer1.parameters()}, {"params": new_model.layer2.parameters(), "lr": 0.002}], lr=0.001
)
ckpt_io.load_model(new_model, str(model_ckpt_dir.name), strict=True)
ckpt_io.load_optimizer(new_optimizer, str(optimizer_ckpt_dir.name))

View File

@@ -22,37 +22,26 @@ from tests.kit.model_zoo import model_zoo
# TODO (Baizhou): Add test cases for shard=False
@clear_cache_before_run()
@parameterize('shard', [True])
@parameterize('model_name', ['transformers_gpt'])
@parameterize('size_per_shard', [32])
@parameterize('test_config', [{
'tp_size': 4,
'pp_size': 1,
'precision': 'fp32',
}, {
'tp_size': 2,
'pp_size': 2,
'num_microbatches': 4,
'precision': 'fp16',
'initial_scale': 1
}, {
'tp_size': 2,
'pp_size': 1,
'zero_stage': 2,
'precision': 'fp16',
'initial_scale': 1
}, {
'tp_size': 1,
'pp_size': 2,
'num_microbatches': 4,
'zero_stage': 1,
'precision': 'fp16',
'initial_scale': 1
}])
@parameterize("shard", [True])
@parameterize("model_name", ["transformers_gpt"])
@parameterize("size_per_shard", [32])
@parameterize(
"test_config",
[
{
"tp_size": 4,
"pp_size": 1,
"precision": "fp32",
},
{"tp_size": 2, "pp_size": 2, "num_microbatches": 4, "precision": "fp16", "initial_scale": 1},
{"tp_size": 2, "pp_size": 1, "zero_stage": 2, "precision": "fp16", "initial_scale": 1},
{"tp_size": 1, "pp_size": 2, "num_microbatches": 4, "zero_stage": 1, "precision": "fp16", "initial_scale": 1},
],
)
def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict):
(model_fn, data_gen_fn, output_transform_fn, loss_fn,
_) = next(iter(model_zoo.get_sub_registry(model_name).values()))
(model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
iter(model_zoo.get_sub_registry(model_name).values())
)
criterion = loss_fn
plugin = HybridParallelPlugin(**test_config)
booster = Booster(plugin=plugin)
@@ -65,10 +54,10 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
def _preprocess_data(data):
if booster.plugin.stage_manager is not None:
for k, v in data.items():
if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__:
if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
new_shape = [1] * v.dim()
new_shape[0] = 4
data[k] = v.to('cuda').repeat(*new_shape)
data[k] = v.to("cuda").repeat(*new_shape)
return iter([data])
else:
return {k: v.cuda() for k, v in data.items()}
@@ -80,12 +69,9 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
data = data_gen_fn()
model.train()
if booster.plugin.stage_manager is not None:
booster.execute_pipeline(_preprocess_data(data),
model,
_criterion,
optimizer,
return_loss=True,
return_outputs=False)
booster.execute_pipeline(
_preprocess_data(data), model, _criterion, optimizer, return_loss=True, return_outputs=False
)
else:
output = model(**_preprocess_data(data))
loss = criterion(output)
@@ -94,7 +80,6 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
optimizer.step()
with shared_tempdir() as tempdir:
model_ckpt_path = f"{tempdir}/model"
optimizer_ckpt_path = f"{tempdir}/optimizer"
booster.save_model(model, model_ckpt_path, shard=shard, size_per_shard=size_per_shard)
@@ -115,18 +100,12 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
model.train()
new_model.train()
if booster.plugin.stage_manager is not None:
booster.execute_pipeline(_preprocess_data(data),
model,
_criterion,
optimizer,
return_loss=True,
return_outputs=False)
booster.execute_pipeline(_preprocess_data(data),
new_model,
_criterion,
new_optimizer,
return_loss=True,
return_outputs=False)
booster.execute_pipeline(
_preprocess_data(data), model, _criterion, optimizer, return_loss=True, return_outputs=False
)
booster.execute_pipeline(
_preprocess_data(data), new_model, _criterion, new_optimizer, return_loss=True, return_outputs=False
)
else:
old_model_loss = criterion(model(**_preprocess_data(data)))
optimizer.backward(old_model_loss)
@@ -141,10 +120,9 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
if stage_manager is None or stage_manager.is_first_stage():
assert_close_loose(model.unwrap().wte.weight.data, new_model.unwrap().wte.weight.data, atol=5e-3, rtol=5e-3)
assert_close_loose(model.unwrap().h[0].mlp.c_fc.weight.data,
new_model.unwrap().h[0].mlp.c_fc.weight.data,
atol=5e-3,
rtol=5e-3)
assert_close_loose(
model.unwrap().h[0].mlp.c_fc.weight.data, new_model.unwrap().h[0].mlp.c_fc.weight.data, atol=5e-3, rtol=5e-3
)
dist.barrier()
Randomizer.reset_index()
@@ -153,12 +131,12 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
def run_dist(rank, world_size, port):
config = {}
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
exam_state_dict()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [4])
@pytest.mark.parametrize("world_size", [4])
@rerun_if_address_is_in_use()
def test_hybrid_ckpIO(world_size):
spawn(run_dist, world_size)

View File

@@ -20,9 +20,9 @@ from colossalai.zero import LowLevelZeroOptimizer
# stage 1 and 2 process the optimizer/model the same way,
# so testing only stage 2 is sufficient
@clear_cache_before_run()
@parameterize('stage', [2])
@parameterize('shard', [True, False])
@parameterize('offload', [False, True])
@parameterize("stage", [2])
@parameterize("shard", [True, False])
@parameterize("offload", [False, True])
def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=32, cpu_offload=offload)
booster = Booster(plugin=plugin)
@@ -31,7 +31,7 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
optimizer = HybridAdam((model.parameters()), lr=0.001)
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
x = torch.randn(1, 3, 224, 224, device='cuda')
x = torch.randn(1, 3, 224, 224, device="cuda")
output = model(x)
loss = criterion(output)
booster.backward(loss, optimizer)
@@ -60,15 +60,16 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
padding = new_optimizer._param_store.get_param_padding_size(working_param)
padded_param = torch.nn.functional.pad(working_param.data.view(-1), (0, padding))
working_shard = padded_param.chunk(dist.get_world_size())[dist.get_rank()]
assert torch.equal(working_shard,
master_param.data.view(-1).to(dtype=padded_param.dtype, device=padded_param.device))
assert torch.equal(
working_shard, master_param.data.view(-1).to(dtype=padded_param.dtype, device=padded_param.device)
)
booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False)
def run_dist(rank, world_size, port):
colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host='localhost')
colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost")
check_low_level_zero_checkpointIO()
torch.cuda.empty_cache()

View File

@@ -1,5 +1,3 @@
import os
import pytest
import torch
import torch.distributed as dist
@@ -20,18 +18,19 @@ from tests.kit.model_zoo import model_zoo
@clear_cache_before_run()
@parameterize('model_name', ['transformers_gpt'])
@parameterize('plugin_type', ['ddp', 'zero', 'gemini'])
@parameterize("model_name", ["transformers_gpt"])
@parameterize("plugin_type", ["ddp", "zero", "gemini"])
def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32):
(model_fn, data_gen_fn, output_transform_fn, loss_fn,
_) = next(iter(model_zoo.get_sub_registry(model_name).values()))
(model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
iter(model_zoo.get_sub_registry(model_name).values())
)
criterion = loss_fn
if plugin_type == 'ddp':
if plugin_type == "ddp":
plugin = TorchDDPPlugin()
elif plugin_type == 'zero':
elif plugin_type == "zero":
plugin = LowLevelZeroPlugin(stage=2, max_norm=1.0, initial_scale=32)
elif plugin_type == 'gemini':
elif plugin_type == "gemini":
plugin = GeminiPlugin(precision="fp16", initial_scale=32)
else:
raise ValueError(f"Plugin with type {plugin_type} is invalid, please check your argument.")
@@ -44,7 +43,7 @@ def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
data = data_gen_fn()
data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}
data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
output = model(**data)
loss = criterion(output)
@@ -52,7 +51,6 @@ def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per
optimizer.step()
with shared_tempdir() as tempdir:
model_ckpt_path = f"{tempdir}/model"
booster.save_model(model, model_ckpt_path, shard=shard, size_per_shard=size_per_shard)
dist.barrier()
@@ -62,9 +60,10 @@ def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per
new_optimizer = HybridAdam(new_model.parameters(), lr=0.001)
new_model, new_optimizer, criterion, _, _ = booster.boost(new_model, new_optimizer, criterion)
if plugin_type == 'gemini':
check_state_dict_equal(model.unwrap().state_dict(only_rank_0=False),
new_model.unwrap().state_dict(only_rank_0=False), False)
if plugin_type == "gemini":
check_state_dict_equal(
model.unwrap().state_dict(only_rank_0=False), new_model.unwrap().state_dict(only_rank_0=False), False
)
else:
check_state_dict_equal(model.unwrap().state_dict(), new_model.unwrap().state_dict(), False)
dist.barrier()
@@ -72,12 +71,12 @@ def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per
def run_dist(rank, world_size, port):
config = {}
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
exam_from_pretrained()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@pytest.mark.parametrize("world_size", [2])
@rerun_if_address_is_in_use()
def test_huggingface_compatibility(world_size):
spawn(run_dist, world_size)

View File

@@ -12,8 +12,8 @@ from colossalai.interface import OptimizerWrapper
from colossalai.testing import check_state_dict_equal, parameterize, rerun_if_address_is_in_use, spawn
@parameterize('shard', [True, False])
@parameterize('size_per_shard', [16, 128])
@parameterize("shard", [True, False])
@parameterize("size_per_shard", [16, 128])
def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
plugin = TorchDDPPlugin()
booster = Booster(plugin=plugin)
@@ -27,7 +27,7 @@ def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
assert isinstance(optimizer, OptimizerWrapper)
x = torch.randn(4, 3, 224, 224)
x = x.to('cuda')
x = x.to("cuda")
output = model(x)
loss = criterion(output)
booster.backward(loss, optimizer)
@@ -47,9 +47,9 @@ def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
new_model = resnet18()
new_optimizer = SGD((new_model.parameters()), lr=0.001)
new_scheduler = torch.optim.lr_scheduler.StepLR(new_optimizer, step_size=1, gamma=0.1)
new_model, new_optimizer, _, _, new_scheduler = booster.boost(new_model,
new_optimizer,
lr_scheduler=new_scheduler)
new_model, new_optimizer, _, _, new_scheduler = booster.boost(
new_model, new_optimizer, lr_scheduler=new_scheduler
)
booster.load_model(new_model, model_ckpt_path)
check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
@@ -61,7 +61,7 @@ def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
def run_dist(rank, world_size, port):
colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host='localhost')
colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost")
check_torch_ddp_checkpointIO()

View File

@@ -1,7 +1,6 @@
import pytest
import torch
from packaging import version
from torch import nn
from torch.optim import SGD
from torchvision.models import resnet18
from utils import shared_tempdir
@@ -9,11 +8,10 @@ from utils import shared_tempdir
import colossalai
from colossalai.booster import Booster
if version.parse(torch.__version__) >= version.parse('1.12.0'):
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
if version.parse(torch.__version__) >= version.parse("1.12.0"):
from colossalai.booster.plugin import TorchFSDPPlugin
from colossalai.testing import rerun_if_address_is_in_use, spawn, check_state_dict_equal
from colossalai.testing import rerun_if_address_is_in_use, spawn
def compare_nested_dict(dict1, dict2):
@@ -72,15 +70,16 @@ def check_torch_fsdp_ckpt():
booster.save_optimizer(optimizer, optim_ckpt_path, shard=False)
full_msd = fsdp_model.state_dict()
#full_osd = FSDP.full_optim_state_dict(fsdp_model, optimizer)
# full_osd = FSDP.full_optim_state_dict(fsdp_model, optimizer)
sharded_osd = optimizer.state_dict()
import copy
sharded_osd = copy.deepcopy(sharded_osd)
run_model()
full_msd_updated = fsdp_model.state_dict()
#full_osd_updated = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
# full_osd_updated = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
sharded_osd_updated = optimizer.state_dict()
assert not compare_nested_dict(sharded_osd, sharded_osd_updated)
@@ -92,9 +91,9 @@ def check_torch_fsdp_ckpt():
booster.load_optimizer(optimizer, optim_ckpt_path)
full_msd_restore = fsdp_model.state_dict()
#full_osd_restore = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
# full_osd_restore = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
sharded_osd_restore = optimizer.state_dict()
assert compare_nested_dict(sharded_osd, sharded_osd_restore)
assert compare_nested_dict(full_msd_restore, full_msd)
outputs_sec = fsdp_model(inputs)
@@ -103,11 +102,11 @@ def check_torch_fsdp_ckpt():
def run_dist(rank, world_size, port):
# init dist env
colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
check_torch_fsdp_ckpt()
@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.12.0'), reason="requires torch1.12 or higher")
@pytest.mark.skipif(version.parse(torch.__version__) < version.parse("1.12.0"), reason="requires torch1.12 or higher")
@rerun_if_address_is_in_use()
def test_torch_fsdp_ckpt():
spawn(run_dist, 2)

View File

@@ -15,7 +15,7 @@ def shared_tempdir() -> Iterator[str]:
try:
obj = [tempdir]
dist.broadcast_object_list(obj, src=0)
tempdir = obj[0] # use the same directory on all ranks
tempdir = obj[0] # use the same directory on all ranks
yield tempdir
finally:
dist.barrier()