[test] remove useless tests (#4359)

* [test] remove legacy zero test

* [test] remove lazy distribute test

* [test] remove outdated checkpoint io
Hongxin Liu
2023-08-01 18:52:14 +08:00
committed by GitHub
parent 16c0acc01b
commit 16bf4c0221
31 changed files with 0 additions and 3355 deletions

View File

@@ -1,140 +0,0 @@
from functools import partial

import torch
import torch.distributed as dist

from colossalai.logging import get_dist_logger
from colossalai.utils import checkpoint
from colossalai.zero.legacy.shard_utils import TensorShardStrategy
from colossalai.zero.legacy.sharded_model import ShardedModelV2

LOGGER = get_dist_logger('zero_test')

MP_PARALLEL_CONFIG = dict(fp16=dict(mode=None,), parallel=dict(pipeline=dict(size=1), tensor=dict(size=2, mode=None)))

_ZERO_MODEL_CONFIG = dict(reduce_scatter_bucket_size_mb=25,
                          fp32_reduce_scatter=False,
                          tensor_placement_policy='cuda',
                          gradient_predivide_factor=1.0,
                          shard_strategy=TensorShardStrategy(),
                          reuse_fp16_shard=False)

_ZERO_OPTIMIZER_CONFIG = dict(initial_scale=2**5,
                              min_scale=1,
                              growth_factor=2,
                              backoff_factor=0.5,
                              growth_interval=1000,
                              hysteresis=2,
                              max_scale=2**32)

ZERO_PARALLEL_CONFIG = dict(fp16=dict(mode=None,),
                            zero=dict(
                                model_config=_ZERO_MODEL_CONFIG,
                                optimizer_config=_ZERO_OPTIMIZER_CONFIG,
                            ),
                            parallel=dict(pipeline=dict(size=1), tensor=dict(size=1, mode=None)))

CONFIG = dict(fp16=dict(mode=None,),
              zero=dict(level=3,
                        verbose=False,
                        offload_optimizer_config=dict(device='cpu', pin_memory=True, buffer_count=5, fast_init=False),
                        offload_param_config=dict(device='cpu',
                                                  pin_memory=True,
                                                  buffer_count=5,
                                                  buffer_size=1e8,
                                                  max_in_cpu=1e9)),
              parallel=dict(pipeline=dict(size=1), tensor=dict(size=1, mode=None)))


def run_fwd_bwd(model, data, label, criterion, enable_autocast=False):
    model.train()
    with torch.cuda.amp.autocast(enabled=enable_autocast):
        if criterion:
            y = model(data)
            loss = criterion(y, label)
        else:
            loss = model(data, label)
        loss = loss.float()
    if isinstance(model, ShardedModelV2):
        model.backward(loss)
    else:
        loss.backward()


def checkpoint_wrapper(module, enable=True):
    if enable:
        module.forward = partial(checkpoint, module.forward)
    return module


def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool:
    if loose:
        return torch.allclose(tensor_a, tensor_b, atol=1e-2, rtol=1e-3)
    return torch.allclose(tensor_a, tensor_b)


def check_grads(model, zero_model, loose=False):
    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
        zero_grad = zero_p.grad.clone().to(p.device)
        grad = p.grad.float()
        assert grad.dtype == zero_grad.dtype
        assert allclose(grad, zero_grad, loose=loose)


def check_params(model, zero_model, loose=False):
    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
        zero_p = zero_p.clone().to(p.device)
        # assert p.dtype == zero_p.dtype
        assert allclose(p.float(), zero_p.float(), loose=loose), f"diff {p.float() - zero_p.float()}"


def check_grads_padding(model, zero_model, loose=False):
    rank = dist.get_rank()
    for (name, p), (zero_name, zero_p) in zip(model.named_parameters(), zero_model.named_parameters()):
        # zero_grad = zero_p.grad.clone().to(p.device)
        if zero_p.colo_attr.is_replicated:
            zero_grad = zero_p.colo_attr.grad_payload.clone().to(p.device)
            chunks = torch.flatten(p.grad).chunk(dist.get_world_size())
            if rank >= len(chunks):
                continue
            grad = chunks[rank].float()
            if zero_grad.size(0) > grad.size(0):
                zero_grad = zero_grad[:grad.size(0)]
        else:
            zero_grad = zero_p.colo_attr.grad_payload
            grad = p.grad.to(zero_grad.dtype)
        assert grad.dtype == zero_grad.dtype
        assert allclose(grad, zero_grad, loose=loose), f'diff: {grad - zero_grad}'


def check_params_padding(model, zero_model, loose=False):
    rank = dist.get_rank()
    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
        zero_p = zero_p.clone().to(p.device)
        chunks = torch.flatten(p).chunk(dist.get_world_size())
        if rank >= len(chunks):
            continue
        p = chunks[rank]
        if zero_p.size(0) > p.size(0):
            zero_p = zero_p[:p.size(0)]
        assert p.dtype == zero_p.dtype
        assert allclose(p, zero_p, loose=loose)


def check_sharded_model_params(model, zero_model, loose=False, reuse_fp16_shard=False):
    rank = dist.get_rank()
    for (name, p), (zero_name, zero_p) in zip(model.named_parameters(), zero_model.named_parameters()):
        if zero_p.colo_attr.param_is_sharded:
            zero_p = zero_p.colo_attr.data_payload.to(p.device).float()
            chunks = torch.flatten(p).chunk(dist.get_world_size())
            if rank >= len(chunks):
                continue
            p = chunks[rank].float()
            if zero_p.size(0) > p.size(0):
                zero_p = zero_p[:p.size(0)]
        else:
            zero_p = zero_p.colo_attr.data_payload.to(p.device)
        assert p.dtype == zero_p.dtype, "Parameter `{}`:\n{} vs {}".format(name, p.dtype, zero_p.dtype)
        assert allclose(p, zero_p, loose=loose), f'{p} vs {zero_p}'
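
The padding checks above compare a full tensor on this rank against its ZeRO shard: flatten, chunk across the world size, then truncate the shard's zero padding before comparing. A minimal standalone sketch of that logic, with plain ints standing in for the torch.distributed rank/world-size calls:

import torch

def shard_matches(full: torch.Tensor, shard: torch.Tensor, rank: int, world_size: int) -> bool:
    # split the flattened tensor into one chunk per rank; the trailing chunks
    # may be shorter (or missing) when numel % world_size != 0
    chunks = torch.flatten(full).chunk(world_size)
    if rank >= len(chunks):
        return True    # this rank holds only padding
    local = chunks[rank]
    # a ZeRO shard is padded to a uniform length, so truncate before comparing
    return torch.allclose(local, shard[:local.size(0)], atol=1e-2, rtol=1e-3)

full = torch.randn(10)
world_size = 4
# build padded shards the way a shard strategy would: pad 10 elements up to 12
padded = torch.cat([torch.flatten(full), torch.zeros(2)]).chunk(world_size)
assert all(shard_matches(full, padded[r], r, world_size) for r in range(world_size))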

View File

@@ -1,67 +0,0 @@
import pytest
import torch
from common import CONFIG
from test_sharded_optim_v2 import _run_step

import colossalai
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.utils.cuda import get_current_device
from colossalai.zero.legacy.init_ctx import ZeroInitContext
from colossalai.zero.legacy.shard_utils import BucketTensorShardStrategy
from colossalai.zero.legacy.sharded_model import ShardedModelV2
from colossalai.zero.legacy.sharded_optim import ShardedOptimizerV2
from colossalai.zero.low_level._utils import has_inf_or_nan
from tests.components_to_test.registry import non_distributed_component_funcs


@parameterize("cpu_offload", [True, False])
@parameterize("shard_strategy_class", [BucketTensorShardStrategy])
@parameterize("gpu_margin_mem_ratio", [0.0, 0.7])
def _run_test_found_inf(cpu_offload, shard_strategy_class, gpu_margin_mem_ratio):
    test_models = ['repeated_computed_layers']
    shard_strategy = shard_strategy_class()
    for model_name in test_models:
        get_components_func = non_distributed_component_funcs.get_callable(model_name)
        model_builder, train_dataloader, _, optimizer_class, criterion = get_components_func()

        with ZeroInitContext(target_device=torch.device('cpu:0') if cpu_offload else get_current_device(),
                             shard_strategy=shard_strategy,
                             shard_param=True):
            zero_model = model_builder(checkpoint=True)
        zero_model = ShardedModelV2(
            zero_model,
            shard_strategy,
            tensor_placement_policy='cpu' if cpu_offload else 'cuda',
            reuse_fp16_shard=True,
        )

        sharded_optim = HybridAdam(zero_model.parameters(), lr=1e-3)
        sharded_optim = ShardedOptimizerV2(zero_model, sharded_optim, gpu_margin_mem_ratio=gpu_margin_mem_ratio)

        for i, (data, label) in enumerate(train_dataloader):
            if i > 1:
                break
            assert zero_model.overflow_counter == 0
            data, label = data.cuda(), label.cuda()
            _run_step(zero_model, sharded_optim, data, label, criterion, False)
            for param in zero_model.parameters():
                assert not has_inf_or_nan(param.colo_attr.data_payload)


def _run_dist(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    _run_test_found_inf()


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_if_address_is_in_use()
def test_found_inf(world_size):
    spawn(_run_dist, world_size)


if __name__ == '__main__':
    test_found_inf(world_size=2)
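
has_inf_or_nan, used above to assert that no parameter overflowed, can be approximated in plain torch; this is an assumption about its behavior, not the library's exact implementation:

import torch

def has_inf_or_nan(t: torch.Tensor) -> bool:
    # isinf/isnan scan elementwise; any() reduces them to a single flag
    return bool(torch.isinf(t).any() or torch.isnan(t).any())

assert not has_inf_or_nan(torch.randn(4, 4))
assert has_inf_or_nan(torch.tensor([1.0, float('inf')]))
assert has_inf_or_nan(torch.tensor([float('nan')]))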

View File

@@ -1,75 +0,0 @@
import pytest
import torch

from colossalai.testing import clear_cache_before_run
from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState


@pytest.mark.dist
@clear_cache_before_run()
def test_gemini_manager():
    # reset the manager, in case any stale memory information is left over
    manager = StatefulTensor.GST_MGR
    manager.reset()

    # occupies 8 bytes
    st1 = StatefulTensor(torch.empty(2, 2, dtype=torch.float16, device='cuda'))
    # occupies 60 bytes
    st2 = StatefulTensor(torch.empty(3, 5, dtype=torch.float32, device='cpu'))
    # occupies 28 bytes
    t1 = torch.empty(7, device='cuda')
    # occupies 12 bytes
    t2 = torch.empty(3, device='cpu')
    st3 = StatefulTensor(t1, TensorState.HOLD_AFTER_FWD)
    st4 = StatefulTensor(None, TensorState.FREE)

    assert manager.total_number == 4
    assert manager.total_mem['cpu'] == 60
    assert manager.total_mem['cuda'] == 36
    assert manager.state_mem['cpu'][TensorState.HOLD] == 60
    assert manager.state_mem['cuda'][TensorState.HOLD] == 8
    assert manager.state_mem['cuda'][TensorState.HOLD_AFTER_FWD] == 28

    st4.payload_reset(t2)
    st3.payload_reset(t2)

    assert manager.total_number == 4
    assert manager.total_mem['cpu'] == 84
    assert manager.total_mem['cuda'] == 8
    assert manager.state_mem['cpu'][TensorState.HOLD] == 72
    assert manager.state_mem['cuda'][TensorState.HOLD] == 8
    assert manager.state_mem['cpu'][TensorState.HOLD_AFTER_FWD] == 12
    assert manager.state_mem['cuda'][TensorState.HOLD_AFTER_FWD] == 0

    st1.move_to(torch.device('cpu'))
    st2.move_to(torch.device('cpu'))
    st3.move_to(torch.device('cuda', 0))

    assert manager.total_number == 4
    assert manager.total_mem['cpu'] == 80
    assert manager.total_mem['cuda'] == 12
    assert manager.state_mem['cpu'][TensorState.HOLD] == 80
    assert manager.state_mem['cuda'][TensorState.HOLD] == 0
    assert manager.state_mem['cpu'][TensorState.HOLD_AFTER_FWD] == 0
    assert manager.state_mem['cuda'][TensorState.HOLD_AFTER_FWD] == 12

    st1.trans_state(TensorState.COMPUTE)
    st2.trans_state(TensorState.COMPUTE)
    st2.trans_state(TensorState.HOLD_AFTER_BWD)

    assert manager.total_number == 4
    assert manager.total_mem['cpu'] == 80
    assert manager.total_mem['cuda'] == 12
    assert manager.state_mem['cpu'][TensorState.HOLD] == 12
    assert manager.state_mem['cuda'][TensorState.HOLD] == 0
    assert manager.state_mem['cpu'][TensorState.HOLD_AFTER_FWD] == 0
    assert manager.state_mem['cuda'][TensorState.HOLD_AFTER_FWD] == 12
    assert manager.state_mem['cpu'][TensorState.HOLD_AFTER_BWD] == 60
    assert manager.state_mem['cuda'][TensorState.HOLD_AFTER_BWD] == 0
    assert manager.state_mem['cpu'][TensorState.COMPUTE] == 8
    assert manager.state_mem['cuda'][TensorState.COMPUTE] == 0


if __name__ == '__main__':
    test_gemini_manager()
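
The byte counts asserted above follow directly from numel times element size: the 2x2 fp16 tensor occupies 2*2*2 = 8 bytes, the 3x5 fp32 tensor 3*5*4 = 60 bytes, and so on. A quick standalone sketch of that accounting:

import torch

def occupancy(t: torch.Tensor) -> int:
    # bytes occupied by the payload: element count times bytes per element
    return t.numel() * t.element_size()

assert occupancy(torch.empty(2, 2, dtype=torch.float16)) == 8     # st1
assert occupancy(torch.empty(3, 5, dtype=torch.float32)) == 60    # st2
assert occupancy(torch.empty(7)) == 28                            # t1, fp32 by default
assert occupancy(torch.empty(3)) == 12                            # t2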

View File

@@ -1,73 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import pytest
import torch
from common import CONFIG

import colossalai
from colossalai.logging import get_dist_logger
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.utils.cuda import get_current_device
from colossalai.utils.memory import colo_device_memory_used
from colossalai.zero.gemini.memory_tracer.utils import colo_model_mem_usage
from colossalai.zero.legacy.init_ctx import ZeroInitContext
from colossalai.zero.legacy.shard_utils import BucketTensorShardStrategy, TensorShardStrategy
from tests.components_to_test.registry import non_distributed_component_funcs


@parameterize("init_device_type", ['cpu', 'cuda'])
@parameterize("shard_strategy_class", [TensorShardStrategy, BucketTensorShardStrategy])
def run_model_test(init_device_type, shard_strategy_class):
    logger = get_dist_logger("test_zero_init")

    for name, get_components_func in non_distributed_component_funcs._registry.items():
        # ZeroInitContext automatically casts parameters to fp16, and the beit model
        # uses tensor.erfinv_() to initialize its weights; since erfinv_() does not
        # support Half on CPU, the beit model is skipped here
        if name == 'beit':
            continue
        model_builder, _, _, _, _ = get_components_func()
        if init_device_type == 'cuda':
            init_device = get_current_device()
        elif init_device_type == 'cpu':
            init_device = torch.device("cpu")
        else:
            continue

        model_numel_tensor = torch.zeros(1, dtype=torch.int)
        with ZeroInitContext(target_device=init_device,
                             shard_strategy=shard_strategy_class(),
                             shard_param=True,
                             model_numel_tensor=model_numel_tensor):
            model = model_builder(checkpoint=True)

        for param in model.parameters():
            assert hasattr(param, 'colo_attr')
            assert param.colo_attr.sharded_data_tensor.dtype == torch.half
            assert param.colo_attr.sharded_data_tensor.is_sharded
            assert param.colo_attr.data_payload.device.type == init_device.type, \
                f'{param.colo_attr.data_payload.device.type} vs. {init_device.type}'

        cuda_mem_use, _ = colo_model_mem_usage(model)
        model_data_cuda_mem_MB = cuda_mem_use / 1e6
        logger.info(f"Exiting ZeRO Context.\nModel Data CUDA Memory {model_data_cuda_mem_MB} MB", ranks=[0])
        sys_cuda_mem_MB = colo_device_memory_used(get_current_device()) / 1e6
        logger.info(f"System CUDA Memory Usage {sys_cuda_mem_MB} MB", ranks=[0])
        logger.info(f"Model Number Parameter {model_numel_tensor.numpy()[0]/1e6} M", ranks=[0])


def run_dist(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    run_model_test()


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 4])
@rerun_if_address_is_in_use()
def test_zero_init_context(world_size):
    spawn(run_dist, world_size)


if __name__ == '__main__':
    test_zero_init_context(1)
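
model_numel_tensor above accumulates the total parameter count inside the init context. The same number can be obtained for any plain module with a one-liner; a small sketch on a throwaway model:

import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 5))
numel = sum(p.numel() for p in model.parameters())
# 10*20 + 20 (first Linear) + 20*5 + 5 (second Linear) = 325
assert numel == 325
print(f"Model Number Parameter {numel / 1e6} M")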

View File

@@ -1,82 +0,0 @@
import copy

import torch

from colossalai.testing import clear_cache_before_run
from colossalai.zero.legacy.gemini.paramhooks import BaseParamHookMgr
from tests.components_to_test.registry import non_distributed_component_funcs


def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool:
    if loose:
        return torch.allclose(tensor_a, tensor_b, atol=1e-3, rtol=1e-3)
    return torch.allclose(tensor_a, tensor_b)


def run_model(model, inputs, label, criterion, use_param_hook=False):
    if use_param_hook:

        class HookWrapper:

            def __init__(self) -> None:
                self.hook_triggered_times = 0

            def wrapper_func(self):

                def hook(param, grad):
                    self.hook_triggered_times += 1
                    return grad

                return hook

        hookwrapper = HookWrapper()
        param_list = [p for p in model.parameters()]
        hook_mgr = BaseParamHookMgr(param_list)
        hook_mgr.register_backward_hooks(hookwrapper.wrapper_func())

    model.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast():
        if criterion:
            y = model(inputs)
            loss = criterion(y, label)
        else:
            loss = model(inputs, label)
        loss = loss.float()
    loss.backward()

    if use_param_hook:
        hook_mgr.remove_hooks()
        return hookwrapper.hook_triggered_times


@clear_cache_before_run()
def test_base_param_hook():
    test_models = ['repeated_computed_layers', 'resnet18', 'hanging_param_model', 'inline_op_model']
    # test_models = ['bert']

    for model_name in test_models:
        get_components_func = non_distributed_component_funcs.get_callable(model_name)
        model_builder, train_dataloader, _, _, criterion = get_components_func()

        torch.manual_seed(0)
        model = model_builder(checkpoint=True).cuda()
        model.train()

        for i, (inputs, label) in enumerate(train_dataloader):
            if i > 0:
                break
            model_copy = copy.deepcopy(model)

            run_model(model, inputs.cuda(), label.cuda(), criterion, False)
            ret2 = run_model(model_copy, inputs.cuda(), label.cuda(), criterion, True)

            # make sure the param hook has only been fired once per parameter,
            # even in the presence of parameter sharing
            assert ret2 == len(list(model.parameters()))

            for p, p_copy in zip(model.parameters(), model_copy.parameters()):
                assert allclose(p.grad, p_copy.grad), f"{p.grad} vs {p_copy.grad}"


if __name__ == '__main__':
    test_base_param_hook()
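
BaseParamHookMgr is a legacy wrapper; the counting trick itself can be reproduced with vanilla Tensor.register_hook, which fires once per parameter whose gradient is computed. A standalone CPU sketch:

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
fired = {"count": 0}

def make_hook():
    def hook(grad):
        # count the firing and pass the gradient through unchanged
        fired["count"] += 1
        return grad
    return hook

handles = [p.register_hook(make_hook()) for p in model.parameters()]

loss = model(torch.randn(8, 4)).sum()
loss.backward()

# one firing per parameter, exactly as the assertion in the removed test expects
assert fired["count"] == len(list(model.parameters()))
for h in handles:
    h.remove()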

View File

@@ -1,64 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import pytest
import torch
from common import CONFIG, check_grads_padding, run_fwd_bwd
from torch.nn.parallel import DistributedDataParallel as DDP

import colossalai
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.zero.legacy.init_ctx import ZeroInitContext
from colossalai.zero.legacy.shard_utils import BucketTensorShardStrategy
from colossalai.zero.legacy.sharded_model import ShardedModelV2
from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_fp16
from colossalai.zero.legacy.sharded_model.utils import col_model_deepcopy
from tests.components_to_test.registry import non_distributed_component_funcs


@parameterize("enable_autocast", [True])
@parameterize("shard_strategy_class", [BucketTensorShardStrategy])
def run_model_test(enable_autocast, shard_strategy_class):
    test_models = ['repeated_computed_layers', 'resnet18', 'bert', 'hanging_param_model']
    shard_strategy = shard_strategy_class()
    for model_name in test_models:
        get_components_func = non_distributed_component_funcs.get_callable(model_name)
        model_builder, train_dataloader, _, _, criterion = get_components_func()

        with ZeroInitContext(target_device=torch.device('cuda', torch.cuda.current_device()),
                             shard_strategy=shard_strategy,
                             shard_param=True):
            zero_model = model_builder(checkpoint=True)
        zero_model = ShardedModelV2(zero_model, shard_strategy)

        model = model_builder(checkpoint=True).half()
        col_model_deepcopy(zero_model, model)
        model = model.cuda()
        model = DDP(model, device_ids=[torch.cuda.current_device()])

        for i, (data, label) in enumerate(train_dataloader):
            if i > 5:
                break
            data, label = cast_tensor_to_fp16(data).cuda(), label.cuda()
            run_fwd_bwd(model, data, label, criterion, enable_autocast)
            run_fwd_bwd(zero_model, data, label, criterion, enable_autocast)
            check_grads_padding(model, zero_model, loose=True)


def run_dist(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    run_model_test()


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_if_address_is_in_use()
def test_shard_model_v2(world_size):
    spawn(run_dist, world_size)


if __name__ == '__main__':
    test_shard_model_v2(world_size=2)

View File

@@ -1,91 +0,0 @@
from copy import deepcopy

import pytest
import torch
from common import CONFIG, allclose

import colossalai
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor
from colossalai.zero.legacy.shard_utils import BucketTensorShardStrategy, TensorShardStrategy
from colossalai.zero.legacy.sharded_param import ShardedTensor
from colossalai.zero.legacy.sharded_param.sharded_param import ShardedParamV2


@parameterize("shard_strategy_class", [TensorShardStrategy, BucketTensorShardStrategy])
def run_shard_tensor_with_strategy(shard_strategy_class, world_size):
    t = ShardedTensor(tensor=torch.randn(world_size * 2, 3))
    assert list(t.origin_shape) == [world_size * 2, 3]
    assert list(t.shape) == [world_size * 2, 3]

    shard_strategy = shard_strategy_class()

    # test the shard strategy
    shard_strategy.shard([t])
    assert list(t.shape) == [6], f"{list(t.shape)} vs 6"
    shard_strategy.gather([t])
    assert list(t.shape) == [world_size * 2, 3], f"{list(t.shape)} vs {[world_size * 2, 3]}"


def _run_shard_tensor(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    run_shard_tensor_with_strategy(world_size=world_size)


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_if_address_is_in_use()
def test_shard_tensor(world_size):
    spawn(_run_shard_tensor, world_size)


def _run_shard_param_v2(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')

    param = torch.nn.Parameter(torch.randn(2, 3))
    param_ref = deepcopy(param)
    sparam = ShardedParamV2(param=param)

    assert allclose(sparam.data_payload, param_ref.data)

    # test get_memory_usage
    sparam.saved_grad = StatefulTensor(torch.randn(2, 3))
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2, f"cpu_mem_use: {cpu_mem_use}"

    sparam.set_data_none()
    assert (param.data.numel() == 0)
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    # 4 is the size of the dummy tensor in param.data
    assert cpu_mem_use == 2 * 3 * 4 * 2

    sparam.saved_grad = StatefulTensor(torch.randn(2, 3))
    sparam.set_data_none()
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2
    assert cuda_mem_use == 0

    # append a grad to the torch param
    param.data = sparam.data_payload
    param.grad = torch.randn(2, 3)
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2 + 2 * 3 * 4, f"cpu_mem_use {cpu_mem_use}"
    assert cuda_mem_use == 0

    # reuse the torch grad for sparam
    sparam.saved_grad = StatefulTensor(param.grad)
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2
    assert cuda_mem_use == 0


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_if_address_is_in_use()
def test_shard_param_v2(world_size):
    spawn(_run_shard_param_v2, world_size)


if __name__ == '__main__':
    # test_shard_tensor(2)
    test_shard_param_v2(2)
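
The 2 * 3 * 4 * 2 constants above decode as numel (2 * 3) times the fp32 element size (4 bytes) times two payloads (data plus saved grad). A quick standalone check of that arithmetic:

import torch

def payload_bytes(t: torch.Tensor) -> int:
    return t.numel() * t.element_size()

data = torch.randn(2, 3)    # stands in for the param's data payload
grad = torch.randn(2, 3)    # stands in for its saved gradient
assert payload_bytes(data) + payload_bytes(grad) == 2 * 3 * 4 * 2    # 48 bytes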

View File

@@ -1,89 +0,0 @@
import pytest
import torch

import colossalai
from colossalai.nn.optimizer import HybridAdam
from colossalai.tensor import ProcessGroup
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.utils.cuda import get_current_device
from colossalai.zero.legacy.init_ctx import ZeroInitContext
from colossalai.zero.legacy.shard_utils import TensorShardStrategy
from colossalai.zero.legacy.sharded_model import ShardedModelV2
from colossalai.zero.legacy.sharded_optim import ShardedOptimizerV2
from tests.components_to_test.registry import non_distributed_component_funcs
from tests.test_tensor.common_utils import set_seed


def init_zero(model_builder, placement_policy):
    device = get_current_device() if placement_policy == 'cuda' else torch.device('cpu')
    shard_strategy = TensorShardStrategy()
    with ZeroInitContext(target_device=device, shard_strategy=shard_strategy, shard_param=True):
        model = model_builder()
    model = ShardedModelV2(
        model,
        shard_strategy,
        tensor_placement_policy=placement_policy,
        reuse_fp16_shard=True,
    )
    optim = HybridAdam(model.parameters(), lr=1e-3)
    optim = ShardedOptimizerV2(model, optim, initial_scale=32)
    return model, optim


def run_step(model, optim, criterion, data, label):
    optim.zero_grad()
    logits = model(data)
    loss = criterion(logits, label)
    optim.backward(loss)
    optim.step()


def check_state_dict_eq(state_dict, other):
    for p, state in state_dict['state'].items():
        other_state = other['state'][p]
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                assert torch.allclose(v, other_state[k], atol=1e-3), f'{v} vs {other_state[k]}'
            else:
                assert v == other_state[k]


@parameterize('placement_policy', ['cuda', 'cpu'])
def run_nested_model(placement_policy):
    get_components_func = non_distributed_component_funcs.get_callable('simple_net')
    model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()

    set_seed(42)
    model, optim = init_zero(model_builder, placement_policy)
    set_seed(42)
    model_copy, optim_copy = init_zero(model_builder, placement_policy)

    model.train()
    model_copy.train()
    pg = ProcessGroup()
    set_seed(pg.dp_local_rank())
    data_iter = iter(train_dataloader)

    data, label = map(lambda x: x.cuda(), next(data_iter))
    run_step(model, optim, criterion, data, label)
    optim_copy.load_state_dict(optim.state_dict())
    check_state_dict_eq(optim.state_dict(), optim_copy.state_dict())

    data, label = map(lambda x: x.cuda(), next(data_iter))
    run_step(model_copy, optim_copy, criterion, data, label)


def run_dist(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    run_nested_model()


@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 2])
@rerun_if_address_is_in_use()
def test_sharded_optim_state_dict(world_size):
    spawn(run_dist, world_size)


if __name__ == '__main__':
    test_sharded_optim_state_dict(2)
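
The round-trip above (step, state_dict, load_state_dict, compare) can be exercised with a stock torch optimizer as well; a minimal CPU-only sketch of the same invariance check:

import copy
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
model_copy = copy.deepcopy(model)
optim_copy = torch.optim.Adam(model_copy.parameters(), lr=1e-3)

# take one step so the optimizer actually has state (exp_avg, exp_avg_sq, step)
model(torch.randn(8, 4)).sum().backward()
optim.step()

# load the state into the copy and verify the two state dicts agree
optim_copy.load_state_dict(optim.state_dict())
for p, state in optim.state_dict()['state'].items():
    other = optim_copy.state_dict()['state'][p]
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            assert torch.allclose(v, other[k])
        else:
            assert v == other[k]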

View File

@@ -1,110 +0,0 @@
import pytest
import torch
import torch.distributed as dist
from common import CONFIG, check_sharded_model_params
from torch.nn.parallel import DistributedDataParallel as DDP

import colossalai
from colossalai.amp import convert_to_apex_amp
from colossalai.nn.optimizer import CPUAdam
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.utils.cuda import get_current_device
from colossalai.zero.legacy.init_ctx import ZeroInitContext
from colossalai.zero.legacy.shard_utils import BucketTensorShardStrategy, TensorShardStrategy
from colossalai.zero.legacy.sharded_model import ShardedModelV2
from colossalai.zero.legacy.sharded_model.utils import col_model_deepcopy
from colossalai.zero.legacy.sharded_optim import ShardedOptimizerV2
from colossalai.zero.low_level._utils import has_inf_or_nan
from tests.components_to_test.registry import non_distributed_component_funcs


def _run_step(model, optimizer, data, label, criterion, enable_autocast=False):
    model.train()
    optimizer.zero_grad()

    with torch.cuda.amp.autocast(enabled=enable_autocast):
        if criterion:
            y = model(data)
            loss = criterion(y, label)
        else:
            loss = model(data, label)
        loss = loss.float()

    if isinstance(model, ShardedModelV2):
        optimizer.backward(loss)
    else:
        loss.backward()
    optimizer.step()


@parameterize("cpu_offload", [True, False])
@parameterize("use_cpuadam", [True, False])
@parameterize("shard_strategy_class", [TensorShardStrategy, BucketTensorShardStrategy])
@parameterize("gpu_margin_mem_ratio", [0.0, 0.7])
def _run_test_sharded_optim_v2(cpu_offload, shard_strategy_class, use_cpuadam, gpu_margin_mem_ratio):
    test_models = ['repeated_computed_layers', 'resnet18', 'bert', 'hanging_param_model']
    shard_strategy = shard_strategy_class()

    if use_cpuadam and cpu_offload is False:
        return
    if gpu_margin_mem_ratio > 0.0 and not (cpu_offload and use_cpuadam):
        return

    for model_name in test_models:
        get_components_func = non_distributed_component_funcs.get_callable(model_name)
        model_builder, train_dataloader, _, optimizer_class, criterion = get_components_func()

        with ZeroInitContext(target_device=torch.device('cpu:0') if cpu_offload else get_current_device(),
                             shard_strategy=shard_strategy,
                             shard_param=True):
            zero_model = model_builder(checkpoint=True)
        zero_model = ShardedModelV2(
            zero_model,
            shard_strategy,
            tensor_placement_policy='cpu' if cpu_offload else 'auto',
            reuse_fp16_shard=use_cpuadam,
        )

        model = model_builder(checkpoint=True).half()
        col_model_deepcopy(zero_model, model)
        model = model.cuda().float()

        if use_cpuadam:
            optimizer_class = CPUAdam
        optim = optimizer_class(model.parameters(), lr=1e-3)
        sharded_optim = optimizer_class(zero_model.parameters(), lr=1e-3)
        sharded_optim = ShardedOptimizerV2(zero_model,
                                           sharded_optim,
                                           initial_scale=2**5,
                                           gpu_margin_mem_ratio=gpu_margin_mem_ratio)

        amp_config = dict(opt_level='O2', keep_batchnorm_fp32=False)
        apex_model, apex_optimizer = convert_to_apex_amp(model, optim, amp_config)
        if dist.get_world_size() > 1:
            apex_model = DDP(apex_model, device_ids=[torch.cuda.current_device()])

        for i, (data, label) in enumerate(train_dataloader):
            if i > 5:
                break
            data, label = data.cuda(), label.cuda()
            _run_step(apex_model, apex_optimizer, data, label, criterion, False)
            _run_step(zero_model, sharded_optim, data, label, criterion, False)
            check_sharded_model_params(model, zero_model, loose=True, reuse_fp16_shard=use_cpuadam)
            for param in model.parameters():
                assert not has_inf_or_nan(param)


def _run_dist(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    _run_test_sharded_optim_v2()


# note: use_cpuadam=True is only exercised together with cpu_offload=True (see the guards above)
@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_if_address_is_in_use()
def test_sharded_optim_v2(world_size):
    spawn(_run_dist, world_size)


if __name__ == '__main__':
    test_sharded_optim_v2(world_size=2)

View File

@@ -1,87 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import pytest
import torch
import torch.distributed as dist
from torchvision.models import resnet50

import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.zero.legacy.init_ctx import ZeroInitContext
from colossalai.zero.legacy.shard_utils import TensorShardStrategy


def run_dist(rank, world_size, port):
    # this test runs on resnet50, as the model contains batch normalization;
    # cudnn is configured to be deterministic so that the randomness of the
    # convolution layers is disabled
    zero_config = dict(model_config=dict(shard_strategy=TensorShardStrategy()))
    colossalai.launch(config=dict(zero=zero_config, cudnn_deterministic=True, cudnn_benchmark=False),
                      rank=rank,
                      world_size=world_size,
                      host='localhost',
                      port=port,
                      backend='nccl')

    with ZeroInitContext(target_device=torch.cuda.current_device(),
                         shard_strategy=gpc.config.zero.model_config.shard_strategy,
                         shard_param=True):
        model = resnet50()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()

    engine, *args = colossalai.initialize(model, optimizer, criterion)

    # train for dummy iterations
    engine.train()
    for _ in range(2):
        data = torch.rand(4, 3, 128, 128).cuda().half()
        label = torch.randint(0, 10, size=(4,)).cuda()
        engine.zero_grad()
        out = engine(data)
        loss = engine.criterion(out, label)
        engine.backward(loss)
        engine.step()

    # test
    # need to make sure that the batch norm stats are synchronized,
    # so that given the same input, the model produces the same
    # output on different ranks
    engine.eval()
    data = torch.rand(4, 3, 128, 128).cuda().half()
    dist.broadcast(data, src=0, group=gpc.get_group(ParallelMode.DATA))

    # predict
    out = engine(data)

    # test if the results are equal
    tensor_list = [torch.empty_like(out) for _ in range(world_size - 1)]
    tensor_list.insert(rank, out)
    dist.all_gather(tensor_list=tensor_list, tensor=out, group=gpc.get_group(ParallelMode.DATA))
    assert torch.all(tensor_list[0] == tensor_list[1]), \
        'expected the output from different ranks to be the same, but got different values'


@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_sharded_optim_with_sync_bn():
    """
    This test makes sure that buffers are synchronized between ranks
    when using ZeRO. An example of a module buffer is the running stats
    of a BatchNormalization layer, i.e. its mean and variance.
    If the buffers are not synchronized, the model will produce different
    outputs even though the input and parameters are the same. This is not
    wanted when running predictions.
    """
    spawn(run_dist, 2)


if __name__ == '__main__':
    test_sharded_optim_with_sync_bn()
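
The buffer-synchronization property being tested comes from sync batch normalization. For reference, plain PyTorch converts a model's BatchNorm layers with torch.nn.SyncBatchNorm.convert_sync_batchnorm; a conversion-only sketch (actually running the converted model still requires an initialized process group):

import torch.nn as nn
from torchvision.models import resnet50

model = resnet50()
# replace every BatchNorm*d with SyncBatchNorm; forward passes in training
# mode then reduce batch statistics across the process group
sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
assert any(isinstance(m, nn.SyncBatchNorm) for m in sync_model.modules())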

View File

@@ -1,55 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import pytest
import torch
from common import CONFIG

import colossalai
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.zero.legacy.init_ctx import ZeroInitContext
from colossalai.zero.legacy.shard_utils import BucketTensorShardStrategy, TensorShardStrategy
from colossalai.zero.legacy.sharded_model import ShardedModelV2
from colossalai.zero.legacy.sharded_model.utils import col_model_deepcopy
from tests.components_to_test.registry import non_distributed_component_funcs


@parameterize("shard_strategy_class", [TensorShardStrategy, BucketTensorShardStrategy])
def run_zero_state_dict(shard_strategy_class):
    test_models = ['repeated_computed_layers', 'resnet18']
    shard_strategy = shard_strategy_class()
    for model_name in test_models:
        get_components_func = non_distributed_component_funcs.get_callable(model_name)
        model_builder, train_dataloader, test_dataloader, optimizer, criterion = get_components_func()

        with ZeroInitContext(target_device=torch.device('cuda', torch.cuda.current_device()),
                             shard_strategy=shard_strategy,
                             shard_param=True):
            zero_model = model_builder(checkpoint=True)
        zero_model = ShardedModelV2(zero_model, shard_strategy)

        model = model_builder(checkpoint=True).half()
        col_model_deepcopy(zero_model, model)
        model = model.cuda()

        zero_state_dict = zero_model.state_dict()
        for key, val in model.state_dict().items():
            assert torch.equal(val, zero_state_dict[key].to(val.device))


def run_dist(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    run_zero_state_dict()


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_if_address_is_in_use()
def test_zero_state_dict(world_size):
    spawn(run_dist, world_size)


if __name__ == '__main__':
    test_zero_state_dict(2)

View File

@@ -1,94 +0,0 @@
import pytest
import torch

import colossalai
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils.cuda import get_current_device
from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor
from colossalai.zero.legacy.gemini.tensor_utils import (
    colo_model_data_move_to_cpu,
    colo_model_data_tensor_move,
    colo_model_data_tensor_move_inline,
    colo_model_tensor_clone,
    colo_tensor_mem_usage,
)


def _run_colo_tensor_mem_usage():
    # run both the StatefulTensor and the plain-tensor branches
    for i in range(2):
        if i == 1:
            t1 = StatefulTensor(torch.randn(2, 2))
            t2 = StatefulTensor(torch.randn(4, 4))
            c1, g1 = colo_tensor_mem_usage(t1)
            c2, g2 = colo_tensor_mem_usage(t2)
            assert c1 * 4 == c2
            assert g1 * 4 == g2
        else:
            t1 = torch.randn(2, 2)
            t2 = torch.randn(4, 4)
            c1, g1 = colo_tensor_mem_usage(t1)
            c2, g2 = colo_tensor_mem_usage(t2)
            assert c1 * 4 == c2
            assert g1 * 4 == g2


def _run_colo_model_data_tensor_move_inline():
    for t in [StatefulTensor(torch.randn(2, 3)), torch.randn(2, 3)]:
        colo_model_data_tensor_move_inline(t, get_current_device())
        assert t.device == get_current_device()


def _run_colo_model_data_tensor_move():
    for t in [(StatefulTensor(torch.ones(2, 3)), StatefulTensor(torch.zeros(2, 3).to(get_current_device()))),
              (torch.ones(2, 3), torch.zeros(2, 3).to(get_current_device()))]:
        cpu_t, cuda_t = t
        colo_model_data_tensor_move(cpu_t, cuda_t)
        assert cuda_t.device == get_current_device()


def _run_colo_model_data_move_to_cpu():
    for t in [StatefulTensor(torch.randn(2, 2)), torch.randn(4, 4)]:
        colo_model_data_move_to_cpu(t)
        assert t.device == torch.device("cpu")


def _run_colo_model_tensor_clone():
    for t in [
            StatefulTensor(torch.randn(2, 2).cuda(torch.cuda.current_device())),
            torch.randn(4, 4).cuda(torch.cuda.current_device())
    ]:
        if issubclass(type(t), StatefulTensor):
            assert t.payload.device == get_current_device()
        else:
            assert t.device == get_current_device()
        p = colo_model_tensor_clone(t, get_current_device())
        assert p.device == get_current_device()
        for i in range(2):
            for j in range(2):
                if issubclass(type(t), StatefulTensor):
                    assert t.payload.device == p.device
                    assert t.payload[i][j] == p[i][j]
                else:
                    assert t.device == p.device
                    assert t[i][j] == p[i][j]


def run_dist(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    _run_colo_tensor_mem_usage()
    _run_colo_model_data_tensor_move_inline()
    _run_colo_model_data_tensor_move()
    _run_colo_model_data_move_to_cpu()
    _run_colo_model_tensor_clone()


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [2, 4])
@rerun_if_address_is_in_use()
def test_zero_tensor_utils(world_size):
    spawn(run_dist, world_size)


if __name__ == '__main__':
    test_zero_tensor_utils(world_size=2)

View File

@@ -1,113 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import pytest
import torch
import torch.distributed as dist
from common import MP_PARALLEL_CONFIG, ZERO_PARALLEL_CONFIG, check_params, check_sharded_model_params
from torch.nn.parallel import DistributedDataParallel as DDP

import colossalai
from colossalai.core import global_context as gpc
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.zero.legacy.init_ctx import ZeroInitContext
from colossalai.zero.legacy.sharded_model.utils import col_model_deepcopy
from colossalai.zero.low_level._utils import has_inf_or_nan
from tests.components_to_test.registry import non_distributed_component_funcs


def run_dist(rank, world_size, port, parallel_config, bf16):
    is_mp_config = parallel_config == MP_PARALLEL_CONFIG
    is_zero_config = parallel_config == ZERO_PARALLEL_CONFIG
    if bf16:
        parallel_config['zero']['model_config']['bf16'] = True

    colossalai.launch(config=parallel_config,
                      rank=rank,
                      world_size=world_size,
                      host='localhost',
                      port=port,
                      backend='nccl')

    test_models = ['repeated_computed_layers', 'resnet18', 'bert']
    for model_name in test_models:
        get_components_func = non_distributed_component_funcs.get_callable(model_name)
        model_builder, train_dataloader, _, optimizer_class, criterion = get_components_func()

        with ZeroInitContext(target_device=torch.cuda.current_device(),
                             shard_strategy=gpc.config.zero.model_config.shard_strategy,
                             shard_param=True,
                             bf16=bf16):
            colo_model = model_builder(checkpoint=True)

        colo_optimizer = optimizer_class(colo_model.parameters(), lr=1e-3)
        engine, train_dataloader, _, _ = colossalai.initialize(colo_model,
                                                               optimizer=colo_optimizer,
                                                               criterion=criterion,
                                                               train_dataloader=train_dataloader)
        dtype = torch.bfloat16 if bf16 else torch.float16
        torch_model = model_builder(checkpoint=True).to(dtype)
        col_model_deepcopy(engine.model, torch_model)
        torch_model = torch_model.cuda().float()

        engine.train()
        torch_optimizer = optimizer_class(torch_model.parameters(), lr=1e-3)

        if dist.get_world_size() > 1:
            torch_model = DDP(torch_model, device_ids=[torch.cuda.current_device()])

        i = 0
        for data, label in train_dataloader:
            if i > 4:
                break

            data, label = data.cuda(), label.cuda()

            engine.zero_grad()
            torch_optimizer.zero_grad()

            if criterion:
                output = engine(data)
                loss = engine.criterion(output, label)

                torch_output = torch_model(data)
                torch_loss = engine.criterion(torch_output, label)
            else:
                loss = engine(data, label)
                torch_loss = torch_model(data, label)

            engine.backward(loss)
            engine.step()

            torch_loss.backward()

            for param in torch_model.parameters():
                if param.grad is not None:
                    assert not has_inf_or_nan(param.grad)

            torch_optimizer.step()
            i += 1

        if is_mp_config:
            check_params(torch_model, colo_model, loose=True)
        elif is_zero_config:
            check_sharded_model_params(torch_model, colo_model, loose=True)


# FIXME: enable this test in next PR
@pytest.mark.skip
@pytest.mark.dist
@pytest.mark.parametrize("world_size", [2, 4])
@rerun_if_address_is_in_use()
def test_mp_engine(world_size):
    spawn(run_dist, world_size, parallel_config=MP_PARALLEL_CONFIG, bf16=False)


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@pytest.mark.parametrize("bf16", [True, False])
@rerun_if_address_is_in_use()
def test_zero_engine(world_size, bf16):
    spawn(run_dist, world_size, parallel_config=ZERO_PARALLEL_CONFIG, bf16=bf16)


if __name__ == '__main__':
    test_zero_engine(world_size=4, bf16=False)
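
The fp16/bf16 engines above rely on dynamic loss scaling; the knobs in _ZERO_OPTIMIZER_CONFIG in common.py (initial_scale, growth_factor, backoff_factor, growth_interval, hysteresis) describe its update rule. A toy sketch of that rule, under the assumption that it mirrors standard dynamic loss scaling rather than ColossalAI's exact implementation:

class DynamicLossScaler:
    """Toy dynamic loss scaler: grow the scale after a streak of clean steps,
    back off on overflow. Mirrors the knobs in _ZERO_OPTIMIZER_CONFIG."""

    def __init__(self, initial_scale=2**5, min_scale=1, growth_factor=2,
                 backoff_factor=0.5, growth_interval=1000, hysteresis=2, max_scale=2**32):
        self.scale = initial_scale
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
        self.growth_interval, self.hysteresis = growth_interval, hysteresis
        self._good_steps = 0
        self._overflows_left = hysteresis

    def update(self, found_overflow: bool):
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            # hysteresis: tolerate a few overflows before actually backing off
            if self._overflows_left <= 0:
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)


scaler = DynamicLossScaler()
scaler.update(found_overflow=True)
scaler.update(found_overflow=True)    # the second overflow trips the hysteresis
assert scaler.scale == 2**4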