mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-01 09:07:51 +00:00
[test] reorganize zero/gemini tests (#3445)
This commit is contained in:
72
tests/test_zero/test_gemini/test_chunk_mgrv2.py
Normal file
72
tests/test_zero/test_gemini/test_chunk_mgrv2.py
Normal file
@@ -0,0 +1,72 @@
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
import colossalai
|
||||
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.zero.gemini.chunk import ChunkManager
|
||||
from tests.test_tensor.common_utils import debug_print
|
||||
|
||||
CUDA_MEM_0 = {False: 512, True: 1024}
|
||||
CUDA_MEM_1 = {False: 0, True: 1024}
|
||||
CPU_MEM = {True: {True: 0, False: 0}, False: {True: 512, False: 0}}
|
||||
|
||||
|
||||
@parameterize('keep_gathered', [True, False])
|
||||
@parameterize('pin_memory', [True, False])
|
||||
def exam_chunk_memory(keep_gathered, pin_memory):
|
||||
pg = ProcessGroup()
|
||||
|
||||
debug_print([0], "keep_gathered: {}, pin_memory: {}".format(keep_gathered, pin_memory))
|
||||
|
||||
params = [ColoTensor(torch.rand(8, 8), spec=ColoTensorSpec(pg)) for _ in range(3)]
|
||||
config = {2: dict(chunk_size=128, keep_gathered=keep_gathered)}
|
||||
|
||||
chunk_manager = ChunkManager(config)
|
||||
assert chunk_manager.total_mem['cpu'] == 0
|
||||
assert chunk_manager.total_mem['cuda'] == 0
|
||||
|
||||
for p in params:
|
||||
chunk_manager.register_tensor(p, 'param', 2, pin_memory=pin_memory)
|
||||
chunk_manager.close_all_groups()
|
||||
assert chunk_manager.total_mem['cpu'] == CPU_MEM[keep_gathered][pin_memory]
|
||||
assert chunk_manager.total_mem['cuda'] == CUDA_MEM_0[keep_gathered]
|
||||
|
||||
chunks = chunk_manager.get_chunks(params)
|
||||
|
||||
for chunk in chunks:
|
||||
chunk_manager.access_chunk(chunk)
|
||||
assert chunk_manager.total_mem['cpu'] == CPU_MEM[keep_gathered][pin_memory]
|
||||
assert chunk_manager.total_mem['cuda'] == CUDA_MEM_0[True]
|
||||
|
||||
for chunk in chunks:
|
||||
chunk_manager.release_chunk(chunk)
|
||||
|
||||
assert chunk_manager.total_mem['cpu'] == CPU_MEM[keep_gathered][pin_memory]
|
||||
assert chunk_manager.total_mem['cuda'] == CUDA_MEM_0[keep_gathered]
|
||||
|
||||
for chunk in chunks:
|
||||
chunk_manager.move_chunk(chunk, torch.device('cpu'))
|
||||
assert chunk_manager.total_mem['cpu'] == CPU_MEM[keep_gathered][True]
|
||||
assert chunk_manager.total_mem['cuda'] == CUDA_MEM_1[keep_gathered]
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
exam_chunk_memory()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [2])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_chunk_manager(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_chunk_manager(2)
|
125
tests/test_zero/test_gemini/test_chunkv2.py
Normal file
125
tests/test_zero/test_gemini/test_chunkv2.py
Normal file
@@ -0,0 +1,125 @@
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
import colossalai
|
||||
from colossalai.tensor import ColoParameter
|
||||
from colossalai.tensor import ProcessGroup as ColoProcessGroup
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port, get_current_device
|
||||
from colossalai.zero.gemini import TensorState
|
||||
from colossalai.zero.gemini.chunk import Chunk
|
||||
|
||||
|
||||
def dist_sum(x):
|
||||
temp = torch.tensor([x], device=get_current_device())
|
||||
dist.all_reduce(temp)
|
||||
return temp.item()
|
||||
|
||||
|
||||
def add_param(param_list, param_cp_list, *args, **kwargs):
|
||||
param = ColoParameter(torch.randn(*args, **kwargs))
|
||||
param_list.append(param)
|
||||
param_cp_list.append(param.clone())
|
||||
|
||||
|
||||
def check_euqal(param, param_cp):
|
||||
if param.device != param_cp.device:
|
||||
temp = param.data.to(param_cp.device)
|
||||
else:
|
||||
temp = param.data
|
||||
return torch.equal(temp, param_cp.data)
|
||||
|
||||
|
||||
@parameterize('init_device', [None, torch.device('cpu')])
|
||||
@parameterize('keep_gathered', [True, False])
|
||||
@parameterize('pin_memory', [True, False])
|
||||
def exam_chunk_basic(init_device, keep_gathered, pin_memory):
|
||||
world_size = torch.distributed.get_world_size()
|
||||
pg = ColoProcessGroup()
|
||||
my_chunk = Chunk(chunk_size=1024,
|
||||
process_group=pg,
|
||||
dtype=torch.float32,
|
||||
init_device=init_device,
|
||||
cpu_shard_init=True,
|
||||
keep_gathered=keep_gathered,
|
||||
pin_memory=pin_memory)
|
||||
|
||||
param_list = []
|
||||
param_cp_list = []
|
||||
|
||||
add_param(param_list, param_cp_list, 8, 8, 8, device='cuda')
|
||||
add_param(param_list, param_cp_list, 4, 4)
|
||||
add_param(param_list, param_cp_list, 4, 8, 2, device='cuda')
|
||||
add_param(param_list, param_cp_list, 1, 1, 5)
|
||||
|
||||
for param in param_list:
|
||||
my_chunk.append_tensor(param)
|
||||
assert my_chunk.utilized_size == 597
|
||||
for param, param_cp in zip(param_list, param_cp_list):
|
||||
check_euqal(param, param_cp)
|
||||
my_chunk.close_chunk()
|
||||
|
||||
if keep_gathered is False:
|
||||
assert my_chunk.cpu_shard.size(0) == 1024 // world_size
|
||||
assert my_chunk.device_type == 'cpu'
|
||||
assert my_chunk.can_move
|
||||
my_chunk.shard_move(get_current_device())
|
||||
else:
|
||||
assert my_chunk.cuda_global_chunk.size(0) == 1024
|
||||
assert my_chunk.device_type == 'cuda'
|
||||
assert not my_chunk.can_move
|
||||
|
||||
assert dist_sum(my_chunk.valid_end) == my_chunk.utilized_size
|
||||
flag = my_chunk.has_inf_or_nan
|
||||
assert not flag, "has_inf_or_nan is {}".format(flag)
|
||||
|
||||
my_chunk.access_chunk()
|
||||
assert my_chunk.device_type == 'cuda'
|
||||
for param, param_cp in zip(param_list, param_cp_list):
|
||||
check_euqal(param, param_cp)
|
||||
|
||||
assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 4
|
||||
my_chunk.tensor_trans_state(param_list[0], TensorState.COMPUTE)
|
||||
assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 3
|
||||
assert my_chunk.tensor_state_cnter[TensorState.COMPUTE] == 1
|
||||
assert not my_chunk.can_release
|
||||
|
||||
for param in param_list:
|
||||
my_chunk.tensor_trans_state(param, TensorState.COMPUTE)
|
||||
my_chunk.tensor_trans_state(param, TensorState.HOLD_AFTER_BWD)
|
||||
my_chunk.tensor_trans_state(param, TensorState.READY_FOR_REDUCE)
|
||||
|
||||
assert my_chunk.tensor_state_cnter[TensorState.READY_FOR_REDUCE] == 4
|
||||
assert my_chunk.can_reduce
|
||||
my_chunk.reduce()
|
||||
assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 4
|
||||
|
||||
if keep_gathered is False:
|
||||
assert my_chunk.cuda_shard.size(0) == 1024 // world_size
|
||||
assert my_chunk.device_type == 'cuda'
|
||||
assert my_chunk.can_move
|
||||
else:
|
||||
assert my_chunk.cuda_global_chunk.size(0) == 1024
|
||||
assert my_chunk.device_type == 'cuda'
|
||||
assert not my_chunk.can_move
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
exam_chunk_basic()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [1, 2, 4])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_chunk_function(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_chunk_function(4)
|
111
tests/test_zero/test_gemini/test_fwd_bwd.py
Normal file
111
tests/test_zero/test_gemini/test_fwd_bwd.py
Normal file
@@ -0,0 +1,111 @@
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.testing import assert_close
|
||||
|
||||
import colossalai
|
||||
from colossalai.amp import convert_to_apex_amp
|
||||
from colossalai.nn.optimizer import HybridAdam
|
||||
from colossalai.tensor import ProcessGroup
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.utils.cuda import get_current_device
|
||||
from colossalai.zero import ColoInitContext, ZeroDDP, ZeroOptimizer
|
||||
from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration
|
||||
from colossalai.zero.gemini.gemini_mgr import GeminiManager
|
||||
from tests.components_to_test import run_fwd_bwd
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
from tests.test_tensor.common_utils import set_seed
|
||||
|
||||
|
||||
def check_grad(model: ZeroDDP, torch_model: torch.nn.Module):
|
||||
chunk_manager = model.chunk_manager
|
||||
param_list = [p for p in model.parameters()]
|
||||
chunk_list = chunk_manager.get_chunks(param_list)
|
||||
for chunk in chunk_list:
|
||||
chunk_manager.access_chunk(chunk)
|
||||
|
||||
for (p0, p1) in zip(model.parameters(), torch_model.parameters()):
|
||||
assert_close(p0, p1.grad, rtol=1e-3, atol=5e-5)
|
||||
|
||||
|
||||
@parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const'])
|
||||
@parameterize('keep_gather', [False, True])
|
||||
@parameterize('model_name', ['gpt2', 'bert', 'albert'])
|
||||
@parameterize('use_grad_checkpoint', [False, True])
|
||||
def exam_gpt_fwd_bwd(
|
||||
placement_policy,
|
||||
keep_gather,
|
||||
model_name: str,
|
||||
use_grad_checkpoint: bool = False,
|
||||
):
|
||||
init_device = get_current_device()
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
set_seed(42)
|
||||
with ColoInitContext(device=init_device):
|
||||
model = model_builder(use_grad_checkpoint)
|
||||
|
||||
set_seed(42)
|
||||
torch_model = model_builder(use_grad_checkpoint).cuda()
|
||||
for torch_p, p in zip(torch_model.parameters(), model.parameters()):
|
||||
torch_p.data.copy_(p.data)
|
||||
|
||||
world_size = torch.distributed.get_world_size()
|
||||
config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
|
||||
config_dict[world_size]['chunk_size'] = 5000
|
||||
config_dict[world_size]['keep_gathered'] = keep_gather
|
||||
chunk_manager = ChunkManager(config_dict)
|
||||
gemini_manager = GeminiManager(placement_policy, chunk_manager)
|
||||
model = ZeroDDP(model, gemini_manager, pin_memory=True)
|
||||
optimizer = HybridAdam(model.parameters(), lr=1e-3)
|
||||
zero_optim = ZeroOptimizer(optimizer, model, initial_scale=1)
|
||||
|
||||
pg = ProcessGroup()
|
||||
amp_config = dict(opt_level='O2', keep_batchnorm_fp32=False, loss_scale=1)
|
||||
torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1e-3)
|
||||
torch_model, torch_optim = convert_to_apex_amp(torch_model, torch_optim, amp_config)
|
||||
torch_model = DDP(torch_model, device_ids=[pg.rank()], process_group=pg.dp_process_group())
|
||||
|
||||
set_seed(pg.dp_local_rank())
|
||||
for i, (input_ids, label) in enumerate(train_dataloader):
|
||||
# you can only test a single fwd + bwd.
|
||||
# after bwd param is grad for Gemini, due to the chunk reuse optimization.
|
||||
if i > 0:
|
||||
break
|
||||
input_ids, label = input_ids.cuda(), label.cuda()
|
||||
|
||||
torch_optim.zero_grad()
|
||||
zero_optim.zero_grad()
|
||||
|
||||
# set random seed is same as torch_model.eval()
|
||||
set_seed(42)
|
||||
torch_loss = run_fwd_bwd(torch_model, input_ids, label, criterion, torch_optim)
|
||||
set_seed(42)
|
||||
loss = run_fwd_bwd(model, input_ids, label, criterion, zero_optim)
|
||||
|
||||
assert torch.equal(torch_loss, loss)
|
||||
|
||||
check_grad(model, torch_model)
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
config = {}
|
||||
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
exam_gpt_fwd_bwd()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [1, 4])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_gpt(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_gpt(4)
|
106
tests/test_zero/test_gemini/test_gemini_use_rmt.py
Normal file
106
tests/test_zero/test_gemini/test_gemini_use_rmt.py
Normal file
@@ -0,0 +1,106 @@
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
import colossalai
|
||||
from colossalai.tensor import ProcessGroup
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.zero import ColoInitContext, GeminiAdamOptimizer, GeminiDDP, ZeroDDP
|
||||
from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration
|
||||
from colossalai.zero.gemini.gemini_mgr import GeminiManager
|
||||
from colossalai.zero.gemini.memory_tracer.runtime_mem_tracer import RuntimeMemTracer
|
||||
from tests.components_to_test import run_fwd_bwd
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
from tests.test_tensor.common_utils import set_seed
|
||||
|
||||
# run gemini use the runtime memory tracer
|
||||
|
||||
|
||||
@parameterize('placement_policy', ['auto'])
|
||||
@parameterize('keep_gather', [False])
|
||||
@parameterize('model_name', ['repeated_computed_layers', 'bert', 'albert', 'gpt2'])
|
||||
@parameterize('use_grad_checkpoint', [False, True])
|
||||
def run_gemini_use_rmt(placement_policy, keep_gather, model_name: str, use_grad_checkpoint: bool = False):
|
||||
set_seed(42)
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
with ColoInitContext(device='cpu'):
|
||||
model = model_builder(use_grad_checkpoint)
|
||||
|
||||
print(f'model_name {model_name}')
|
||||
runtime_mem_tracer = RuntimeMemTracer(model)
|
||||
for i, (input_ids, label) in enumerate(train_dataloader):
|
||||
if i > 0:
|
||||
break
|
||||
input_ids, label = input_ids.cuda(), label.cuda()
|
||||
|
||||
# mem tracing
|
||||
if i == 0:
|
||||
run_fwd_bwd(runtime_mem_tracer, input_ids, label, criterion, runtime_mem_tracer)
|
||||
memstats = runtime_mem_tracer.memstats()
|
||||
runtime_tracer_non_model_data = runtime_mem_tracer._memstats._non_model_data_cuda_list
|
||||
print('runtime tracer non model data points: ', len(runtime_tracer_non_model_data))
|
||||
print('runtime tracer: ', runtime_tracer_non_model_data)
|
||||
print([memstats.param_used_step(p) for p in model.parameters()])
|
||||
|
||||
if model_name == 'repeated_computed_layers':
|
||||
for idx, p in enumerate(model.parameters()):
|
||||
step_list = memstats.param_used_step(p)
|
||||
if idx < 4:
|
||||
assert len(step_list) == 4
|
||||
|
||||
if model_name == 'repeated_computed_layers':
|
||||
for idx, p in enumerate(model.parameters()):
|
||||
step_list = memstats.param_used_step(p)
|
||||
if idx < 4:
|
||||
assert len(step_list) == 4
|
||||
|
||||
world_size = torch.distributed.get_world_size()
|
||||
config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
|
||||
config_dict[world_size]['chunk_size'] = 5000
|
||||
config_dict[world_size]['keep_gathered'] = keep_gather
|
||||
chunk_manager = ChunkManager(config_dict)
|
||||
gemini_manager = GeminiManager(placement_policy, chunk_manager, memstats)
|
||||
model = ZeroDDP(model, gemini_manager, pin_memory=True)
|
||||
|
||||
pg = ProcessGroup()
|
||||
set_seed(pg.dp_local_rank())
|
||||
for i, (input_ids, label) in enumerate(train_dataloader):
|
||||
# you can only test a single fwd + bwd.
|
||||
# after bwd param is grad for Gemini, due to the chunk reuse optimization.
|
||||
# print(f'iteration {i}')
|
||||
if i > 4:
|
||||
break
|
||||
input_ids, label = input_ids.cuda(), label.cuda()
|
||||
|
||||
set_seed(42)
|
||||
loss = run_fwd_bwd(model, input_ids, label, criterion, model)
|
||||
|
||||
gemini_non_model_data = gemini_manager._mem_stats_collector._memstats.non_model_data_list('cuda')
|
||||
|
||||
# print('gemini non model data:', gemini_non_model_data)
|
||||
|
||||
assert len(gemini_non_model_data) == len(runtime_tracer_non_model_data), \
|
||||
f'model_name {model_name} {len(gemini_non_model_data)} vs {len(runtime_tracer_non_model_data)}'
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
config = {}
|
||||
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
run_gemini_use_rmt()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [1, 4])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_gemini_use_rmt(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_gemini_use_rmt(1)
|
58
tests/test_zero/test_gemini/test_get_torch_model.py
Normal file
58
tests/test_zero/test_gemini/test_get_torch_model.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import os
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
import colossalai
|
||||
from colossalai.tensor import ColoParameter
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.utils.cuda import get_current_device
|
||||
from colossalai.zero import ColoInitContext, GeminiDDP
|
||||
from colossalai.zero.gemini.utils import get_static_torch_model
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
|
||||
|
||||
@parameterize('model_name', ['hanging_param_model', 'resnet18', 'gpt2'])
|
||||
def run_convert_torch_module(model_name: str):
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, _, _, _, _ = get_components_func()
|
||||
|
||||
with ColoInitContext(device=torch.device("cpu")):
|
||||
model = model_builder(checkpoint=False)
|
||||
model = GeminiDDP(model, device=get_current_device(), placement_policy='auto', pin_memory=True)
|
||||
pytorch_model = get_static_torch_model(model, only_rank_0=False)
|
||||
|
||||
for n, p in pytorch_model.named_parameters():
|
||||
assert type(p) == torch.nn.Parameter, f"type error: {n} is a {type(p)}"
|
||||
|
||||
# get the static model should not change the original model
|
||||
for n, p in model.named_parameters():
|
||||
assert isinstance(p, ColoParameter)
|
||||
|
||||
for (pn, pm), (cn, cm) in zip(pytorch_model.named_modules(), model.named_modules()):
|
||||
assert pn == cn
|
||||
assert id(pm) != id(cm)
|
||||
for pp, cp in zip(pm.parameters(recurse=False), cm.parameters(recurse=False)):
|
||||
assert id(pp) != id(cp)
|
||||
assert pp.shape == cp.shape
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
config = {}
|
||||
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
run_convert_torch_module()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [1, 4])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_convert_torch_module(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_convert_torch_module(2)
|
113
tests/test_zero/test_gemini/test_grad_clip.py
Normal file
113
tests/test_zero/test_gemini/test_grad_clip.py
Normal file
@@ -0,0 +1,113 @@
|
||||
from functools import partial
|
||||
from time import time
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.testing import assert_close
|
||||
|
||||
import colossalai
|
||||
from colossalai.amp import convert_to_apex_amp
|
||||
from colossalai.nn.optimizer import HybridAdam
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.utils.cuda import get_current_device
|
||||
from colossalai.zero import ColoInitContext, ZeroDDP, ZeroOptimizer
|
||||
from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration
|
||||
from colossalai.zero.gemini.gemini_mgr import GeminiManager
|
||||
from tests.components_to_test import run_fwd_bwd
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
from tests.test_tensor.common_utils import debug_print, set_seed
|
||||
|
||||
|
||||
def check_param(model: ZeroDDP, torch_model: torch.nn.Module):
|
||||
zero_dict = model.state_dict(only_rank_0=False)
|
||||
torch_dict = torch_model.state_dict()
|
||||
|
||||
for key, value in torch_dict.items():
|
||||
# key is 'module.model.PARAMETER', so we truncate it
|
||||
key = key[7:]
|
||||
assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
|
||||
temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
|
||||
# debug_print([0], "max range: ", key, torch.max(torch.abs(value - temp_zero_value)))
|
||||
assert_close(value, temp_zero_value, rtol=1e-3, atol=4e-3)
|
||||
|
||||
|
||||
@parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const'])
|
||||
@parameterize('model_name', ['gpt2'])
|
||||
def exam_grad_clipping(placement_policy, model_name: str):
|
||||
set_seed(1912)
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
torch_model = model_builder().cuda()
|
||||
amp_config = dict(opt_level='O2', keep_batchnorm_fp32=False, loss_scale=32)
|
||||
torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1e-3)
|
||||
torch_model, torch_optim = convert_to_apex_amp(torch_model, torch_optim, amp_config)
|
||||
torch_model = DDP(torch_model, device_ids=[dist.get_rank()])
|
||||
|
||||
init_dev = get_current_device()
|
||||
with ColoInitContext(device=init_dev):
|
||||
model = model_builder()
|
||||
|
||||
for torch_p, p in zip(torch_model.parameters(), model.parameters()):
|
||||
p.data.copy_(torch_p.data)
|
||||
|
||||
world_size = torch.distributed.get_world_size()
|
||||
config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
|
||||
config_dict[world_size]['chunk_size'] = 5000
|
||||
config_dict[world_size]['keep_gathered'] = False
|
||||
if placement_policy != 'cuda':
|
||||
init_device = torch.device('cpu')
|
||||
else:
|
||||
init_device = None
|
||||
chunk_manager = ChunkManager(config_dict, init_device=init_device)
|
||||
gemini_manager = GeminiManager(placement_policy, chunk_manager)
|
||||
model = ZeroDDP(model, gemini_manager, pin_memory=True)
|
||||
|
||||
optimizer = HybridAdam(model.parameters(), lr=1e-3)
|
||||
zero_optim = ZeroOptimizer(optimizer, model, initial_scale=32, clipping_norm=1.0)
|
||||
|
||||
model.train()
|
||||
torch_model.train()
|
||||
|
||||
set_seed(dist.get_rank() * 3 + 128)
|
||||
for i, (data, label) in enumerate(train_dataloader):
|
||||
if i > 2:
|
||||
break
|
||||
data = data.cuda()
|
||||
label = label.cuda()
|
||||
|
||||
zero_optim.zero_grad()
|
||||
torch_optim.zero_grad()
|
||||
|
||||
torch_loss = run_fwd_bwd(torch_model, data, label, criterion, torch_optim)
|
||||
loss = run_fwd_bwd(model, data, label, criterion, zero_optim)
|
||||
assert_close(torch_loss, loss)
|
||||
|
||||
import apex.amp as apex_amp
|
||||
torch.nn.utils.clip_grad_norm_(apex_amp.master_params(torch_optim), 1.0)
|
||||
torch_optim.step()
|
||||
zero_optim.step()
|
||||
|
||||
check_param(model, torch_model)
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
config = {}
|
||||
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
exam_grad_clipping()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [1, 2])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_grad_clip(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_grad_clip(2)
|
136
tests/test_zero/test_gemini/test_inference.py
Normal file
136
tests/test_zero/test_gemini/test_inference.py
Normal file
@@ -0,0 +1,136 @@
|
||||
from functools import partial
|
||||
from typing import Callable
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.testing import assert_close
|
||||
|
||||
import colossalai
|
||||
from colossalai.amp import convert_to_apex_amp
|
||||
from colossalai.nn.optimizer import HybridAdam
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.utils.cuda import get_current_device
|
||||
from colossalai.zero import ColoInitContext, ZeroDDP, ZeroOptimizer, post_process_colo_init_ctx, zero_model_wrapper
|
||||
from colossalai.zero.gemini.chunk import ChunkManager, init_chunk_manager, search_chunk_configuration
|
||||
from colossalai.zero.gemini.gemini_mgr import GeminiManager
|
||||
from tests.components_to_test import run_fwd_bwd
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
from tests.test_tensor.common_utils import debug_print, set_seed
|
||||
|
||||
|
||||
def check_param(model: ZeroDDP, torch_model: torch.nn.Module):
|
||||
zero_dict = model.state_dict(only_rank_0=False)
|
||||
torch_dict = torch_model.state_dict()
|
||||
|
||||
for key, value in torch_dict.items():
|
||||
# key is 'module.model.PARAMETER', so we truncate it
|
||||
key = key[7:]
|
||||
assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
|
||||
temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
|
||||
# debug_print([0], "max range: ", key, torch.max(torch.abs(value - temp_zero_value)))
|
||||
assert_close(value, temp_zero_value, rtol=1e-3, atol=4e-3)
|
||||
|
||||
|
||||
def multi_chunk_init(model: torch.nn.Module, placement_policy: str):
|
||||
world_size = dist.get_world_size()
|
||||
config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
|
||||
config_dict[world_size]['chunk_size'] = 5000
|
||||
config_dict[world_size]['keep_gathered'] = False
|
||||
if placement_policy != 'cuda':
|
||||
init_device = torch.device('cpu')
|
||||
else:
|
||||
init_device = None
|
||||
chunk_manager = ChunkManager(config_dict, init_device=init_device)
|
||||
gemini_manager = GeminiManager(placement_policy, chunk_manager)
|
||||
model = ZeroDDP(model, gemini_manager, pin_memory=True)
|
||||
return model
|
||||
|
||||
|
||||
def single_chunk_init(model: torch.nn.Module, placement_policy: str):
|
||||
gemini_config = dict(
|
||||
device=get_current_device(),
|
||||
placement_policy=placement_policy,
|
||||
pin_memory=True,
|
||||
)
|
||||
model = zero_model_wrapper(model=model, zero_stage=3, gemini_config=gemini_config)
|
||||
return model
|
||||
|
||||
|
||||
@parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const'])
|
||||
@parameterize('model_name', ['gpt2'])
|
||||
@parameterize('model_init_func', [single_chunk_init, multi_chunk_init])
|
||||
def exam_inference(placement_policy: str, model_name: str, model_init_func: Callable):
|
||||
set_seed(19360226)
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
torch_model = model_builder().cuda()
|
||||
amp_config = dict(opt_level='O2', keep_batchnorm_fp32=False, loss_scale=128)
|
||||
torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1e-3)
|
||||
torch_model, torch_optim = convert_to_apex_amp(torch_model, torch_optim, amp_config)
|
||||
torch_model = DDP(torch_model, device_ids=[dist.get_rank()])
|
||||
|
||||
init_dev = get_current_device()
|
||||
with ColoInitContext(device=init_dev):
|
||||
model = model_builder()
|
||||
|
||||
for torch_p, p in zip(torch_model.parameters(), model.parameters()):
|
||||
p.data.copy_(torch_p.data)
|
||||
|
||||
model = model_init_func(model, placement_policy)
|
||||
optimizer = HybridAdam(model.parameters(), lr=1e-3)
|
||||
zero_optim = ZeroOptimizer(optimizer, model, initial_scale=128)
|
||||
|
||||
model.eval()
|
||||
torch_model.eval()
|
||||
|
||||
set_seed(dist.get_rank() * 3 + 128)
|
||||
train_dataloader = iter(train_dataloader)
|
||||
|
||||
def train_iter():
|
||||
input_ids, label = next(train_dataloader)
|
||||
input_ids, label = input_ids.cuda(), label.cuda()
|
||||
zero_optim.zero_grad()
|
||||
torch_optim.zero_grad()
|
||||
torch_loss = run_fwd_bwd(torch_model, input_ids, label, criterion, torch_optim)
|
||||
loss = run_fwd_bwd(model, input_ids, label, criterion, zero_optim)
|
||||
assert_close(torch_loss, loss)
|
||||
zero_optim.step()
|
||||
torch_optim.step()
|
||||
check_param(model, torch_model)
|
||||
|
||||
def inference_iter():
|
||||
input_ids, label = next(train_dataloader)
|
||||
input_ids, label = input_ids.cuda(), label.cuda()
|
||||
with torch.no_grad():
|
||||
torch_output = torch_model(input_ids)
|
||||
torch_loss = criterion(torch_output.float(), label)
|
||||
zero_output = model(input_ids)
|
||||
zero_loss = criterion(zero_output.float(), label)
|
||||
assert_close(torch_loss, zero_loss)
|
||||
|
||||
train_iter()
|
||||
inference_iter()
|
||||
train_iter()
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
config = {}
|
||||
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
exam_inference()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [1, 4])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_inference(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_inference(1)
|
165
tests/test_zero/test_gemini/test_optim.py
Normal file
165
tests/test_zero/test_gemini/test_optim.py
Normal file
@@ -0,0 +1,165 @@
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.testing import assert_close
|
||||
|
||||
import colossalai
|
||||
from colossalai.amp import convert_to_apex_amp
|
||||
from colossalai.nn.optimizer import HybridAdam
|
||||
from colossalai.tensor import ColoParameter, ColoTensor
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.utils.cuda import get_current_device
|
||||
from colossalai.zero import ColoInitContext, ZeroDDP, ZeroOptimizer, post_process_colo_init_ctx
|
||||
from colossalai.zero.gemini.chunk import ChunkManager, init_chunk_manager, search_chunk_configuration
|
||||
from colossalai.zero.gemini.gemini_mgr import GeminiManager
|
||||
from tests.components_to_test import run_fwd_bwd
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
from tests.test_tensor.common_utils import debug_print, set_seed
|
||||
|
||||
# this model is large enough to slice to chunks
|
||||
TEST_MODELS = ['gpt2']
|
||||
# these models are too small, all parameters in these models are compacted into one chunk
|
||||
EXAMPLE_MODELS = ['albert', 'beit', 'bert', 'hanging_param_model', 'nested_model', 'repeated_computed_layers']
|
||||
|
||||
|
||||
def check_param(model: ZeroDDP, torch_model: torch.nn.Module):
|
||||
zero_dict = model.state_dict(only_rank_0=False)
|
||||
torch_dict = torch_model.state_dict()
|
||||
|
||||
for key, value in torch_dict.items():
|
||||
# key is 'module.model.PARAMETER', so we truncate it
|
||||
key = key[7:]
|
||||
assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
|
||||
temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
|
||||
# debug_print([0], "max range: ", key, torch.max(torch.abs(value - temp_zero_value)))
|
||||
assert_close(value, temp_zero_value, rtol=1e-3, atol=4e-3)
|
||||
|
||||
|
||||
@parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const'])
|
||||
@parameterize('model_name', TEST_MODELS)
|
||||
def exam_model_step(placement_policy, model_name: str):
|
||||
set_seed(42)
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
torch_model = model_builder().cuda()
|
||||
amp_config = dict(opt_level='O2', keep_batchnorm_fp32=False, loss_scale=128)
|
||||
torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1e-3)
|
||||
torch_model, torch_optim = convert_to_apex_amp(torch_model, torch_optim, amp_config)
|
||||
torch_model = DDP(torch_model, device_ids=[dist.get_rank()])
|
||||
|
||||
init_dev = get_current_device()
|
||||
with ColoInitContext(device=init_dev):
|
||||
model = model_builder()
|
||||
|
||||
for torch_p, p in zip(torch_model.parameters(), model.parameters()):
|
||||
p.data.copy_(torch_p.data)
|
||||
|
||||
world_size = torch.distributed.get_world_size()
|
||||
config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
|
||||
config_dict[world_size]['chunk_size'] = 5000
|
||||
config_dict[world_size]['keep_gathered'] = False
|
||||
if placement_policy != 'cuda':
|
||||
init_device = torch.device('cpu')
|
||||
else:
|
||||
init_device = None
|
||||
chunk_manager = ChunkManager(config_dict, init_device=init_device)
|
||||
gemini_manager = GeminiManager(placement_policy, chunk_manager)
|
||||
model = ZeroDDP(model, gemini_manager, pin_memory=True)
|
||||
|
||||
optimizer = HybridAdam(model.parameters(), lr=1e-3)
|
||||
zero_optim = ZeroOptimizer(optimizer, model, initial_scale=128)
|
||||
|
||||
model.eval()
|
||||
torch_model.eval()
|
||||
|
||||
set_seed(dist.get_rank() * 3 + 128)
|
||||
for i, (input_ids, label) in enumerate(train_dataloader):
|
||||
if i > 2:
|
||||
break
|
||||
input_ids, label = input_ids.cuda(), label.cuda()
|
||||
zero_optim.zero_grad()
|
||||
torch_optim.zero_grad()
|
||||
|
||||
torch_loss = run_fwd_bwd(torch_model, input_ids, label, criterion, torch_optim)
|
||||
loss = run_fwd_bwd(model, input_ids, label, criterion, zero_optim)
|
||||
assert_close(torch_loss, loss)
|
||||
|
||||
zero_optim.step()
|
||||
torch_optim.step()
|
||||
|
||||
check_param(model, torch_model)
|
||||
|
||||
|
||||
@parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const'])
|
||||
@parameterize('model_name', EXAMPLE_MODELS)
|
||||
def exam_tiny_example(placement_policy, model_name: str):
|
||||
set_seed(2008)
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
torch_model = model_builder().cuda()
|
||||
amp_config = dict(opt_level='O2', keep_batchnorm_fp32=False, loss_scale=2)
|
||||
torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1e-3)
|
||||
torch_model, torch_optim = convert_to_apex_amp(torch_model, torch_optim, amp_config)
|
||||
torch_model = DDP(torch_model, device_ids=[dist.get_rank()])
|
||||
|
||||
init_dev = get_current_device()
|
||||
with ColoInitContext(device=init_dev):
|
||||
model = model_builder()
|
||||
|
||||
for torch_p, p in zip(torch_model.parameters(), model.parameters()):
|
||||
p.data.copy_(torch_p.data)
|
||||
|
||||
chunk_manager = init_chunk_manager(model=model, init_device=get_current_device(), search_range_mb=1)
|
||||
gemini_manager = GeminiManager(placement_policy, chunk_manager)
|
||||
model = ZeroDDP(model, gemini_manager, pin_memory=True)
|
||||
optimizer = HybridAdam(model.parameters(), lr=1e-3)
|
||||
zero_optim = ZeroOptimizer(optimizer, model, initial_scale=2)
|
||||
|
||||
model.eval()
|
||||
torch_model.eval()
|
||||
|
||||
set_seed(dist.get_rank() * 3 + 128)
|
||||
for i, (input_ids, label) in enumerate(train_dataloader):
|
||||
if i > 2:
|
||||
break
|
||||
|
||||
input_ids = input_ids.cuda()
|
||||
label = label.cuda()
|
||||
|
||||
zero_optim.zero_grad()
|
||||
torch_optim.zero_grad()
|
||||
|
||||
torch_loss = run_fwd_bwd(torch_model, input_ids, label, criterion, torch_optim)
|
||||
loss = run_fwd_bwd(model, input_ids, label, criterion, zero_optim)
|
||||
assert_close(torch_loss, loss, rtol=1.5e-6, atol=2e-5) # atol should be 2e-5 for torch lower than 1.12
|
||||
|
||||
zero_optim.step()
|
||||
torch_optim.step()
|
||||
|
||||
check_param(model, torch_model)
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
config = {}
|
||||
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
exam_model_step()
|
||||
exam_tiny_example()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [1, 4])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_optim(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_optim(1)
|
52
tests/test_zero/test_gemini/test_runtime_mem_tracer.py
Normal file
52
tests/test_zero/test_gemini/test_runtime_mem_tracer.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from copy import deepcopy
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from colossalai.zero import ColoInitContext
|
||||
from colossalai.zero.gemini.memory_tracer.runtime_mem_tracer import RuntimeMemTracer
|
||||
from tests.components_to_test import run_fwd_bwd
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
|
||||
|
||||
def test_runtime_mem_tracer():
|
||||
test_models = ['gpt2', 'bert', 'simple_net', 'repeated_computed_layers', 'nested_model', 'albert']
|
||||
|
||||
for model_name in test_models:
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, _, _, criterion = get_components_func()
|
||||
|
||||
with ColoInitContext(device='cpu'):
|
||||
model = model_builder(checkpoint=False)
|
||||
|
||||
model_bk = deepcopy(model)
|
||||
runtime_mem_tracer = RuntimeMemTracer(model)
|
||||
|
||||
for i, (data, label) in enumerate(train_dataloader):
|
||||
if i > 1:
|
||||
break
|
||||
data = data.cuda()
|
||||
label = label.cuda()
|
||||
|
||||
run_fwd_bwd(runtime_mem_tracer, data, label, criterion, optimizer=runtime_mem_tracer)
|
||||
|
||||
for p1, p2 in zip(model_bk.parameters(), model.parameters()):
|
||||
torch.allclose(p1.to(torch.half), p2)
|
||||
|
||||
non_model_data_list = runtime_mem_tracer._memstats.non_model_data_list('cuda')
|
||||
cuda_non_model_data_list = np.array(non_model_data_list) / 1024**2
|
||||
print("cuda_non_model_data_list", len(cuda_non_model_data_list))
|
||||
print(non_model_data_list)
|
||||
|
||||
cnt1 = 0
|
||||
for p in runtime_mem_tracer.parameters_in_runtime_order():
|
||||
cnt1 += 1
|
||||
cnt2 = 0
|
||||
for p in model.parameters():
|
||||
cnt2 += 1
|
||||
assert cnt2 == cnt1, f'visited param number {cnt1} vs real param number {cnt2}'
|
||||
del model
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_runtime_mem_tracer()
|
123
tests/test_zero/test_gemini/test_search.py
Normal file
123
tests/test_zero/test_gemini/test_search.py
Normal file
@@ -0,0 +1,123 @@
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
import colossalai
|
||||
from colossalai.tensor import ComputePattern, ComputeSpec, ProcessGroup, ShardSpec
|
||||
from colossalai.testing import rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port, get_current_device
|
||||
from colossalai.zero import ColoInitContext
|
||||
from colossalai.zero.gemini.chunk import init_chunk_manager, search_chunk_configuration
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
|
||||
|
||||
def init_1d_row_spec(model, pg: ProcessGroup):
|
||||
tensor_spec = (ShardSpec([0], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
|
||||
for n, p in model.named_parameters():
|
||||
if 'weight' in n and 'ln' not in n:
|
||||
p.set_process_group(pg)
|
||||
p.set_tensor_spec(*tensor_spec)
|
||||
|
||||
|
||||
def exam_search_chunk_size():
|
||||
world_size = torch.distributed.get_world_size()
|
||||
pg_tp = ProcessGroup(tp_degree=world_size)
|
||||
|
||||
get_components_func = non_distributed_component_funcs.get_callable('gpt2')
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
# make sure torch_model and model has the same parameter values
|
||||
with ColoInitContext(device=get_current_device()):
|
||||
model = model_builder()
|
||||
init_1d_row_spec(model, pg_tp)
|
||||
config_dict, *_ = search_chunk_configuration(model,
|
||||
search_range_mb=1,
|
||||
search_interval_byte=16,
|
||||
min_chunk_size_mb=0,
|
||||
filter_exlarge_params=True)
|
||||
|
||||
for key in config_dict:
|
||||
chunk_size = config_dict[key]['chunk_size']
|
||||
if world_size == 1:
|
||||
assert chunk_size == 31616
|
||||
else:
|
||||
assert chunk_size == 1024
|
||||
|
||||
|
||||
def exam_search_strict_ddp():
|
||||
world_size = torch.distributed.get_world_size()
|
||||
default_shard_pg = ProcessGroup(tp_degree=world_size)
|
||||
default_shard_spec = ShardSpec([-1], [world_size])
|
||||
|
||||
get_components_func = non_distributed_component_funcs.get_callable('gpt2')
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
# get the chunk configuration over replicated models
|
||||
with ColoInitContext(device=get_current_device()):
|
||||
ddp_model = model_builder()
|
||||
re_dict, re_total, re_wasted = search_chunk_configuration(ddp_model,
|
||||
search_range_mb=1,
|
||||
search_interval_byte=16,
|
||||
min_chunk_size_mb=0,
|
||||
filter_exlarge_params=True,
|
||||
strict_ddp_flag=False)
|
||||
# get the chunk configuration over sharded ddp models
|
||||
with ColoInitContext(device=get_current_device(), default_pg=default_shard_pg,
|
||||
default_dist_spec=default_shard_spec):
|
||||
sharded_ddp_model = model_builder()
|
||||
sh_dict, sh_total, sh_wasted = search_chunk_configuration(sharded_ddp_model,
|
||||
search_range_mb=1,
|
||||
search_interval_byte=16,
|
||||
min_chunk_size_mb=0,
|
||||
filter_exlarge_params=True,
|
||||
strict_ddp_flag=True)
|
||||
assert re_dict == sh_dict
|
||||
for key in re_dict:
|
||||
assert re_dict[key] == sh_dict[key]
|
||||
|
||||
assert re_total == sh_total
|
||||
assert re_wasted == sh_wasted
|
||||
|
||||
|
||||
def exam_chunk_manager():
|
||||
world_size = torch.distributed.get_world_size()
|
||||
default_shard_pg = ProcessGroup(tp_degree=world_size)
|
||||
default_shard_spec = ShardSpec([-1], [world_size])
|
||||
|
||||
get_components_func = non_distributed_component_funcs.get_callable('gpt2')
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
with ColoInitContext(device=get_current_device(), default_pg=default_shard_pg,
|
||||
default_dist_spec=default_shard_spec):
|
||||
sharded_ddp_model = model_builder()
|
||||
chunk_manager = init_chunk_manager(sharded_ddp_model,
|
||||
get_current_device(),
|
||||
hidden_dim=16,
|
||||
search_range_mb=1,
|
||||
min_chunk_size_mb=0,
|
||||
filter_exlarge_params=True,
|
||||
strict_ddp_flag=True)
|
||||
config_dict = chunk_manager.dp_degree_chunk_size_dict
|
||||
assert len(config_dict) == 1
|
||||
assert config_dict[world_size] == 31616
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
exam_search_chunk_size()
|
||||
exam_search_strict_ddp()
|
||||
exam_chunk_manager()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [1, 4])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_search(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_search(4)
|
113
tests/test_zero/test_gemini/test_zeroddp_state_dict.py
Normal file
113
tests/test_zero/test_gemini/test_zeroddp_state_dict.py
Normal file
@@ -0,0 +1,113 @@
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
from torch.testing import assert_close
|
||||
|
||||
import colossalai
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.utils.cuda import get_current_device
|
||||
from colossalai.zero import ColoInitContext, ZeroDDP
|
||||
from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration
|
||||
from colossalai.zero.gemini.gemini_mgr import GeminiManager
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
from tests.test_tensor.common_utils import debug_print, set_seed
|
||||
|
||||
|
||||
def ignore_the_first_parameter(model: torch.nn.Module):
|
||||
for name, param in model.named_parameters():
|
||||
print(f"parameter `{name}` is set ignored")
|
||||
ZeroDDP.set_params_to_ignore([param])
|
||||
return
|
||||
|
||||
|
||||
@parameterize('placement_policy', ['cuda', 'cpu', 'auto'])
|
||||
@parameterize('keep_gathered', [True, False])
|
||||
@parameterize('model_name', ['gpt2', 'bert'])
|
||||
def exam_state_dict(placement_policy, keep_gathered, model_name: str):
|
||||
set_seed(431)
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
with ColoInitContext(device=get_current_device()):
|
||||
model = model_builder()
|
||||
|
||||
torch_model = model_builder()
|
||||
for torch_p, p in zip(torch_model.parameters(), model.parameters()):
|
||||
torch_p.data.copy_(p.data)
|
||||
|
||||
world_size = torch.distributed.get_world_size()
|
||||
config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
|
||||
config_dict[world_size]['chunk_size'] = 5000
|
||||
config_dict[world_size]['keep_gathered'] = keep_gathered
|
||||
chunk_manager = ChunkManager(config_dict)
|
||||
gemini_manager = GeminiManager(placement_policy, chunk_manager)
|
||||
model = ZeroDDP(model, gemini_manager, pin_memory=True)
|
||||
model.train()
|
||||
|
||||
zero_dict = model.state_dict(only_rank_0=False)
|
||||
torch_dict = torch_model.state_dict()
|
||||
|
||||
for key, value in torch_dict.items():
|
||||
assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
|
||||
temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
|
||||
assert_close(value, temp_zero_value, rtol=1e-3, atol=1e-5)
|
||||
|
||||
|
||||
@parameterize('placement_policy', ['cuda', 'cpu', 'auto'])
|
||||
@parameterize('keep_gathered', [True, False])
|
||||
@parameterize('model_name', ['gpt2', 'bert'])
|
||||
def exam_load_state_dict(placement_policy, keep_gathered, model_name: str):
|
||||
set_seed(431)
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
with ColoInitContext(device=get_current_device()):
|
||||
model = model_builder()
|
||||
|
||||
set_seed(451)
|
||||
torch_model = model_builder() # get a different model
|
||||
|
||||
world_size = torch.distributed.get_world_size()
|
||||
config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
|
||||
config_dict[world_size]['chunk_size'] = 5000
|
||||
config_dict[world_size]['keep_gathered'] = keep_gathered
|
||||
|
||||
if placement_policy != 'cuda':
|
||||
init_device = torch.device('cpu')
|
||||
else:
|
||||
init_device = None
|
||||
chunk_manager = ChunkManager(config_dict, init_device=init_device)
|
||||
gemini_manager = GeminiManager(placement_policy, chunk_manager)
|
||||
model = ZeroDDP(model, gemini_manager, pin_memory=True)
|
||||
|
||||
torch_dict = torch_model.state_dict()
|
||||
model.load_state_dict(torch_dict, strict=False)
|
||||
zero_dict = model.state_dict(only_rank_0=False)
|
||||
|
||||
for key, value in torch_dict.items():
|
||||
assert key in zero_dict, "{} not in ZeRO dictionary.".format(key)
|
||||
temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype)
|
||||
assert_close(value, temp_zero_value, rtol=1e-3, atol=1e-5)
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
config = {}
|
||||
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
exam_state_dict()
|
||||
exam_load_state_dict()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [1, 4])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_zero_ddp(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_zero_ddp(1)
|
91
tests/test_zero/test_gemini/test_zerooptim_state_dict.py
Normal file
91
tests/test_zero/test_gemini/test_zerooptim_state_dict.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
import colossalai
|
||||
from colossalai.nn.optimizer import HybridAdam
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.utils.cuda import get_current_device
|
||||
from colossalai.zero import ColoInitContext, ZeroDDP, ZeroOptimizer
|
||||
from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration
|
||||
from colossalai.zero.gemini.gemini_mgr import GeminiManager
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
from tests.test_tensor.common_utils import debug_print, set_seed
|
||||
|
||||
|
||||
@parameterize('placement_policy', ['cuda', 'cpu', 'auto'])
|
||||
@parameterize('keep_gathered', [True, False])
|
||||
def exam_zero_optim_state_dict(placement_policy, keep_gathered):
|
||||
set_seed(431)
|
||||
get_components_func = non_distributed_component_funcs.get_callable('gpt2')
|
||||
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
|
||||
|
||||
with ColoInitContext(device=get_current_device()):
|
||||
model = model_builder()
|
||||
|
||||
set_seed(451)
|
||||
torch_model = model_builder() # get a different model
|
||||
|
||||
world_size = torch.distributed.get_world_size()
|
||||
config_dict, *_ = search_chunk_configuration(model, search_range_mb=1, search_interval_byte=100)
|
||||
config_dict[world_size]['chunk_size'] = 5000
|
||||
config_dict[world_size]['keep_gathered'] = keep_gathered
|
||||
|
||||
if placement_policy != 'cuda':
|
||||
init_device = torch.device('cpu')
|
||||
else:
|
||||
init_device = None
|
||||
chunk_manager = ChunkManager(config_dict, init_device=init_device)
|
||||
gemini_manager = GeminiManager(placement_policy, chunk_manager)
|
||||
model = ZeroDDP(model, gemini_manager, pin_memory=True)
|
||||
|
||||
optimizer = HybridAdam(model.parameters())
|
||||
optim = ZeroOptimizer(optimizer, model, initial_scale=32) # initialize the link between chunk16 and chunk32
|
||||
|
||||
set_seed(dist.get_rank() * 3 + 128)
|
||||
model.train()
|
||||
for i, (input_ids, label) in enumerate(train_dataloader):
|
||||
if i > 0:
|
||||
break
|
||||
optim.zero_grad()
|
||||
logits = model(input_ids)
|
||||
logits = logits.float()
|
||||
loss = criterion(logits, input_ids)
|
||||
optim.backward(loss)
|
||||
optim.step()
|
||||
|
||||
optim_state_dict = optim.state_dict()
|
||||
optim.load_state_dict(optim_state_dict)
|
||||
new_state = optim.state_dict()['state']
|
||||
org_state = optim_state_dict['state']
|
||||
|
||||
for k, v in org_state.items():
|
||||
w = new_state[k]
|
||||
for n, m in v.items():
|
||||
if isinstance(m, torch.Tensor):
|
||||
o = w[n]
|
||||
assert torch.equal(m, o)
|
||||
else:
|
||||
assert m == w[n]
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
config = {}
|
||||
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||
exam_zero_optim_state_dict()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@pytest.mark.parametrize('world_size', [1, 4])
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_zero_optim(world_size):
|
||||
run_func = partial(run_dist, world_size=world_size, port=free_port())
|
||||
mp.spawn(run_func, nprocs=world_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_zero_optim(1)
|
Reference in New Issue
Block a user