[misc] resolve code factor issues (#4433)

This commit is contained in:
Hongxin Liu
2023-08-14 17:43:33 +08:00
parent 328a791d10
commit 172f7fa3cf
20 changed files with 31 additions and 205 deletions

View File

@@ -15,7 +15,7 @@ def test_gpt():
     for name, (model_fn, data_gen_fn, _, _, _) in sub_registry.items():
         model = model_fn()
-        # TODO: support the following models
+        # TODO(ver217): support the following models
         # 1. GPT2DoubleHeadsModel
         # as they are not supported, let's skip them
         if model.__class__.__name__ in ['GPT2DoubleHeadsModel', 'GPT2ForQuestionAnswering']:

View File

@@ -1,171 +0,0 @@
import copy
import random
from typing import Any, Callable, Iterator, List, Optional, Tuple

import numpy as np
import pytest
import torch
import torch.distributed as dist
from torch.nn import Module
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.schedule import OneForwardOneBackwardSchedule
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer import ShardConfig, ShardFormer
from colossalai.testing import (
    assert_hf_output_close,
    clear_cache_before_run,
    parameterize,
    rerun_if_address_is_in_use,
    spawn,
)

from tests.kit.model_zoo import model_zoo
from tests.test_shardformer.test_model._utils import build_model, build_pipeline_model, run_forward

DP_AXIS, PP_AXIS, TP_AXIS = 0, 1, 2
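
# Restrict each param group to the parameters actually held by the (sharded)
# model, so the optimizer never steps parameters owned by other pipeline stages.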
class PipelineOptimizer(OptimizerWrapper):

    def __init__(self, optim: Optimizer, model: Module):
        super().__init__(optim)
        # parameters that remain on this pipeline stage after sharding
        model_params = set(model.parameters())
        new_param_groups = []
        for group in optim.param_groups:
            params = [p for p in group['params'] if p in model_params]
            new_param_groups.append({**group, 'params': params})
        optim.__setstate__({'param_groups': new_param_groups})
        # TODO: support amp
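
# Runs ShardFormer over the module and keeps track of parameters shared across
# pipeline stages; the shared-param process groups are left unpopulated here.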
class PipelinedModel(ModelWrapper):

    def __init__(self, module: Module, shard_config: ShardConfig, stage_manager: PipelineStageManager) -> None:
        self.stage_manager = stage_manager
        shardformer = ShardFormer(shard_config)
        module, self.shared_params = shardformer.optimize(module)
        self.shared_param_process_groups = []
        super().__init__(module)

def prepare_dataloader(dataset, batch_size, shuffle=False, seed=1024, drop_last=False, pin_memory=False, num_workers=0):
    sampler = DistributedSampler(
        dataset,
        # rank=self.pg_mesh.coordinate(DP_AXIS),
        shuffle=shuffle)

    # Deterministic dataloader
    def seed_worker(worker_id):
        worker_seed = seed
        np.random.seed(worker_seed)
        torch.manual_seed(worker_seed)
        random.seed(worker_seed)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        worker_init_fn=seed_worker,
        drop_last=drop_last,
        pin_memory=pin_memory,
        num_workers=num_workers,
    )
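
# A single optimization step driven by the 1F1B schedule: forward and backward
# passes are interleaved over micro-batches, and loss/outputs are returned on demand.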
def execute_pipeline(
    data_iter: Iterator,
    model: PipelinedModel,
    criterion: Callable[[Any, Any], torch.Tensor],
    optimizer: PipelineOptimizer,
    return_loss: bool = True,
    return_outputs: bool = False,
    schedule: OneForwardOneBackwardSchedule = None,
) -> dict:
    # return loss or outputs if needed
    outputs = schedule.forward_backward_step(model, optimizer, data_iter, criterion, return_loss, return_outputs)
    return outputs
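
# Dummy data source: with only __getitem__ defined, iter() falls back to the
# sequence protocol, yielding the same constant token batch for every index.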
class data_loader:

    def __getitem__(self, x):
        return torch.ones((4, 128), dtype=torch.int).cuda() * 10


def loss(y, x):
    return (y[0].float().mean() - x[0].float().mean())
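
# Reference run on an unsharded copy vs. the pipelined run: only the last
# pipeline stage reports a loss, and it must match the single-process result.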
@parameterize('enable_fused_normalization', [False])
@parameterize('enable_tensor_parallelism', [False])
@parameterize('use_lazy_init', [False])
def run_llama_test(enable_fused_normalization, enable_tensor_parallelism, use_lazy_init):
    PP_DIM = 0
    PP_SIZE = 2
    RANK_TO_COORDINATE = {
        0: (0, 0),
        1: (0, 1),
        2: (1, 0),
        3: (1, 1),
    }
    PP_RANKS_IN_GROUP = {
        0: [0, 1],
        1: [0, 1],
        2: [2, 3],
        3: [2, 3],
    }
    pg_mesh = ProcessGroupMesh(PP_SIZE)
    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)

    sub_model_zoo = model_zoo.get_sub_registry('transformers_llama')
    for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
        if name != 'transformers_llama':
            continue
        num_microbatches = 2
        org_model = model_fn().cuda()
        data_iter = iter(data_loader())

        model_copy = copy.deepcopy(org_model)
        batch = next(data_iter)
        with torch.no_grad():
            y = model_copy(batch)
            org_loss = loss(y, batch)

        optimizer = torch.optim.AdamW(org_model.parameters(), lr=1e-3)
        schedule = OneForwardOneBackwardSchedule(num_microbatches, stage_manager)
        shard_config = ShardConfig(enable_fused_normalization=enable_fused_normalization,
                                   enable_tensor_parallelism=enable_tensor_parallelism,
                                   pipeline_stage_manager=stage_manager)
        pipelined_model = PipelinedModel(org_model, shard_config, stage_manager)
        pp_optimizer = PipelineOptimizer(optimizer, pipelined_model)
        results = execute_pipeline(data_iter, pipelined_model, loss, pp_optimizer, schedule=schedule)

        if stage_manager.is_last_stage():
            assert results['loss'] == org_loss
        else:
            assert results['loss'] is None
        assert results['outputs'] is None
    torch.cuda.empty_cache()
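
# Distributed entry points: two ranks are spawned, matching PP_SIZE above.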
def check_llama(rank, world_size, port):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    run_llama_test()


@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama():
    spawn(check_llama, 2)


if __name__ == "__main__":
    test_llama()

View File

@@ -101,7 +101,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 }])
 def run_bloom_test(test_config):
-    # TODO: add test_config for TP+DP after supporting & debugging it
+    # TODO(baizhou): add test_config for TP+DP after supporting & debugging it
     sub_model_zoo = model_zoo.get_sub_registry('transformers_bloom')

View File

@@ -125,7 +125,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 }])
 def run_chatglm_test(test_config):
-    # TODO: add test_config for TP+DP after supporting & debugging it
+    # TODO(baizhou): add test_config for TP+DP after supporting & debugging it
     sub_model_zoo = model_zoo.get_sub_registry('transformers_chatglm')

View File

@@ -110,7 +110,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @clear_cache_before_run()
 def run_gpt2_test(test_config):
-    # TODO: add test_config for TP+DP after supporting & debugging it
+    # TODO(baizhou): add test_config for TP+DP after supporting & debugging it
     sub_model_zoo = model_zoo.get_sub_registry('transformers_gpt')

View File

@@ -133,7 +133,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 }])
 def run_llama_test(test_config):
-    # TODO: add test_config for TP+DP after supporting & debugging it
+    # TODO(baizhou): add test_config for TP+DP after supporting & debugging it
     sub_model_zoo = model_zoo.get_sub_registry('transformers_llama')

View File

@@ -127,7 +127,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 }])
 def run_opt_test(test_config):
-    # TODO: add test_config for TP+DP after supporting & debugging it
+    # TODO(baizhou): add test_config for TP+DP after supporting & debugging it
     sub_model_zoo = model_zoo.get_sub_registry('transformers_opt')

View File

@@ -105,10 +105,10 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @clear_cache_before_run()
 def run_t5_test(test_config):
-    # TODO: add plugin_config for TP+DP after supporting & debugging it
+    # TODO(baizhou): add plugin_config for TP+DP after supporting & debugging it
     # {'tp_size': 2, 'pp_size': 1, 'enable_fused_normalization': True}
-    # TODO: add test_config for flash attention & jit operator after supporting
+    # TODO(baizhou): add test_config for flash attention & jit operator after supporting
     sub_model_zoo = model_zoo.get_sub_registry('transformers_t5')

View File

@@ -124,8 +124,8 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 }])
 def run_vit_test(test_config):
-    # TODO: add test_config for TP+DP after supporting & debugging it
-    # TODO: fix bug when settign lazy_init for Conv2D Layers in ViT models
+    # TODO(baizhou): add test_config for TP+DP after supporting & debugging it
+    # TODO(baizhou): fix bug when setting lazy_init for Conv2D Layers in ViT models
     sub_model_zoo = model_zoo.get_sub_registry('transformers_vit')