mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-21 01:24:04 +00:00
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
This commit is contained in:
@@ -6,25 +6,26 @@ from colossalai.legacy.core import global_context as gpc
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
|
||||
CONFIG = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=1, mode=None)),
|
||||
fp16=dict(mode=None),
|
||||
clip_grad_norm=1.0)
|
||||
CONFIG = dict(
|
||||
parallel=dict(pipeline=dict(size=1), tensor=dict(size=1, mode=None)), fp16=dict(mode=None), clip_grad_norm=1.0
|
||||
)
|
||||
|
||||
|
||||
@parameterize('model_name', ['repeated_computed_layers', 'resnet18', 'repeated_computed_layers'])
|
||||
@parameterize('amp_mode', [AMP_TYPE.APEX, AMP_TYPE.TORCH, AMP_TYPE.NAIVE, None])
|
||||
@parameterize("model_name", ["repeated_computed_layers", "resnet18", "repeated_computed_layers"])
|
||||
@parameterize("amp_mode", [AMP_TYPE.APEX, AMP_TYPE.TORCH, AMP_TYPE.NAIVE, None])
|
||||
def run_train(model_name, amp_mode):
|
||||
# FIXME: test bert
|
||||
get_components_func = non_distributed_component_funcs.get_callable(model_name)
|
||||
gpc.config.fp16['mode'] = amp_mode
|
||||
gpc.config.fp16["mode"] = amp_mode
|
||||
model_builder, train_dataloader, _, optimizer_class, criterion = get_components_func()
|
||||
|
||||
model = model_builder(checkpoint=False)
|
||||
engine, train_dataloader, *args = colossalai.legacy.initialize(model=model,
|
||||
optimizer=optimizer_class(model.parameters(),
|
||||
lr=1e-3),
|
||||
criterion=criterion,
|
||||
train_dataloader=train_dataloader)
|
||||
engine, train_dataloader, *args = colossalai.legacy.initialize(
|
||||
model=model,
|
||||
optimizer=optimizer_class(model.parameters(), lr=1e-3),
|
||||
criterion=criterion,
|
||||
train_dataloader=train_dataloader,
|
||||
)
|
||||
|
||||
try:
|
||||
engine.train()
|
||||
@@ -49,12 +50,9 @@ def run_train(model_name, amp_mode):
|
||||
|
||||
def run_engine(rank, world_size, port):
|
||||
# init dist env
|
||||
colossalai.legacy.launch(config=CONFIG,
|
||||
rank=rank,
|
||||
world_size=world_size,
|
||||
host='localhost',
|
||||
port=port,
|
||||
backend='nccl')
|
||||
colossalai.legacy.launch(
|
||||
config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl"
|
||||
)
|
||||
run_train()
|
||||
|
||||
|
||||
@@ -64,5 +62,5 @@ def test_engine():
|
||||
spawn(run_engine, 2)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
test_engine()
|
||||
|
@@ -19,46 +19,40 @@ from colossalai.testing import rerun_if_address_is_in_use, spawn
|
||||
BATCH_SIZE = 2
|
||||
NUM_CLASSES = 10
|
||||
|
||||
CONFIG = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=1, mode=None)),
|
||||
clip_grad_norm=1.0,
|
||||
gradient_accumulation=4)
|
||||
CONFIG = dict(
|
||||
parallel=dict(pipeline=dict(size=1), tensor=dict(size=1, mode=None)), clip_grad_norm=1.0, gradient_accumulation=4
|
||||
)
|
||||
|
||||
|
||||
def run_no_pipeline(rank, world_size, port):
|
||||
|
||||
# init dist env
|
||||
colossalai.legacy.launch(config=CONFIG,
|
||||
rank=rank,
|
||||
world_size=world_size,
|
||||
host='localhost',
|
||||
port=port,
|
||||
backend='nccl')
|
||||
colossalai.legacy.launch(
|
||||
config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl"
|
||||
)
|
||||
|
||||
# build model
|
||||
model = resnet18(num_classes=10)
|
||||
|
||||
# build dataloaders
|
||||
train_dataset = CIFAR10(root=Path(os.environ['DATA']),
|
||||
download=True,
|
||||
transform=transforms.Compose([
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
|
||||
]))
|
||||
train_dataloader = get_dataloader(dataset=train_dataset,
|
||||
shuffle=True,
|
||||
batch_size=BATCH_SIZE,
|
||||
pin_memory=True,
|
||||
drop_last=True)
|
||||
train_dataset = CIFAR10(
|
||||
root=Path(os.environ["DATA"]),
|
||||
download=True,
|
||||
transform=transforms.Compose(
|
||||
[transforms.ToTensor(), transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))]
|
||||
),
|
||||
)
|
||||
train_dataloader = get_dataloader(
|
||||
dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, pin_memory=True, drop_last=True
|
||||
)
|
||||
|
||||
# build optimizer
|
||||
optimizer = Adam(model.parameters(), lr=0.001)
|
||||
criterion = nn.CrossEntropyLoss()
|
||||
|
||||
engine, train_dataloader, *args = colossalai.legacy.initialize(model=model,
|
||||
optimizer=optimizer,
|
||||
criterion=criterion,
|
||||
train_dataloader=train_dataloader)
|
||||
logger = get_dist_logger()
|
||||
engine, train_dataloader, *args = colossalai.legacy.initialize(
|
||||
model=model, optimizer=optimizer, criterion=criterion, train_dataloader=train_dataloader
|
||||
)
|
||||
get_dist_logger()
|
||||
rank = torch.distributed.get_rank()
|
||||
param_track = []
|
||||
grad_track = []
|
||||
@@ -79,12 +73,13 @@ def run_no_pipeline(rank, world_size, port):
|
||||
param_track.append(next(model.parameters())[0].clone())
|
||||
grad_track.append(next(model.parameters()).grad[0].clone())
|
||||
step += 1
|
||||
if step == CONFIG['gradient_accumulation']:
|
||||
if step == CONFIG["gradient_accumulation"]:
|
||||
break
|
||||
|
||||
assert not torch.all(grad_track[0] == grad_track[-1]), 'grad should be different in different iterations'
|
||||
assert torch.all(param_track[0] == param_track[1]) and not torch.all(param_track[0] == param_track[-1]), \
|
||||
'param should be the same in the first few iterations and only changed in the last iteration'
|
||||
assert not torch.all(grad_track[0] == grad_track[-1]), "grad should be different in different iterations"
|
||||
assert torch.all(param_track[0] == param_track[1]) and not torch.all(
|
||||
param_track[0] == param_track[-1]
|
||||
), "param should be the same in the first few iterations and only changed in the last iteration"
|
||||
|
||||
gpc.destroy()
|
||||
torch.cuda.empty_cache()
|
||||
@@ -96,5 +91,5 @@ def test_engine():
|
||||
spawn(run_no_pipeline, 4)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
test_engine()
|
||||
|
Reference in New Issue
Block a user