Migrated project
tests/test_trainer/configs/test_trainer_resnet.py (new file, 94 lines)
@@ -0,0 +1,94 @@
import os
from pathlib import Path

BATCH_SIZE = 128
IMG_SIZE = 32

# ResNet-50
model = dict(
    type='VanillaResNet',
    block_type='ResNetBottleneck',
    layers=[3, 4, 6, 3],
    num_cls=10
)

train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

optimizer = dict(
    type='SGD',
    lr=0.2,
    momentum=0.9,
    weight_decay=5e-4
)

loss = dict(
    type='CrossEntropyLoss',
)

parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=1, mode=None),
)

hooks = [
    dict(type='LogMetricByEpochHook'),
    dict(type='AccuracyHook'),
    dict(type='LossHook'),
    dict(type='TensorboardHook', log_dir='./tfb_logs'),
    dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]

# fp16 = dict(
#     mode=AMP_TYPE.PARALLEL,
#     initial_scale=1
# )

lr_scheduler = dict(
    type='CosineAnnealingLR',
    T_max=200
)

num_epochs = 200
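Because the config is just a Python module of dicts, it can be inspected outside the trainer with the standard library alone; a minimal sketch follows (the DATA fallback and the relative path are illustrative assumptions, and colossalai itself consumes the file through colossalai.initialize()):

import os
import runpy

# The config reads Path(os.environ['DATA']) at import time, so DATA must be
# set before loading it; the fallback path here is purely illustrative.
os.environ.setdefault('DATA', '/tmp/cifar10')

cfg = runpy.run_path('tests/test_trainer/configs/test_trainer_resnet.py')
print(cfg['model']['type'])   # 'VanillaResNet'
print(cfg['optimizer'])       # SGD, lr=0.2, momentum=0.9, weight_decay=5e-4
print(cfg['num_epochs'])      # 200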
tests/test_trainer/configs/test_trainer_vit_2d.py (new file, 135 lines)
@@ -0,0 +1,135 @@
import os
from pathlib import Path

from colossalai.engine import AMP_TYPE

BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 8
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6

train_data = dict(
    dataset=dict(type='CIFAR10Dataset',
                 root=Path(os.environ['DATA']),
                 transform_pipeline=[
                     dict(type='Resize', size=IMG_SIZE),
                     dict(type='RandomCrop', size=IMG_SIZE, padding=4),
                     dict(type='RandomHorizontalFlip'),
                     dict(type='ToTensor'),
                     dict(type='Normalize',
                          mean=[0.4914, 0.4822, 0.4465],
                          std=[0.2023, 0.1994, 0.2010]),
                 ]),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        # num_workers=1,
        shuffle=True,
    ))

test_data = dict(
    dataset=dict(type='CIFAR10Dataset',
                 root=Path(os.environ['DATA']),
                 train=False,
                 transform_pipeline=[
                     dict(type='Resize', size=IMG_SIZE),
                     dict(type='ToTensor'),
                     dict(type='Normalize',
                          mean=[0.4914, 0.4822, 0.4465],
                          std=[0.2023, 0.1994, 0.2010]),
                 ]),
    dataloader=dict(
        batch_size=400,
        pin_memory=True,
        # num_workers=1,
    ))

optimizer = dict(type='Adam', lr=0.001, weight_decay=0)

loss = dict(type='CrossEntropyLoss2D', )

# model = dict(
#     type='VanillaResNet',
#     block_type='ResNetBasicBlock',
#     layers=[2, 2, 2, 2],
#     num_cls=10
# )

model = dict(
    type='VisionTransformerFromConfig',
    tensor_splitting_cfg=dict(type='ViTInputSplitter2D', ),
    embedding_cfg=dict(
        type='ViTPatchEmbedding2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
    ),
    token_fusion_cfg=dict(type='ViTTokenFuser2D',
                          img_size=IMG_SIZE,
                          patch_size=PATCH_SIZE,
                          embed_dim=DIM,
                          drop_rate=0.1),
    norm_cfg=dict(
        type='LayerNorm2D',
        normalized_shape=DIM,
        eps=1e-6,
    ),
    block_cfg=dict(
        type='ViTBlock',
        attention_cfg=dict(
            type='ViTSelfAttention2D',
            hidden_size=DIM,
            num_attention_heads=NUM_ATTENTION_HEADS,
            attention_dropout_prob=0.,
            hidden_dropout_prob=0.1,
        ),
        droppath_cfg=dict(type='VanillaViTDropPath', ),
        mlp_cfg=dict(type='ViTMLP2D',
                     in_features=DIM,
                     dropout_prob=0.1,
                     mlp_ratio=1),
        norm_cfg=dict(
            type='LayerNorm2D',
            normalized_shape=DIM,
            eps=1e-6,
        ),
    ),
    head_cfg=dict(
        type='ViTHead2D',
        hidden_size=DIM,
        num_classes=NUM_CLASSES,
    ),
    embed_dim=DIM,
    depth=DEPTH,
    drop_path_rate=0.,
)

hooks = [
    dict(type='LogMetricByEpochHook'),
    dict(type='LogTimingByEpochHook'),
    dict(type='Accuracy2DHook'),
    dict(type='LossHook'),
    dict(type='TensorboardHook', log_dir='./tfb_logs'),
    dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]

parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)

fp16 = dict(mode=AMP_TYPE.PARALLEL, initial_scale=2 ** 8)

lr_scheduler = dict(type='LinearWarmupLR', warmup_epochs=5)

schedule = dict(num_microbatches=1)

num_epochs = 60
num_microbatches = 1

logging = dict(root_path='./logs')
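One geometry constraint is implicit in this config: with tensor mode='2d', the tensor-parallel group forms a SUMMA_DIM x SUMMA_DIM process grid, so tensor size 4 corresponds to the 2 x 2 grid implied by SUMMA_DIM = 2, and the job needs pipeline size times tensor size processes in total. A small sanity-check sketch (illustrative only, not part of the config):

# Sanity check for the parallel geometry above (illustrative sketch).
SUMMA_DIM = 2
pipeline_size = 1
tensor_size = 4  # must equal SUMMA_DIM ** 2 for mode='2d'

assert tensor_size == SUMMA_DIM ** 2
world_size = pipeline_size * tensor_size
print(f"launch this config with {world_size} processes, one per GPU")  # 4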
tests/test_trainer/test.sh (new file, 5 lines)
@@ -0,0 +1,5 @@
#!/usr/bin/env sh
test_file=$1
config_file=$2

python "$test_file" --local_rank "$SLURM_PROCID" --world_size "$SLURM_NPROCS" --host "$HOST" --port 29500 --config "$config_file"
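The wrapper assumes a SLURM launch: srun fills in SLURM_PROCID and SLURM_NPROCS, while HOST must be exported to the address of the rank-0 node. A typical invocation for the 4-process 2D config above would look like `srun -n 4 sh tests/test_trainer/test.sh tests/test_trainer/test_trainer.py tests/test_trainer/configs/test_trainer_vit_2d.py`, though the exact srun flags depend on the cluster.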
tests/test_trainer/test_trainer.py (new file, 37 lines)
@@ -0,0 +1,37 @@
import colossalai
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.trainer import Trainer


def test_trainer():
    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
    logger = get_global_dist_logger()

    engine = Engine(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        schedule=schedule
    )
    logger.info("engine is built", ranks=[0])

    trainer = Trainer(engine=engine,
                      hooks_cfg=gpc.config.hooks,
                      verbose=True)
    logger.info("trainer is built", ranks=[0])

    logger.info("start training", ranks=[0])
    trainer.fit(
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        max_epochs=gpc.config.num_epochs,
        display_progress=False,
        test_interval=5
    )


if __name__ == '__main__':
    test_trainer()
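Both configs resolve the dataset root from the DATA environment variable at import time, so the run fails as soon as the config is parsed if DATA is unset. A small pre-flight check along these lines (illustrative, not part of the test) makes that failure explicit before launch:

import os
from pathlib import Path

# Both configs call Path(os.environ['DATA']), so verify it up front.
data_root = os.environ.get('DATA')
assert data_root and Path(data_root).is_dir(), \
    'export DATA=/path/to/cifar10 before running test.sh'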