mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2026-05-04 01:48:43 +00:00
Migrated project
This commit is contained in:
140
tests/test_fp16_optimizer/configs/vit_2d.py
Normal file
140
tests/test_fp16_optimizer/configs/vit_2d.py
Normal file
@@ -0,0 +1,140 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from colossalai.engine import AMP_TYPE
|
||||
|
||||
# Hyper-parameters shared by the dataset, model and parallel sections of
# this config.
BATCH_SIZE = 512          # passed as batch_size to both dataloaders
IMG_SIZE = 32             # crop/resize target and img_size of the embedding
PATCH_SIZE = 4            # patch_size of the embedding and token fuser
DIM = 512                 # embed_dim / hidden_size / normalized_shape
NUM_ATTENTION_HEADS = 8   # num_attention_heads of the self-attention layer
# Not referenced elsewhere in this config; presumably the side length of the
# 2D tensor-parallel mesh (2 x 2 = 4 ranks) -- TODO confirm against the
# framework.
SUMMA_DIM = 2
NUM_CLASSES = 10          # num_classes of the classification head
DEPTH = 6                 # depth of the transformer
|
||||
|
||||
# Training split: dataset description plus dataloader options.
# The dataset root is read from the DATA environment variable; a KeyError
# is raised at import time when it is not set.
train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)
|
||||
|
||||
# Evaluation split: same normalisation as the training split, with a
# Resize step and no random augmentation.
# The dataset root is read from the DATA environment variable; a KeyError
# is raised at import time when it is not set.
test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]
                 ),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        # NOTE(review): shuffling the evaluation loader is kept as in the
        # original; it looks unnecessary for evaluation -- confirm it is
        # intentional.
        shuffle=True
    )
)
|
||||
|
||||
# Optimizer description: Adam with no weight decay.
optimizer = dict(
    type='Adam',
    lr=0.001,
    weight_decay=0
)

# Loss description; the type name selects the 2D-parallel cross entropy.
loss = dict(
    type='CrossEntropyLoss2D',
)
|
||||
|
||||
# Vision Transformer description assembled from per-component sub-configs.
# Every component that takes a width uses DIM; the image and patch sizes
# are shared by the patch embedding and the token fuser.
model = dict(
    type='VisionTransformerFromConfig',
    tensor_splitting_cfg=dict(
        type='ViTInputSplitter2D',
    ),
    embedding_cfg=dict(
        type='ViTPatchEmbedding2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
    ),
    token_fusion_cfg=dict(
        type='ViTTokenFuser2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
        drop_rate=0.1
    ),
    norm_cfg=dict(
        type='LayerNorm2D',
        normalized_shape=DIM,
        eps=1e-6,
    ),
    # One transformer block: attention, drop-path, MLP and its own norm.
    block_cfg=dict(
        type='ViTBlock',
        attention_cfg=dict(
            type='ViTSelfAttention2D',
            hidden_size=DIM,
            num_attention_heads=NUM_ATTENTION_HEADS,
            attention_dropout_prob=0.,
            hidden_dropout_prob=0.1,
        ),
        droppath_cfg=dict(
            type='VanillaViTDropPath',
        ),
        mlp_cfg=dict(
            type='ViTMLP2D',
            in_features=DIM,
            dropout_prob=0.1,
            mlp_ratio=1
        ),
        norm_cfg=dict(
            type='LayerNorm2D',
            normalized_shape=DIM,
            eps=1e-6,
        ),
    ),
    head_cfg=dict(
        type='ViTHead2D',
        hidden_size=DIM,
        num_classes=NUM_CLASSES,
    ),
    embed_dim=DIM,
    depth=DEPTH,
    drop_path_rate=0.,
)
|
||||
|
||||
# Parallel layout: a single pipeline stage and 4-way tensor parallelism
# in '2d' mode.
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)
|
||||
|
||||
# Mixed-precision settings: the PARALLEL AMP mode with an initial loss
# scale of 2 ** 4 = 16.
fp16 = dict(
    mode=AMP_TYPE.PARALLEL,
    initial_scale=2 ** 4
)
|
||||
|
||||
# Learning-rate schedule description: linear warmup over 5 epochs.
lr_scheduler = dict(
    type='LinearWarmupLR',
    warmup_epochs=5
)

# Number of training epochs; the test script reads it as
# gpc.config.num_epochs.
num_epochs = 60
|
||||
4
tests/test_fp16_optimizer/test.sh
Normal file
4
tests/test_fp16_optimizer/test.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env sh
# Launch one rank of a distributed test.
#
# Usage: test.sh <test_file>
#
# Reads SLURM_PROCID, SLURM_NPROCS and HOST from the environment and passes
# them to the test script as --local_rank, --world_size and --host.
test_file=$1

# Expansions are quoted so that paths or host names containing spaces or
# glob characters are passed through as single arguments.
python "$test_file" --local_rank "$SLURM_PROCID" --world_size "$SLURM_NPROCS" --host "$HOST" --port 29500
|
||||
88
tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py
Normal file
88
tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py
Normal file
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import torch.autograd
|
||||
|
||||
import colossalai
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.engine import Engine
|
||||
from colossalai.logging import get_global_dist_logger
|
||||
from colossalai.nn.layer._parallel_utilities import _gather
|
||||
|
||||
# Config lives in ../configs relative to this file's directory. The path is
# resolved first: with a bare relative __file__ (e.g. the script started from
# its own directory on older interpreters) `.parent.parent` would otherwise
# collapse to '.' and point at the wrong directory.
CONFIG_PATH = Path(__file__).resolve().parent.parent.joinpath('configs/vit_2d.py')
|
||||
|
||||
|
||||
def eval(engine):
    """Run one evaluation pass and collect loss and accuracy counts.

    NOTE: the name shadows the builtin ``eval``; it is kept because the
    test entry point in this file calls it by this name.

    Args:
        engine: object exposing ``eval()``, ``step()`` and
            ``schedule.num_steps``.

    Returns:
        A ``(correct_sum, total_sum, avg_loss)`` tuple: the accumulated
        count of correct predictions, the number of labels seen, and the
        accumulated loss divided by ``engine.schedule.num_steps``.
    """
    engine.eval()
    accumulated_loss = 0
    correct_sum = 0
    total_sum = 0

    for _ in range(engine.schedule.num_steps):
        output, label, loss = engine.step()
        accumulated_loss += loss.detach().cpu().numpy()

        # Reassemble the output from the 2D-parallel ranks before taking
        # the argmax: first across the row group, then across the column
        # group. The trailing integer is presumably the dimension to
        # concatenate along -- TODO confirm against _gather.
        output = _gather(
            output[0],
            ParallelMode.PARALLEL_2D_ROW,
            1
        )
        output = _gather(
            output,
            ParallelMode.PARALLEL_2D_COL,
            0,
        )
        output = torch.argmax(output, dim=-1)
        correct = torch.sum(label[0] == output)
        correct_sum += correct
        total_sum += label[0].size(0)
    avg_loss = accumulated_loss / engine.schedule.num_steps
    return correct_sum, total_sum, avg_loss
|
||||
|
||||
|
||||
def train(engine):
    """Run one training epoch and return the mean per-step loss.

    Args:
        engine: object exposing ``train()``, ``step()`` and
            ``schedule.num_steps``. ``step()`` must return a 3-tuple whose
            last element is the loss.

    Returns:
        The sum of the per-step losses divided by
        ``engine.schedule.num_steps``.
    """
    engine.train()
    accumulated_loss = 0

    for _ in range(engine.schedule.num_steps):
        # Only the loss is needed here; outputs and labels are discarded.
        _, _, loss = engine.step()
        accumulated_loss += loss.squeeze(0).detach().cpu().numpy()
    avg_loss = accumulated_loss / engine.schedule.num_steps
    return avg_loss
|
||||
|
||||
|
||||
@pytest.mark.dist
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2d_parallel_vision_transformer():
    """Train and periodically evaluate the ViT described by CONFIG_PATH.

    Builds the components with ``colossalai.initialize``, wraps them in an
    ``Engine``, trains for ``gpc.config.num_epochs`` epochs and evaluates
    on every even-numbered epoch, logging losses and accuracy.
    """
    # init dist
    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize(
        CONFIG_PATH)
    logger = get_global_dist_logger()

    engine = Engine(model=model,
                    train_dataloader=train_dataloader,
                    test_dataloader=test_dataloader,
                    criterion=criterion,
                    optimizer=optimizer,
                    lr_scheduler=lr_scheduler,
                    schedule=schedule)

    logger.info('start training')
    for epoch in range(gpc.config.num_epochs):
        train_loss = train(engine)

        logger.info(f'epoch {epoch} - train loss: {train_loss}')

        # Evaluate every other epoch, starting with epoch 0.
        if epoch % 2 == 0:
            correct_sum, total_sum, eval_loss = eval(engine)
            logger.info(
                f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '
                f'correct: {correct_sum}, acc: {correct_sum / total_sum}')


if __name__ == '__main__':
    test_2d_parallel_vision_transformer()
|
||||
Reference in New Issue
Block a user