Support TP-compatible Torch AMP and Update trainer API (#27)

* Add gradient accumulation, fix lr scheduler

* Fix FP16 optimizer and adapt torch amp to work with tensor parallel (#18)

* Fix compatibility bugs between torch amp and tensor parallel, along with other minor fixes

* fixed trainer

* Revert "fixed trainer"

This reverts commit 2e0b0b7699.

* Improve consistency between trainer, engine and schedule (#23)

Co-authored-by: 1SAA <c2h214748@gmail.com>

Co-authored-by: 1SAA <c2h214748@gmail.com>
Co-authored-by: ver217 <lhx0217@gmail.com>
Commit: 3defa32aee (parent 2b05de4c64)
Author: Frank Lee
Date: 2021-11-18 19:45:06 +08:00
Committed by: GitHub
80 changed files with 2194 additions and 1584 deletions
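The headline API change, visible in the test diffs below, is that colossalai.initialize now builds and returns a ready-to-run engine instead of handing back the individual components for the caller to wire into an Engine. A minimal before/after sketch, with a hypothetical config path standing in for the CONFIG_PATH the tests construct via pathlib:

from pathlib import Path

import colossalai

CONFIG_PATH = Path('configs/vit_2d.py')  # hypothetical path; the tests below resolve it relative to the test file

# Old style (removed in this commit): initialize() returned the raw parts and
# the caller assembled an Engine from them by hand.
# model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = \
#     colossalai.initialize(CONFIG_PATH)

# New style: the engine comes back fully constructed from the config.
engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH)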


@@ -27,8 +27,6 @@ train_data = dict(
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
# num_workers=1,
# shuffle=True,
)
)
@@ -63,14 +61,6 @@ loss = dict(
type='CrossEntropyLoss2D',
)
# model = dict(
# type='VanillaResNet',
# block_type='ResNetBasicBlock',
# layers=[2, 2, 2, 2],
# num_cls=10
# )
model = dict(
type='VisionTransformerFromConfig',
tensor_splitting_cfg=dict(
@@ -135,25 +125,26 @@ parallel = dict(
fp16 = dict(
mode=AMP_TYPE.PARALLEL,
initial_scale=2 ** 8
)
# fp16 = dict(
# mode=None,
# )
schedule = dict(
num_microbatches=2
)
lr_scheduler = dict(
type='LinearWarmupLR',
warmup_epochs=5
)
engine = dict(
schedule=dict(
num_microbatches=2
)
)
hooks = [
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
]
num_epochs = 60
logging = dict(
root_path='test_vit_2d_log'
)
seed = 100
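Read together, the hunk above shows the new config layout: mixed precision is selected through the fp16 block, the pipeline schedule now lives under engine, and the LR scheduler is driven by an LRSchedulerHook instead of a top-level lr_scheduler dict. A minimal sketch of that layout assembled from the diff; the AMP_TYPE import path is an assumption, since the import line is not part of the hunk:

from colossalai.engine import AMP_TYPE  # assumed import path; only AMP_TYPE.PARALLEL appears in the diff

# Torch AMP in the tensor-parallel-compatible mode.
fp16 = dict(
    mode=AMP_TYPE.PARALLEL,
    initial_scale=2 ** 8
)

# The pipeline schedule is now nested under the engine config.
engine = dict(
    schedule=dict(
        num_microbatches=2
    )
)

# LR scheduling moves from a top-level lr_scheduler dict into a hook.
hooks = [
    dict(
        type='LRSchedulerHook',
        by_epoch=True,
        lr_scheduler_cfg=dict(
            type='LinearWarmupLR',
            warmup_steps=5
        )
    ),
]

num_epochs = 60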


@@ -124,14 +124,21 @@ parallel = dict(
tensor=dict(size=4, depth=1, mode='2.5d'),
)
lr_scheduler = dict(
type='LinearWarmupLR',
warmup_epochs=5
)
hooks = [
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
]
engine = dict(
schedule = dict(
num_microbatches=2
)
)
num_epochs = 60
num_microbatches = 1
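The same fp16 PARALLEL mode is intended to sit on top of the tensor-parallel layout declared in parallel, which is what "TP-compatible Torch AMP" refers to. A minimal sketch for the 2.5D case, combining the parallel fragment above with the fp16 values from the 2D config (other parallel settings such as pipeline size are omitted here):

from colossalai.engine import AMP_TYPE  # assumed import path

# 2.5D tensor parallelism across 4 GPUs, as declared in the hunk above.
parallel = dict(
    tensor=dict(size=4, depth=1, mode='2.5d'),
)

# TP-compatible torch AMP; values taken from the 2D config shown earlier.
fp16 = dict(
    mode=AMP_TYPE.PARALLEL,
    initial_scale=2 ** 8
)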


@@ -9,21 +9,22 @@ import torch.autograd
import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.nn.layer._parallel_utilities import _gather
CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2d.py')
def eval(engine):
def eval(engine, test_dataloader):
engine.eval()
accumulated_loss = 0
correct_sum = 0
total_sum = 0
num_steps = len(test_dataloader)
data_iter = iter(test_dataloader)
for i in range(engine.schedule.num_steps):
output, label, loss = engine.step()
for i in range(num_steps):
output, label, loss = engine.step(data_iter)
if gpc.is_last_rank(ParallelMode.PIPELINE):
# loss = sum(loss)
@@ -43,20 +44,22 @@ def eval(engine):
correct = torch.sum(label == output)
correct_sum += correct
total_sum += label.size(0)
avg_loss = accumulated_loss / engine.schedule.num_steps
avg_loss = accumulated_loss / num_steps
return correct_sum, total_sum, avg_loss
def train(engine):
def train(engine, train_dataloader):
engine.train()
accumulated_loss = 0
num_steps = len(train_dataloader)
data_iter = iter(train_dataloader)
for i in range(engine.schedule.num_steps):
output, label, loss = engine.step()
for i in range(num_steps):
output, label, loss = engine.step(data_iter)
if gpc.is_last_rank(ParallelMode.PIPELINE):
accumulated_loss += loss.detach().cpu().numpy()
avg_loss = accumulated_loss / engine.schedule.num_steps
avg_loss = accumulated_loss / num_steps
return avg_loss
@@ -64,25 +67,16 @@ def train(engine):
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2d_parallel_vision_transformer():
# init dist
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize(
CONFIG_PATH)
engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH)
logger = get_global_dist_logger()
engine = Engine(model=model,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule)
for epoch in range(gpc.config.num_epochs):
train_loss = train(engine)
train_loss = train(engine, train_dataloader)
if gpc.is_last_rank(ParallelMode.PIPELINE):
logger.info(f'epoch {epoch} - train loss: {train_loss}')
if epoch % 2 == 0:
correct_sum, total_sum, eval_loss = eval(engine)
correct_sum, total_sum, eval_loss = eval(engine, test_dataloader)
if gpc.is_last_rank(ParallelMode.PIPELINE):
logger.info(
f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '

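Assembled from the hunks above, the updated test drives the engine with explicit dataloaders: the step count comes from len(dataloader) and engine.step() pulls batches from a plain iterator. A condensed sketch of the new per-epoch flow, assuming the imports and CONFIG_PATH from the test file above; run_epoch is a hypothetical helper and accuracy bookkeeping is omitted:

engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH)
logger = get_global_dist_logger()

def run_epoch(engine, dataloader, training=True):
    # Hypothetical helper condensing the train/eval loops above.
    if training:
        engine.train()
    else:
        engine.eval()
    accumulated_loss = 0
    num_steps = len(dataloader)     # step count now comes from the dataloader
    data_iter = iter(dataloader)    # step() consumes a plain iterator
    for _ in range(num_steps):
        output, label, loss = engine.step(data_iter)
        if gpc.is_last_rank(ParallelMode.PIPELINE):
            accumulated_loss += loss.detach().cpu().numpy()
    return accumulated_loss / num_steps

for epoch in range(gpc.config.num_epochs):
    train_loss = run_epoch(engine, train_dataloader, training=True)
    if gpc.is_last_rank(ParallelMode.PIPELINE):
        logger.info(f'epoch {epoch} - train loss: {train_loss}')
    if epoch % 2 == 0:
        eval_loss = run_epoch(engine, test_dataloader, training=False)
        if gpc.is_last_rank(ParallelMode.PIPELINE):
            logger.info(f'epoch {epoch} - eval loss: {eval_loss}')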

@@ -6,20 +6,22 @@ import torch.autograd
import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.nn.layer._parallel_utilities import _gather
CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2p5d.py')
def eval(engine):
def eval(engine, test_dataloader):
engine.eval()
accumulated_loss = 0
correct_sum = 0
total_sum = 0
num_steps = len(test_dataloader)
data_iter = iter(test_dataloader)
for i in range(engine.schedule.num_steps):
output, label, loss = engine.step()
for i in range(num_steps):
output, label, loss = engine.step(data_iter)
if gpc.is_last_rank(ParallelMode.PIPELINE):
accumulated_loss += loss.detach().cpu().numpy()
@@ -43,21 +45,23 @@ def eval(engine):
correct = torch.sum(label == output)
correct_sum += correct
total_sum += label.size(0)
avg_loss = accumulated_loss / engine.schedule.num_steps
avg_loss = accumulated_loss / num_steps
return correct_sum, total_sum, avg_loss
def train(engine):
def train(engine, train_dataloader):
engine.train()
accumulated_loss = 0
num_steps = len(train_dataloader)
data_iter = iter(train_dataloader)
for i in range(engine.schedule.num_steps):
output, label, loss = engine.step()
for i in range(num_steps):
output, label, loss = engine.step(data_iter)
if gpc.is_last_rank(ParallelMode.PIPELINE):
accumulated_loss += loss.detach().cpu().numpy()
avg_loss = accumulated_loss / engine.schedule.num_steps
avg_loss = accumulated_loss / num_steps
return avg_loss
@@ -65,25 +69,16 @@ def train(engine):
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2p5d_parallel_vision_transformer():
# init dist
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize(
CONFIG_PATH)
engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH)
logger = get_global_dist_logger()
engine = Engine(model=model,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule)
for epoch in range(gpc.config.num_epochs):
train_loss = train(engine)
train_loss = train(engine, train_dataloader)
if gpc.is_last_rank(ParallelMode.PIPELINE):
logger.info(f'epoch {epoch} - train loss: {train_loss}')
if epoch % 2 == 0:
correct_sum, total_sum, eval_loss = eval(engine)
correct_sum, total_sum, eval_loss = eval(engine, test_dataloader)
if gpc.is_last_rank(ParallelMode.PIPELINE):
logger.info(
f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '
@@ -91,4 +86,4 @@ def test_2p5d_parallel_vision_transformer():
if __name__ == '__main__':
test_2p5d_parallel_vision_transformer()