Support TP-compatible Torch AMP and Update trainer API (#27)

* Add gradient accumulation, fix lr scheduler

* Fix FP16 optimizer and adapt torch amp to work with tensor parallel (#18)

* Fix compatibility bugs between torch amp and tensor parallel, along with other minor fixes

* fixed trainer

* Revert "fixed trainer"

This reverts commit 2e0b0b7699.

* Improve consistency between trainer, engine and schedule (#23)

Co-authored-by: 1SAA <c2h214748@gmail.com>

Co-authored-by: 1SAA <c2h214748@gmail.com>
Co-authored-by: ver217 <lhx0217@gmail.com>
Commit: 3defa32aee (parent 2b05de4c64)
Author: Frank Lee
Date: 2021-11-18 19:45:06 +08:00
Committed by: GitHub
80 changed files with 2194 additions and 1584 deletions
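The headline API change, visible in the test diffs below, is that colossalai.initialize now builds and returns a ready-to-run engine instead of handing back the individual components for the caller to wire into an Engine. A minimal before/after sketch, with a hypothetical config path standing in for the CONFIG_PATH the tests construct via pathlib:

from pathlib import Path

import colossalai

CONFIG_PATH = Path('configs/vit_2d.py')  # hypothetical path; the tests below resolve it relative to the test file

# Old style (removed in this commit): initialize() returned the raw parts and
# the caller assembled an Engine from them by hand.
# model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = \
#     colossalai.initialize(CONFIG_PATH)

# New style: the engine comes back fully constructed from the config.
engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH)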


@@ -27,8 +27,6 @@ train_data = dict(
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
# num_workers=1,
# shuffle=True,
)
)
@@ -63,14 +61,6 @@ loss = dict(
type='CrossEntropyLoss2D',
)
# model = dict(
# type='VanillaResNet',
# block_type='ResNetBasicBlock',
# layers=[2, 2, 2, 2],
# num_cls=10
# )
model = dict(
type='VisionTransformerFromConfig',
tensor_splitting_cfg=dict(
@@ -135,25 +125,26 @@ parallel = dict(
fp16 = dict(
mode=AMP_TYPE.PARALLEL,
initial_scale=2 ** 8
)
# fp16 = dict(
# mode=None,
# )
schedule = dict(
num_microbatches=2
)
lr_scheduler = dict(
type='LinearWarmupLR',
warmup_epochs=5
)
engine = dict(
schedule=dict(
num_microbatches=2
)
)
hooks = [
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
]
num_epochs = 60
logging = dict(
root_path='test_vit_2d_log'
)
seed = 100
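Read together, the hunk above shows the new config layout: mixed precision is selected through the fp16 block, the pipeline schedule now lives under engine, and the LR scheduler is driven by an LRSchedulerHook instead of a top-level lr_scheduler dict. A minimal sketch of that layout assembled from the diff; the AMP_TYPE import path is an assumption, since the import line is not part of the hunk:

from colossalai.engine import AMP_TYPE  # assumed import path; only AMP_TYPE.PARALLEL appears in the diff

# Torch AMP in the tensor-parallel-compatible mode.
fp16 = dict(
    mode=AMP_TYPE.PARALLEL,
    initial_scale=2 ** 8
)

# The pipeline schedule is now nested under the engine config.
engine = dict(
    schedule=dict(
        num_microbatches=2
    )
)

# LR scheduling moves from a top-level lr_scheduler dict into a hook.
hooks = [
    dict(
        type='LRSchedulerHook',
        by_epoch=True,
        lr_scheduler_cfg=dict(
            type='LinearWarmupLR',
            warmup_steps=5
        )
    ),
]

num_epochs = 60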


@@ -124,14 +124,21 @@ parallel = dict(
tensor=dict(size=4, depth=1, mode='2.5d'),
)
lr_scheduler = dict(
type='LinearWarmupLR',
warmup_epochs=5
)
hooks = [
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
]
engine = dict(
schedule = dict(
num_microbatches=2
)
)
num_epochs = 60
num_microbatches = 1
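The same fp16 PARALLEL mode is intended to sit on top of the tensor-parallel layout declared in parallel, which is what "TP-compatible Torch AMP" refers to. A minimal sketch for the 2.5D case, combining the parallel fragment above with the fp16 values from the 2D config (other parallel settings such as pipeline size are omitted here):

from colossalai.engine import AMP_TYPE  # assumed import path

# 2.5D tensor parallelism across 4 GPUs, as declared in the hunk above.
parallel = dict(
    tensor=dict(size=4, depth=1, mode='2.5d'),
)

# TP-compatible torch AMP; values taken from the 2D config shown earlier.
fp16 = dict(
    mode=AMP_TYPE.PARALLEL,
    initial_scale=2 ** 8
)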


@@ -9,21 +9,22 @@ import torch.autograd
import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.nn.layer._parallel_utilities import _gather
CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2d.py')
def eval(engine):
def eval(engine, test_dataloader):
engine.eval()
accumulated_loss = 0
correct_sum = 0
total_sum = 0
num_steps = len(test_dataloader)
data_iter = iter(test_dataloader)
for i in range(engine.schedule.num_steps):
output, label, loss = engine.step()
for i in range(num_steps):
output, label, loss = engine.step(data_iter)
if gpc.is_last_rank(ParallelMode.PIPELINE):
# loss = sum(loss)
@@ -43,20 +44,22 @@ def eval(engine):
correct = torch.sum(label == output)
correct_sum += correct
total_sum += label.size(0)
avg_loss = accumulated_loss / engine.schedule.num_steps
avg_loss = accumulated_loss / num_steps
return correct_sum, total_sum, avg_loss
def train(engine):
def train(engine, train_dataloader):
engine.train()
accumulated_loss = 0
num_steps = len(train_dataloader)
data_iter = iter(train_dataloader)
for i in range(engine.schedule.num_steps):
output, label, loss = engine.step()
for i in range(num_steps):
output, label, loss = engine.step(data_iter)
if gpc.is_last_rank(ParallelMode.PIPELINE):
accumulated_loss += loss.detach().cpu().numpy()
avg_loss = accumulated_loss / engine.schedule.num_steps
avg_loss = accumulated_loss / num_steps
return avg_loss
@@ -64,25 +67,16 @@ def train(engine):
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2d_parallel_vision_transformer():
# init dist
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize(
CONFIG_PATH)
engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH)
logger = get_global_dist_logger()
engine = Engine(model=model,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule)
for epoch in range(gpc.config.num_epochs):
train_loss = train(engine)
train_loss = train(engine, train_dataloader)
if gpc.is_last_rank(ParallelMode.PIPELINE):
logger.info(f'epoch {epoch} - train loss: {train_loss}')
if epoch % 2 == 0:
correct_sum, total_sum, eval_loss = eval(engine)
correct_sum, total_sum, eval_loss = eval(engine, test_dataloader)
if gpc.is_last_rank(ParallelMode.PIPELINE):
logger.info(
f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '

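Assembled from the hunks above, the updated test drives the engine with explicit dataloaders: the step count comes from len(dataloader) and engine.step() pulls batches from a plain iterator. A condensed sketch of the new per-epoch flow, assuming the imports and CONFIG_PATH from the test file above; run_epoch is a hypothetical helper and accuracy bookkeeping is omitted:

engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH)
logger = get_global_dist_logger()

def run_epoch(engine, dataloader, training=True):
    # Hypothetical helper condensing the train/eval loops above.
    if training:
        engine.train()
    else:
        engine.eval()
    accumulated_loss = 0
    num_steps = len(dataloader)     # step count now comes from the dataloader
    data_iter = iter(dataloader)    # step() consumes a plain iterator
    for _ in range(num_steps):
        output, label, loss = engine.step(data_iter)
        if gpc.is_last_rank(ParallelMode.PIPELINE):
            accumulated_loss += loss.detach().cpu().numpy()
    return accumulated_loss / num_steps

for epoch in range(gpc.config.num_epochs):
    train_loss = run_epoch(engine, train_dataloader, training=True)
    if gpc.is_last_rank(ParallelMode.PIPELINE):
        logger.info(f'epoch {epoch} - train loss: {train_loss}')
    if epoch % 2 == 0:
        eval_loss = run_epoch(engine, test_dataloader, training=False)
        if gpc.is_last_rank(ParallelMode.PIPELINE):
            logger.info(f'epoch {epoch} - eval loss: {eval_loss}')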

@@ -6,20 +6,22 @@ import torch.autograd
import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.nn.layer._parallel_utilities import _gather
CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2p5d.py')
def eval(engine):
def eval(engine, test_dataloader):
engine.eval()
accumulated_loss = 0
correct_sum = 0
total_sum = 0
num_steps = len(test_dataloader)
data_iter = iter(test_dataloader)
for i in range(engine.schedule.num_steps):
output, label, loss = engine.step()
for i in range(num_steps):
output, label, loss = engine.step(data_iter)
if gpc.is_last_rank(ParallelMode.PIPELINE):
accumulated_loss += loss.detach().cpu().numpy()
@@ -43,21 +45,23 @@ def eval(engine):
correct = torch.sum(label == output)
correct_sum += correct
total_sum += label.size(0)
avg_loss = accumulated_loss / engine.schedule.num_steps
avg_loss = accumulated_loss / num_steps
return correct_sum, total_sum, avg_loss
def train(engine):
def train(engine, train_dataloader):
engine.train()
accumulated_loss = 0
num_steps = len(train_dataloader)
data_iter = iter(train_dataloader)
for i in range(engine.schedule.num_steps):
output, label, loss = engine.step()
for i in range(num_steps):
output, label, loss = engine.step(data_iter)
if gpc.is_last_rank(ParallelMode.PIPELINE):
accumulated_loss += loss.detach().cpu().numpy()
avg_loss = accumulated_loss / engine.schedule.num_steps
avg_loss = accumulated_loss / num_steps
return avg_loss
@@ -65,25 +69,16 @@ def train(engine):
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2p5d_parallel_vision_transformer():
# init dist
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize(
CONFIG_PATH)
engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH)
logger = get_global_dist_logger()
engine = Engine(model=model,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule)
for epoch in range(gpc.config.num_epochs):
train_loss = train(engine)
train_loss = train(engine, train_dataloader)
if gpc.is_last_rank(ParallelMode.PIPELINE):
logger.info(f'epoch {epoch} - train loss: {train_loss}')
if epoch % 2 == 0:
correct_sum, total_sum, eval_loss = eval(engine)
correct_sum, total_sum, eval_loss = eval(engine, test_dataloader)
if gpc.is_last_rank(ParallelMode.PIPELINE):
logger.info(
f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '
@@ -91,4 +86,4 @@ def test_2p5d_parallel_vision_transformer():
if __name__ == '__main__':
test_2p5d_parallel_vision_transformer()