Migrated project

2026-05-04 01:48:43 +00:00 · 2021-10-28 18:21:23 +02:00
parent 2ebaefc542
commit 404ecbdcc6
409 changed files with 35853 additions and 0 deletions
--- a/tests/test_fp16_optimizer/configs/vit_2d.py
+++ b/tests/test_fp16_optimizer/configs/vit_2d.py
@@ -0,0 +1,140 @@
+import os
+from pathlib import Path
+
+from colossalai.engine import AMP_TYPE
+
+BATCH_SIZE = 512
+IMG_SIZE = 32
+PATCH_SIZE = 4
+DIM = 512
+NUM_ATTENTION_HEADS = 8
+SUMMA_DIM = 2
+NUM_CLASSES = 10
+DEPTH = 6
+
+train_data = dict(
+    dataset=dict(
+        type='CIFAR10Dataset',
+        root=Path(os.environ['DATA']),
+        transform_pipeline=[
+            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
+            dict(type='RandomHorizontalFlip'),
+            dict(type='ToTensor'),
+            dict(type='Normalize',
+                 mean=[0.4914, 0.4822, 0.4465],
+                 std=[0.2023, 0.1994, 0.2010]),
+        ]
+    ),
+    dataloader=dict(
+        batch_size=BATCH_SIZE,
+        pin_memory=True,
+        num_workers=4,
+        shuffle=True
+    )
+)
+
+test_data = dict(
+    dataset=dict(
+        type='CIFAR10Dataset',
+        root=Path(os.environ['DATA']),
+        train=False,
+        transform_pipeline=[
+            dict(type='Resize', size=IMG_SIZE),
+            dict(type='ToTensor'),
+            dict(type='Normalize',
+                 mean=[0.4914, 0.4822, 0.4465],
+                 std=[0.2023, 0.1994, 0.2010]
+                 ),
+        ]
+    ),
+    dataloader=dict(
+        batch_size=BATCH_SIZE,
+        pin_memory=True,
+        num_workers=4,
+        shuffle=True
+    )
+)
+
+optimizer = dict(
+    type='Adam',
+    lr=0.001,
+    weight_decay=0
+)
+
+loss = dict(
+    type='CrossEntropyLoss2D',
+)
+
+model = dict(
+    type='VisionTransformerFromConfig',
+    tensor_splitting_cfg=dict(
+        type='ViTInputSplitter2D',
+    ),
+    embedding_cfg=dict(
+        type='ViTPatchEmbedding2D',
+        img_size=IMG_SIZE,
+        patch_size=PATCH_SIZE,
+        embed_dim=DIM,
+    ),
+    token_fusion_cfg=dict(
+        type='ViTTokenFuser2D',
+        img_size=IMG_SIZE,
+        patch_size=PATCH_SIZE,
+        embed_dim=DIM,
+        drop_rate=0.1
+    ),
+    norm_cfg=dict(
+        type='LayerNorm2D',
+        normalized_shape=DIM,
+        eps=1e-6,
+    ),
+    block_cfg=dict(
+        type='ViTBlock',
+        attention_cfg=dict(
+            type='ViTSelfAttention2D',
+            hidden_size=DIM,
+            num_attention_heads=NUM_ATTENTION_HEADS,
+            attention_dropout_prob=0.,
+            hidden_dropout_prob=0.1,
+        ),
+        droppath_cfg=dict(
+            type='VanillaViTDropPath',
+        ),
+        mlp_cfg=dict(
+            type='ViTMLP2D',
+            in_features=DIM,
+            dropout_prob=0.1,
+            mlp_ratio=1
+        ),
+        norm_cfg=dict(
+            type='LayerNorm2D',
+            normalized_shape=DIM,
+            eps=1e-6,
+        ),
+    ),
+    head_cfg=dict(
+        type='ViTHead2D',
+        hidden_size=DIM,
+        num_classes=NUM_CLASSES,
+    ),
+    embed_dim=DIM,
+    depth=DEPTH,
+    drop_path_rate=0.,
+)
+
+parallel = dict(
+    pipeline=dict(size=1),
+    tensor=dict(size=4, mode='2d'),
+)
+
+fp16 = dict(
+    mode=AMP_TYPE.PARALLEL,
+    initial_scale=2 ** 4
+)
+
+lr_scheduler = dict(
+    type='LinearWarmupLR',
+    warmup_epochs=5
+)
+
+num_epochs = 60
--- a/tests/test_fp16_optimizer/test.sh
+++ b/tests/test_fp16_optimizer/test.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env sh
+test_file=$1
+
+python $test_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500
--- a/tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py
+++ b/tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from pathlib import Path
+
+import pytest
+import torch.autograd
+
+import colossalai
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.engine import Engine
+from colossalai.logging import get_global_dist_logger
+from colossalai.nn.layer._parallel_utilities import _gather
+
+CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2d.py')
+
+
+def eval(engine):
+    engine.eval()
+    accumulated_loss = 0
+    correct_sum = 0
+    total_sum = 0
+
+    for i in range(engine.schedule.num_steps):
+        output, label, loss = engine.step()
+        accumulated_loss += loss.detach().cpu().numpy()
+
+        output = _gather(
+            output[0],
+            ParallelMode.PARALLEL_2D_ROW,
+            1
+        )
+        output = _gather(
+            output,
+            ParallelMode.PARALLEL_2D_COL,
+            0,
+        )
+        output = torch.argmax(output, dim=-1)
+        correct = torch.sum(label[0] == output)
+        correct_sum += correct
+        total_sum += label[0].size(0)
+    avg_loss = accumulated_loss / engine.schedule.num_steps
+    return correct_sum, total_sum, avg_loss
+
+
+def train(engine):
+    engine.train()
+    accumulated_loss = 0
+
+    for i in range(engine.schedule.num_steps):
+        output, label, loss = engine.step()
+        accumulated_loss += loss.squeeze(0).detach().cpu().numpy()
+    avg_loss = accumulated_loss / engine.schedule.num_steps
+    return avg_loss
+
+
+@pytest.mark.dist
+@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
+def test_2d_parallel_vision_transformer():
+    # init dist
+    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize(
+        CONFIG_PATH)
+    logger = get_global_dist_logger()
+
+    engine = Engine(model=model,
+                    train_dataloader=train_dataloader,
+                    test_dataloader=test_dataloader,
+                    criterion=criterion,
+                    optimizer=optimizer,
+                    lr_scheduler=lr_scheduler,
+                    schedule=schedule)
+
+    logger.info('start training')
+    for epoch in range(gpc.config.num_epochs):
+        train_loss = train(engine)
+
+        logger.info(f'epoch {epoch} - train loss: {train_loss}')
+
+        if epoch % 2 == 0:
+            correct_sum, total_sum, eval_loss = eval(engine)
+            logger.info(
+                f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '
+                f'correct: {correct_sum}, acc: {correct_sum / total_sum}')
+
+
+if __name__ == '__main__':
+    test_2d_parallel_vision_transformer()