added CI for unit testing (#69)

Frank Lee authored on 2021-12-16 10:32:08 +08:00, committed by GitHub
parent 45355a62f7
commit cd9c28e055
68 changed files with 1089 additions and 766 deletions


@@ -1,4 +0,0 @@
#!/usr/bin/env sh
test_file=$1
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500


@@ -1,4 +1,3 @@
from tests.test_layers.test_3d.common import IMG_SIZE
import torch
import torch.distributed as dist
from torch.nn import Parameter
@@ -7,7 +6,7 @@ from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.nn import Linear1D_Col, Linear1D_Row, TransformerMLP1D, TransformerSelfAttention1D, ViTMLP1D, ViTSelfAttention1D, ViTPatchEmbedding1D, ViTHead1D, ViTTokenFuser1D
from colossalai.utils import get_current_device, print_rank_0
from common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, NUM_CLASSES, check_equal, IMG_SIZE
from .common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, NUM_CLASSES, check_equal, IMG_SIZE
def check_linear_col():
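A pattern that recurs throughout this commit is switching the shared test helpers from absolute imports (from common import ...) to package-relative imports (from .common import ...). A relative import only resolves when the helper directory is itself a package, so a layout like the one below is assumed; the __init__.py files are the assumption here, while checks_1d/, check_layer_1d.py and common.py are named later in this diff.

# Assumed layout (illustrative):
#
#   tests/test_layers/test_1d/
#       test_1d.py                 # from checks_1d.check_layer_1d import *
#       checks_1d/
#           __init__.py
#           common.py              # HIDDEN_SIZE, DEPTH, BATCH_SIZE, ...
#           check_layer_1d.py
#
# inside checks_1d/check_layer_1d.py the shared constants are imported relatively:
from .common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, NUM_CLASSES, check_equal, IMG_SIZE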


@@ -2,10 +2,13 @@
# -*- encoding: utf-8 -*-
import pytest
import torch
import torch.multiprocessing as mp
from colossalai.core import global_context as gpc
from colossalai.initialize import launch, get_default_parser
from test_layer import *
from functools import partial
from checks_1d.check_layer_1d import *
CONFIG = dict(
parallel=dict(
@@ -18,8 +21,14 @@ CONFIG = dict(
)
def check_layer():
# print_rank_0('start check_linear_col')
def check_layer(rank, world_size):
launch(config=CONFIG,
rank=rank,
world_size=world_size,
host='localhost',
port=29920,
backend='nccl')
check_linear_col()
check_linear_row()
check_attention()
@@ -28,21 +37,15 @@ def check_layer():
check_embed()
check_head()
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_1d():
parser = get_default_parser()
args = parser.parse_args()
launch(config=CONFIG,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend)
check_layer()
gpc.destroy()
world_size = 2
run_func = partial(check_layer, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
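The hunk above is the template every converted test in this commit follows: instead of being launched from an external script (SLURM wrapper or torch.distributed.launch plus argparse), the test spawns its own workers with torch.multiprocessing.spawn, and each worker initializes the distributed environment against localhost. A condensed, self-contained sketch of the pattern, with the CONFIG contents and port number being illustrative rather than taken verbatim from any one file:

from functools import partial

import pytest
import torch
import torch.multiprocessing as mp

from colossalai.core import global_context as gpc
from colossalai.initialize import launch

# illustrative config; each test file defines its own parallel settings
CONFIG = dict(parallel=dict(pipeline=1, tensor=dict(size=2, mode='1d')))


def check_layer(rank, world_size):
    # every spawned worker sets up the distributed environment itself
    launch(config=CONFIG,
           rank=rank,
           world_size=world_size,
           host='localhost',
           port=29920,        # each test file pins a distinct port to avoid clashes
           backend='nccl')
    # ... per-rank checks go here (check_linear_col(), check_linear_row(), ...) ...
    gpc.destroy()
    torch.cuda.empty_cache()


@pytest.mark.dist
def test_1d():
    world_size = 2
    # mp.spawn passes the process index as the first positional argument,
    # so partial binds only world_size and each worker receives its rank
    run_func = partial(check_layer, world_size=world_size)
    mp.spawn(run_func, nprocs=world_size)

Because the rendezvous is on localhost, the SLURM wrapper scripts deleted at the top of this diff are no longer needed, and the tests can run under plain pytest on a single multi-GPU node.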


@@ -5,7 +5,7 @@ from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.nn import Linear2D, LayerNorm2D, TransformerSelfAttention2D, TransformerMLP2D, TransformerLayer2D
from colossalai.utils import get_current_device, print_rank_0
from common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, check_equal
from .common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, check_equal
def check_linear():


@@ -8,7 +8,7 @@ from colossalai.core import global_context as gpc
from colossalai.nn.layer.parallel_2d import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D
from colossalai.utils import get_current_device
from colossalai.utils import print_rank_0
from common import check_equal, BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, DEPTH
from .common import check_equal, BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, DEPTH
def check_AB():


@@ -2,11 +2,15 @@
# -*- encoding: utf-8 -*-
import pytest
import torch
import torch.multiprocessing as mp
from colossalai.core import global_context as gpc
from colossalai.initialize import launch, get_default_parser
from test_layer import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer
from test_operation import check_AB, check_ABT, check_ATB
from checks_2d.check_layer_2d import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer
from checks_2d.check_operation_2d import check_AB, check_ABT, check_ATB
from functools import partial
CONFIG = dict(
parallel=dict(
@@ -33,20 +37,25 @@ def check_layer():
check_transformerlayer()
@pytest.mark.dist
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2d():
parser = get_default_parser()
args = parser.parse_args()
def check_layer_and_operation(rank, world_size):
launch(config=CONFIG,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend)
rank=rank,
world_size=world_size,
host='localhost',
port=29921,
backend='nccl')
check_operations()
check_layer()
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_2d():
world_size = 4
run_func = partial(check_layer_and_operation, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
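All of the rewritten tests are tagged @pytest.mark.dist and drop the old skip marker, so a normal pytest run can now discover them and CI can select or exclude the multi-GPU suites by marker. dist is a custom marker; this excerpt does not show where it is registered, but a typical registration would live in a conftest.py, sketched here with the description text being an assumption:

# conftest.py (illustrative; the project may instead register the marker
# in pytest.ini or setup.cfg)
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "dist: tests that spawn multiple processes and require GPUs"
    )

With the marker registered, machines without enough GPUs can skip these suites via pytest -m "not dist".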


@@ -6,7 +6,7 @@ from colossalai.nn import (Linear2p5D, LayerNorm2p5D, TransformerSelfAttention2p
TransformerLayer2p5D)
from colossalai.utils import get_current_device
from colossalai.utils import print_rank_0
from common import *
from .common import *
def check_linear():


@@ -6,7 +6,7 @@ from colossalai.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_
Matmul_ATB_2p5D
from colossalai.utils import get_current_device
from colossalai.utils import print_rank_0
from common import *
from .common import *
def check_AB():


@@ -1,3 +0,0 @@
#!/bin/bash
python -m torch.distributed.launch test_2p5d.py --nproc_per_node 8 --host $HOST --port 29516 --world_size 8


@@ -1,9 +1,13 @@
import pytest
import torch
import torch.multiprocessing as mp
from colossalai.core import global_context as gpc
from colossalai.initialize import launch, get_default_parser
from test_layer import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer
from test_operation import check_AB, check_ABT, check_ATB
from colossalai.initialize import launch
from checks_2p5d.check_layer_2p5d import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer
from checks_2p5d.check_operation_2p5d import check_AB, check_ABT, check_ATB
from functools import partial
CONFIG = dict(
parallel=dict(
@@ -27,20 +31,25 @@ def check_layer():
check_transformerlayer()
@pytest.mark.dist
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2p5d():
parser = get_default_parser()
args = parser.parse_args()
def check_layer_and_operation(rank, world_size):
launch(config=CONFIG,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend)
check_layer()
rank=rank,
world_size=world_size,
host='localhost',
port=29922,
backend='nccl')
check_operations()
check_layer()
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_2p5d():
world_size = 8
run_func = partial(check_layer_and_operation, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':


@@ -13,7 +13,7 @@ from colossalai.utils import get_current_device, print_rank_0
from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D, OUTPUT_GROUP_3D
from common import *
from .common import *
def check_linear():


@@ -7,7 +7,7 @@ from colossalai.logging import get_dist_logger
from colossalai.nn.layer.parallel_3d._operation import *
from colossalai.utils import get_current_device
from common import *
from .common import *
def check_AB():


@@ -1,22 +0,0 @@
#!/bin/bash
python -m torch.distributed.launch test_2d.py --nproc_per_node 8 test_3d.py --host $HOST --port 29516 --world_size 8
# expected test output
# distributed environment initialized
# AB forward: pass
# AB backward: pass
# ABT forward: pass
# ABT backward: pass
# ATB forward: pass
# ATB backward: pass
# linear backward: pass
# linear backward: pass
# layer norm forward: pass
# layer norm backward: pass
# self attention forward: pass
# self attention backward: pass
# mlp forward: pass
# mlp backward: pass
# transformerlayer forward: pass
# transformerlayer backward: pass


@@ -1,11 +1,14 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import pytest
import torch
import torch.multiprocessing as mp
from colossalai.initialize import launch, get_default_parser
from test_layer import *
from test_operation import *
from checks_3d.check_layer_3d import *
from checks_3d.check_operation_3d import *
from colossalai.logging import get_dist_logger
from functools import partial
CONFIG = dict(parallel=dict(pipeline=1, tensor=dict(mode='3d', size=8)),
seed=0)
@@ -38,26 +41,25 @@ def check_layer():
ranks=[0])
def _test_main():
# init dist
parser = get_default_parser()
args = parser.parse_args()
def check_layer_and_operation(rank, world_size):
launch(config=CONFIG,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend)
logger = get_dist_logger()
logger.info('Distributed environment is initialzied.', ranks=[0])
torch.backends.cudnn.benchmark = True
rank=rank,
world_size=world_size,
host='localhost',
port=29923,
backend='nccl')
# check operation
# check_operations()
# check layers
check_layer()
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_3d():
world_size = 8
run_func = partial(check_layer_and_operation, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
_test_main()
test_3d()


@@ -1,9 +1,14 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import pytest
import torch
import torch.multiprocessing as mp
from colossalai.initialize import launch, get_default_parser
from colossalai.logging import get_dist_logger
from test_layer import *
from checks_seq.check_layer_seq import *
from functools import partial
CONFIG = dict(
parallel=dict(
@@ -17,24 +22,28 @@ def check_layer():
check_selfattention()
def _test_main():
def run_check_sequence(rank, world_size):
# init dist
parser = get_default_parser()
args = parser.parse_args()
launch(config=CONFIG,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend)
rank=rank,
world_size=world_size,
host='localhost',
port=29924,
backend='nccl')
logger = get_dist_logger()
logger.info('Distributed environment is initialzied.', ranks=[0])
torch.backends.cudnn.benchmark = True
# check layers
check_layer()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_sequence():
world_size = 4
run_func = partial(run_check_sequence, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
_test_main()
test_sequence()
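Each converted test pins its own hard-coded rendezvous port (29920, 29921, 29922, 29923 and 29924 across the files above), so the suites can run back to back without address collisions. An alternative that this commit does not use, sketched only for illustration, is to ask the OS for a free port per test:

import socket


def free_port() -> int:
    # bind to port 0: the OS assigns an unused port, which is read back and released
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(('localhost', 0))
        return sock.getsockname()[1]

The returned value would then replace the fixed constant in launch(..., port=free_port(), ...), with the usual caveat that the port could in principle be reclaimed by another process before launch binds it.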