Migrated project

2025-09-06 19:40:28 +00:00 · 2021-10-28 18:21:23 +02:00
parent 2ebaefc542
commit 404ecbdcc6
409 changed files with 35853 additions and 0 deletions
--- a/tests/test_layers/test.sh
+++ b/tests/test_layers/test.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env sh
+test_file=$1  # first positional argument: path of the Python test script to run
+
+python "$test_file" --local_rank "$SLURM_PROCID" --world_size "$SLURM_NPROCS" --host "$HOST" --port 29500
--- a/tests/test_layers/test_1d/common.py
+++ b/tests/test_layers/test_1d/common.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch
+
+DEPTH = 2
+BATCH_SIZE = 8
+SEQ_LENGTH = 8
+HIDDEN_SIZE = 8
+
+
+def check_equal(A, B):  # assert that tensors A and B agree within rtol=1e-5 / atol=1e-2
+    assert torch.allclose(A, B, rtol=1e-5, atol=1e-2)  # allclose already returns a bool; raises AssertionError on mismatch
--- a/tests/test_layers/test_1d/test_1d.py
+++ b/tests/test_layers/test_1d/test_1d.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import pytest
+
+from colossalai.core import global_context as gpc
+from colossalai.initialize import init_dist
+from test_layer import check_linear_col, check_linear_row
+
+CONFIG = dict(  # handed to init_dist(config=CONFIG) by the test entry point in this file
+    parallel=dict(
+        pipeline=dict(size=1),
+        tensor=dict(
+            size=2,
+            mode='1d'
+        )
+    ),
+)
+
+
+def check_layer():  # run every enabled 1D layer check in sequence
+    check_linear_col()  # Linear1D_Col forward/backward against a dense reference
+    check_linear_row()  # Linear1D_Row forward/backward against a dense reference
+    # check_attention()  (disabled: its definition is commented out in test_layer.py)
+    # check_mlp()  (disabled: its definition is commented out in test_layer.py)
+
+
+@pytest.mark.dist
+@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
+def test_2d():  # NOTE(review): named test_2d although CONFIG selects tensor mode '1d' - likely a copy-paste name
+    init_dist(config=CONFIG)
+    gpc.set_seed()
+    check_layer()  # runs the 1D linear checks defined in test_layer.py
+    gpc.destroy()
+
+
+if __name__ == '__main__':  # direct-launch entry point; pytest itself skips the test (see the skip marker)
+    test_2d()
--- a/tests/test_layers/test_1d/test_layer.py
+++ b/tests/test_layers/test_1d/test_layer.py
@@ -0,0 +1,211 @@
+import torch
+import torch.distributed as dist
+from torch.nn import Parameter
+
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.nn import Linear1D_Col, Linear1D_Row
+# TransformerMLP1D, \
+# TransformerSelfAttention1D, TransformerEncoderLayer1D
+from colossalai.utils import get_current_device, print_rank_0
+from common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, check_equal
+
+
+def check_linear_col():  # verify Linear1D_Col (gather_output=True) against a dense reference, forward and backward
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    OUTPUT_SIZE = 2 * HIDDEN_SIZE
+
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_1D)  # selects which shard this rank owns
+
+    layer = Linear1D_Col(INPUT_SIZE, OUTPUT_SIZE, gather_output=True)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    dist.broadcast(A_master, src=0)  # every rank adopts rank 0's random input
+    A = A_master.clone()
+    A.requires_grad = True
+
+    W_shape = (OUTPUT_SIZE, INPUT_SIZE)
+    W_master = torch.randn(W_shape, dtype=dtype, device=device)
+    dist.broadcast(W_master, src=0)
+    W = torch.chunk(W_master, DEPTH, dim=0)[i]  # local weight: i-th slice along the output dimension
+    W = W.clone()
+    W.requires_grad = True
+
+    B_shape = (OUTPUT_SIZE,)  # one-element tuple; a bare (OUTPUT_SIZE) is just an int
+    B_master = torch.randn(B_shape, dtype=dtype, device=device)
+    dist.broadcast(B_master, src=0)
+    B = torch.chunk(B_master, DEPTH, dim=0)[i]  # local bias: i-th slice, matching the weight split
+    B = B.clone()
+    B.requires_grad = True
+
+    layer.weight = Parameter(W)  # overwrite the layer's own parameters with the known shards
+    layer.bias = Parameter(B)
+    out = layer(A)
+
+    A_master = A_master.clone()  # fresh leaf tensors for the dense reference computation
+    A_master.requires_grad = True
+    W_master = W_master.clone()
+    W_master.requires_grad = True
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    C_master = torch.matmul(A_master, W_master.transpose(0, 1)) + B_master
+    C = C_master.clone()  # output is compared against the full, unsplit reference
+
+    check_equal(out, C)
+    print_rank_0('linear_col gather_output forward: pass')
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    dist.broadcast(grad_master, src=0)
+    grad = grad_master.detach()
+    out.backward(grad)
+
+    C_master.backward(grad)
+    A_grad = A_master.grad
+    check_equal(A_grad, A.grad)
+
+    W_grad = W_master.grad
+    W_grad = torch.chunk(W_grad, DEPTH, dim=0)[i]  # reference gradient for this rank's weight shard
+    check_equal(W_grad, layer.weight.grad)
+
+    B_grad = B_master.grad
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[i]  # reference gradient for this rank's bias shard
+    check_equal(B_grad, layer.bias.grad)
+
+    print_rank_0('linear_col gather_output backward: pass')
+
+
+def check_linear_row():  # verify Linear1D_Row (parallel_input=False) against a dense reference, forward and backward
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    OUTPUT_SIZE = 2 * HIDDEN_SIZE
+
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_1D)  # selects which shard this rank owns
+
+    layer = Linear1D_Row(OUTPUT_SIZE, INPUT_SIZE, parallel_input=False)  # note: maps OUTPUT_SIZE -> INPUT_SIZE
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, OUTPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    dist.broadcast(A_master, src=0)  # every rank adopts rank 0's random input
+    A = A_master.clone()
+    A.requires_grad = True
+
+    W_shape = (INPUT_SIZE, OUTPUT_SIZE)
+    W_master = torch.randn(W_shape, dtype=dtype, device=device)
+    dist.broadcast(W_master, src=0)
+    W = torch.chunk(W_master, DEPTH, dim=-1)[i]  # local weight: i-th slice along the last dimension
+    W = W.clone()
+    W.requires_grad = True
+
+    B_shape = (INPUT_SIZE,)  # one-element tuple; a bare (INPUT_SIZE) is just an int
+    B_master = torch.randn(B_shape, dtype=dtype, device=device)
+    dist.broadcast(B_master, src=0)
+    B = B_master.clone()  # the bias is not split for the row-parallel layer
+    B.requires_grad = True
+
+    layer.weight = Parameter(W)  # overwrite the layer's own parameters with the known tensors
+    layer.bias = Parameter(B)
+    out = layer(A)
+
+    A_master = A_master.clone()  # fresh leaf tensors for the dense reference computation
+    A_master.requires_grad = True
+    W_master = W_master.clone()
+    W_master.requires_grad = True
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    C_master = torch.matmul(A_master, W_master.transpose(0, 1)) + B_master
+    C = C_master.clone()  # output is compared against the full, unsplit reference
+
+    check_equal(out, C)
+    print_rank_0('linear_row no parallel_input forward: pass')
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    dist.broadcast(grad_master, src=0)
+    grad = grad_master.detach()
+    out.backward(grad)
+
+    C_master.backward(grad)
+    A_grad = A_master.grad
+    check_equal(A_grad, A.grad)
+
+    W_grad = W_master.grad
+    W_grad = torch.chunk(W_grad, DEPTH, dim=-1)[i]  # reference gradient for this rank's weight shard
+    check_equal(W_grad, layer.weight.grad)
+
+    B_grad = B_master.grad
+    check_equal(B_grad, layer.bias.grad)
+
+    print_rank_0('linear_row no parallel_input backward: pass')
+
+#
+# def check_attention():
+#     device = get_current_device()
+#     dtype = torch.float32
+#     INPUT_SIZE = HIDDEN_SIZE
+#     NUM_ATTENTION_HEADS = 2
+#
+#     i = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
+#
+#     layer = TransformerSelfAttention1D(
+#         1,
+#         HIDDEN_SIZE // NUM_ATTENTION_HEADS,
+#         HIDDEN_SIZE,
+#         NUM_ATTENTION_HEADS,
+#         0.5
+#     )
+#
+#     A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+#     A_master = torch.randn(A_shape, dtype=dtype, device=device)
+#     torch.distributed.broadcast(A_master, src=0)
+#     A = A_master.clone()
+#     A.requires_grad = True
+#
+#     mask_shape = (BATCH_SIZE, NUM_ATTENTION_HEADS // DEPTH, SEQ_LENGTH, SEQ_LENGTH)
+#     attention_mask = torch.zeros(mask_shape, dtype=dtype, device=device)
+#
+#     out = layer(A, attention_mask)
+#     assert out.shape == (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+#     print_rank_0('self attention forward: pass')
+#
+#     grad_shape = out.shape
+#     grad = torch.randn(grad_shape, dtype=dtype, device=device)
+#
+#     out.backward(grad)
+#     assert A.grad.shape == A.shape
+#     print_rank_0('self attention backward: pass')
+#
+#
+# def check_mlp():
+#     device = get_current_device()
+#     dtype = torch.float32
+#     INPUT_SIZE = HIDDEN_SIZE
+#
+#     i = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
+#
+#     layer = TransformerMLP1D(
+#         HIDDEN_SIZE,
+#         HIDDEN_SIZE,
+#         4.0
+#     )
+#
+#     A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+#     A_master = torch.randn(A_shape, dtype=dtype, device=device)
+#     torch.distributed.broadcast(A_master, src=0)
+#     A = A_master.clone()
+#     A.requires_grad = True
+#
+#     out = layer(A)
+#     assert out.shape == (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+#     print_rank_0('mlp forward: pass')
+#
+#     grad_shape = out.shape
+#     grad = torch.randn(grad_shape, dtype=dtype, device=device)
+#
+#     out.backward(grad)
+#     assert A.grad.shape == A.shape
+#     print_rank_0('mlp backward: pass')
--- a/tests/test_layers/test_2d/common.py
+++ b/tests/test_layers/test_2d/common.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch
+
+DEPTH = 2
+BATCH_SIZE = 8
+SEQ_LENGTH = 8
+HIDDEN_SIZE = 8
+
+
+def check_equal(A, B):  # assert that tensors A and B agree within rtol=1e-5 / atol=1e-2
+    assert torch.allclose(A, B, rtol=1e-5, atol=1e-2)  # allclose already returns a bool; raises AssertionError on mismatch
--- a/tests/test_layers/test_2d/test_2d.py
+++ b/tests/test_layers/test_2d/test_2d.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import pytest
+
+from colossalai.core import global_context as gpc
+from colossalai.initialize import init_dist
+from test_layer import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer
+from test_operation import check_AB, check_ABT, check_ATB
+
+CONFIG = dict(  # handed to init_dist(config=CONFIG) by the test entry point in this file
+    parallel=dict(
+        pipeline=dict(size=1),
+        tensor=dict(
+            size=4,
+            mode='2d'
+        )
+    ),
+)
+
+
+def check_operations():  # run the three 2D matmul operator checks imported from test_operation
+    check_AB()  # Matmul_AB_2D vs. dense A @ B
+    check_ABT()  # Matmul_ABT_2D vs. dense C @ B^T
+    check_ATB()  # Matmul_ATB_2D vs. dense A^T @ C
+
+
+def check_layer():  # run every 2D layer check imported from test_layer, in sequence
+    check_linear()  # values and gradients compared with a dense reference
+    check_layernorm()  # values and gradients compared with a dense reference
+    check_attention()  # only output/gradient shapes are asserted
+    check_mlp()  # only output/gradient shapes are asserted
+    check_transformerlayer()  # only output/gradient shapes are asserted
+
+
+@pytest.mark.dist
+@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
+def test_2d():  # entry point for the 2D tensor-parallel checks; skipped under plain pytest
+    init_dist(config=CONFIG)
+    gpc.set_seed()
+    check_operations()  # low-level matmul operators first
+    check_layer()  # then the layers built on top of them
+    gpc.destroy()
+
+
+if __name__ == '__main__':  # direct-launch entry point; pytest itself skips the test (see the skip marker)
+    test_2d()
--- a/tests/test_layers/test_2d/test_layer.py
+++ b/tests/test_layers/test_2d/test_layer.py
@@ -0,0 +1,248 @@
+import torch
+from torch.nn import Parameter
+
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.nn import Linear2D, LayerNorm2D, TransformerSelfAttention2D, TransformerMLP2D, TransformerLayer2D
+from colossalai.utils import get_current_device, print_rank_0
+from common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, check_equal
+
+
+def check_linear():  # verify Linear2D against a dense matmul reference, forward and backward
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    OUTPUT_SIZE = 2 * HIDDEN_SIZE
+
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)  # used to slice the last dimension
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)  # used to slice the first dimension
+
+    layer = Linear2D(INPUT_SIZE, OUTPUT_SIZE)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)  # every rank adopts rank 0's random input
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[j]  # local block (i, j) of the input
+    A = A.clone()
+    A.requires_grad = True
+
+    W_shape = (INPUT_SIZE, OUTPUT_SIZE)
+    W_master = torch.randn(W_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(W_master, src=0)
+    W = torch.chunk(W_master, DEPTH, dim=0)[i]
+    W = torch.chunk(W, DEPTH, dim=-1)[j]  # local block (i, j) of the weight
+    W = W.clone()
+    W.requires_grad = True
+
+    B_shape = (OUTPUT_SIZE,)  # one-element tuple; a bare (OUTPUT_SIZE) is just an int
+    B_master = torch.randn(B_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(B_master, src=0)
+    B = torch.chunk(B_master, DEPTH, dim=0)[j]  # bias is split by j only
+    B = B.clone()
+    B.requires_grad = True
+
+    layer.weight = Parameter(W)  # overwrite the layer's own parameters with the known blocks
+    layer.bias = Parameter(B)
+    out = layer(A)
+
+    A_master = A_master.clone()  # fresh leaf tensors for the dense reference computation
+    A_master.requires_grad = True
+    W_master = W_master.clone()
+    W_master.requires_grad = True
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    C_master = torch.matmul(A_master, W_master) + B_master
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[j]  # block (i, j) of the reference output
+
+    check_equal(out, C)
+    print_rank_0('linear forward: pass')
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[j]
+    out.backward(grad)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[j]
+    check_equal(A_grad, A.grad)
+
+    W_grad = W_master.grad
+    W_grad = torch.chunk(W_grad, DEPTH, dim=0)[i]
+    W_grad = torch.chunk(W_grad, DEPTH, dim=-1)[j]
+    check_equal(W_grad, layer.weight.grad)
+
+    B_grad = B_master.grad
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[j]
+    if i == 0:  # bias gradient is compared only where i == 0; presumably only those ranks hold it - TODO confirm
+        check_equal(B_grad, layer.bias.grad)
+
+    print_rank_0('linear backward: pass')
+
+
+def check_layernorm():  # verify LayerNorm2D against a hand-written normalization, forward and backward
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    EPS = 1e-12  # epsilon used by the reference below; assumed to match the layer's - TODO confirm
+
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)  # used to slice the last dimension
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)  # used to slice the first dimension
+
+    layernorm = LayerNorm2D(INPUT_SIZE)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)  # every rank adopts rank 0's random input
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[j]  # local block (i, j) of the input
+    A = A.clone()
+    A.requires_grad = True
+
+    out = layernorm(A)
+
+    A_master = A_master.clone()  # fresh leaf tensor for the reference computation
+    A_master.requires_grad = True
+    E_master = torch.sum(A_master, dim=-1, keepdim=True)
+    E_master /= INPUT_SIZE  # mean over the last dimension
+    V_master = torch.sum(A_master * A_master, dim=-1, keepdim=True)
+    V_master /= INPUT_SIZE  # mean of squares over the last dimension
+    V_master = V_master - E_master * E_master  # variance = E[x^2] - E[x]^2
+    V_master = 1.0 / torch.sqrt(V_master + EPS)  # reciprocal standard deviation
+    C_master = (A_master - E_master) * V_master  # normalized values, no scale/shift applied
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[j]  # block (i, j) of the reference output
+
+    check_equal(out, C)
+    print_rank_0('layer norm forward: pass')
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=get_current_device())
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[j]
+    out.backward(grad)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[j]
+    check_equal(A_grad, A.grad)
+    print_rank_0('layer norm backward: pass')
+
+
+def check_attention():  # smoke-test TransformerSelfAttention2D: only output and gradient shapes are asserted
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    NUM_ATTENTION_HEADS = 2
+
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)  # used to slice the last dimension
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)  # used to slice the first dimension
+
+    layer = TransformerSelfAttention2D(
+        HIDDEN_SIZE,
+        NUM_ATTENTION_HEADS,
+        attention_dropout_prob=0.5,
+        hidden_dropout_prob=0.5,
+    )
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)  # every rank adopts rank 0's random input
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[j]  # local block (i, j) of the input
+    A = A.clone()
+    A.requires_grad = True
+
+    mask_shape = (BATCH_SIZE // DEPTH, NUM_ATTENTION_HEADS // DEPTH, SEQ_LENGTH, SEQ_LENGTH)
+    attention_mask = torch.zeros(mask_shape, dtype=dtype, device=device)  # all-zero mask
+
+    out = layer(A, attention_mask)
+    assert out.shape == (BATCH_SIZE // DEPTH, SEQ_LENGTH, INPUT_SIZE // DEPTH)  # same shape as the local input block
+    print_rank_0('self attention forward: pass')
+
+    grad_shape = out.shape
+    grad = torch.randn(grad_shape, dtype=dtype, device=device)  # local random gradient; not broadcast
+
+    out.backward(grad)
+    assert A.grad.shape == A.shape
+    print_rank_0('self attention backward: pass')
+
+
+def check_mlp():  # smoke-test TransformerMLP2D: only output and gradient shapes are asserted
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)  # used to slice the last dimension
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)  # used to slice the first dimension
+
+    layer = TransformerMLP2D(
+        HIDDEN_SIZE,
+        dropout_prob=0.5,
+        act_func='gelu',
+    )
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)  # every rank adopts rank 0's random input
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[j]  # local block (i, j) of the input
+    A = A.clone()
+    A.requires_grad = True
+
+    out = layer(A)
+    assert out.shape == (BATCH_SIZE // DEPTH, SEQ_LENGTH, INPUT_SIZE // DEPTH)  # same shape as the local input block
+    print_rank_0('mlp forward: pass')
+
+    grad_shape = out.shape
+    grad = torch.randn(grad_shape, dtype=dtype, device=device)  # local random gradient; not broadcast
+
+    out.backward(grad)
+    assert A.grad.shape == A.shape
+    print_rank_0('mlp backward: pass')
+
+
+def check_transformerlayer():  # smoke-test TransformerLayer2D: only output and gradient shapes are asserted
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    NUM_ATTENTION_HEADS = 2
+
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)  # used to slice the last dimension
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)  # used to slice the first dimension
+
+    layer = TransformerLayer2D(
+        HIDDEN_SIZE,
+        NUM_ATTENTION_HEADS,
+        act_func='gelu',
+        attention_dropout_prob=0.5,
+        hidden_dropout_prob=0.5)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)  # every rank adopts rank 0's random input
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[j]  # local block (i, j) of the input
+    A = A.clone()
+    A.requires_grad = True
+
+    mask_shape = (BATCH_SIZE // DEPTH, NUM_ATTENTION_HEADS // DEPTH, SEQ_LENGTH, SEQ_LENGTH)
+    attention_mask = torch.zeros(mask_shape, dtype=dtype, device=device)  # all-zero mask
+
+    out = layer(A, attention_mask)
+    assert out.shape == (BATCH_SIZE // DEPTH, SEQ_LENGTH, INPUT_SIZE // DEPTH)  # same shape as the local input block
+    print_rank_0('transformerlayer forward: pass')
+
+    grad_shape = out.shape
+    grad = torch.randn(grad_shape, dtype=dtype, device=device)  # local random gradient; not broadcast
+
+    out.backward(grad)
+    assert A.grad.shape == A.shape
+    print_rank_0('transformerlayer backward: pass')
--- a/tests/test_layers/test_2d/test_operation.py
+++ b/tests/test_layers/test_2d/test_operation.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch
+
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.nn.layer.parallel_2d import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D
+from colossalai.utils import get_current_device
+from colossalai.utils import print_rank_0
+from common import check_equal, BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, DEPTH
+
+
+def check_AB():  # verify Matmul_AB_2D against a dense A @ B, forward and backward
+    data_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.DATA) else gpc.get_local_rank(ParallelMode.DATA)
+    pipeline_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(
+        ParallelMode.PIPELINE)
+    pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(
+        ParallelMode.PIPELINE)
+    tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR)
+
+    dtype = torch.float
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)  # used to slice the last dimension
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)  # used to slice the first dimension
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=get_current_device())
+    torch.distributed.broadcast(A_master, src=0)  # every rank adopts rank 0's random operand
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[j]  # local block (i, j) of A
+    A = A.clone()
+    A.requires_grad = True
+
+    B_shape = (HIDDEN_SIZE, 4 * HIDDEN_SIZE)
+    B_master = torch.randn(B_shape, dtype=dtype, device=get_current_device())
+    torch.distributed.broadcast(B_master, src=0)
+    B = torch.chunk(B_master, DEPTH, dim=0)[i]
+    B = torch.chunk(B, DEPTH, dim=-1)[j]  # local block (i, j) of B
+    B = B.clone()
+    B.requires_grad = True
+
+    out_shape = (BATCH_SIZE // DEPTH, SEQ_LENGTH, 4 * HIDDEN_SIZE // DEPTH)  # shape of the local output block
+
+    out = Matmul_AB_2D.apply(
+        A, B,
+        DEPTH,
+        out_shape,
+        i, j,
+        ParallelMode.PARALLEL_2D_ROW,
+        ParallelMode.PARALLEL_2D_COL,
+        data_parallel_rank,
+        pipeline_parallel_rank,
+        pipeline_parallel_size,
+        tensor_parallel_size
+    )
+
+    # dense reference: C = A @ B computed on fresh leaf copies of the full tensors
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    C_master = torch.matmul(A_master, B_master)
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[j]
+    # check forward correctness
+    check_equal(out, C)
+    print_rank_0('AB forward: pass')
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=get_current_device())
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[j]
+
+    out.backward(grad)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[j]
+    # check backward correctness
+    check_equal(A_grad, A.grad)
+
+    B_grad = B_master.grad
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[i]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[j]
+    # check backward correctness
+    check_equal(B_grad, B.grad)
+    print_rank_0('AB backward: pass')
+
+
+def check_ABT():  # verify Matmul_ABT_2D against a dense C @ B^T, forward and backward
+    data_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.DATA) else gpc.get_local_rank(ParallelMode.DATA)
+    pipeline_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(
+        ParallelMode.PIPELINE)
+    pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(
+        ParallelMode.PIPELINE)
+    tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR)
+
+    dtype = torch.float
+    device = get_current_device()
+
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)  # used to slice the last dimension
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)  # used to slice the first dimension
+
+    C_shape = (BATCH_SIZE, SEQ_LENGTH, 4 * HIDDEN_SIZE)
+    C_master = torch.randn(C_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(C_master, src=0)  # every rank adopts rank 0's random operand
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[j]  # local block (i, j) of C
+    C = C.clone()
+    C.requires_grad = True
+
+    B_shape = (HIDDEN_SIZE, 4 * HIDDEN_SIZE)
+    B_master = torch.randn(B_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(B_master, src=0)
+    B = torch.chunk(B_master, DEPTH, dim=0)[i]
+    B = torch.chunk(B, DEPTH, dim=-1)[j]  # local block (i, j) of B
+    B = B.clone()
+    B.requires_grad = True
+
+    out = Matmul_ABT_2D.apply(
+        C, B,
+        DEPTH, (BATCH_SIZE // DEPTH, SEQ_LENGTH, HIDDEN_SIZE // DEPTH),
+        i, j,
+        ParallelMode.PARALLEL_2D_ROW,
+        ParallelMode.PARALLEL_2D_COL,
+        data_parallel_rank,
+        pipeline_parallel_rank,
+        pipeline_parallel_size,
+        tensor_parallel_size
+    )
+
+    # dense reference: A = C @ B^T computed on fresh leaf copies of the full tensors
+    C_master = C_master.clone()
+    C_master.requires_grad = True
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    A_master = torch.matmul(C_master, B_master.transpose(0, 1))
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[j]
+    check_equal(out, A)
+    print_rank_0('ABT forward: pass')
+
+    grad_shape = A_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[j]
+
+    # backward
+    out.backward(grad)
+
+    A_master.backward(grad_master)
+    C_grad = C_master.grad
+    C_grad = torch.chunk(C_grad, DEPTH, dim=0)[i]
+    C_grad = torch.chunk(C_grad, DEPTH, dim=-1)[j]
+    check_equal(C_grad, C.grad)
+
+    B_grad = B_master.grad
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[i]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[j]
+    check_equal(B_grad, B.grad)
+    print_rank_0('ABT backward: pass')
+
+
+def check_ATB():  # verify Matmul_ATB_2D against a dense A^T @ C, forward and backward
+    data_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.DATA) else gpc.get_local_rank(ParallelMode.DATA)
+    pipeline_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(
+        ParallelMode.PIPELINE)
+    pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(
+        ParallelMode.PIPELINE)
+    tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR)
+
+    device = get_current_device()
+    dtype = torch.float
+
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)  # used to slice the last dimension
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)  # used to slice the first dimension
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)  # every rank adopts rank 0's random operand
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[j]  # local block (i, j) of A
+    A = A.clone()
+    A.requires_grad = True
+
+    C_shape = (BATCH_SIZE, SEQ_LENGTH, 4 * HIDDEN_SIZE)
+    C_master = torch.randn(C_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(C_master, src=0)
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[j]  # local block (i, j) of C
+    C = C.clone()
+    C.requires_grad = True
+
+    out = Matmul_ATB_2D.apply(
+        A, C,
+        DEPTH, (HIDDEN_SIZE // DEPTH, 4 * HIDDEN_SIZE // DEPTH),
+        i, j,
+        ParallelMode.PARALLEL_2D_ROW,
+        ParallelMode.PARALLEL_2D_COL,
+        data_parallel_rank,
+        pipeline_parallel_rank,
+        pipeline_parallel_size,
+        tensor_parallel_size
+    )
+
+    # dense reference: B = A^T @ C with batch and sequence dims flattened into one
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    C_master = C_master.clone()
+    C_master.requires_grad = True
+    B_master = torch.matmul(
+        A_master.view(-1, A_master.shape[-1]).transpose(0, 1),
+        C_master.view(-1, C_master.shape[-1]))
+    B = torch.chunk(B_master, DEPTH, dim=0)[i]
+    B = torch.chunk(B, DEPTH, dim=-1)[j]
+    check_equal(out, B)
+    print_rank_0('ATB forward: pass')
+
+    grad_shape = B_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[j]
+
+    out.backward(grad)
+
+    B_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[j]
+    check_equal(A_grad, A.grad)
+
+    C_grad = C_master.grad
+    C_grad = torch.chunk(C_grad, DEPTH, dim=0)[i]
+    C_grad = torch.chunk(C_grad, DEPTH, dim=-1)[j]
+    check_equal(C_grad, C.grad)
+    print_rank_0('ATB backward: pass')
--- a/tests/test_layers/test_2p5d/common.py
+++ b/tests/test_layers/test_2p5d/common.py
@@ -0,0 +1,11 @@
+import torch
+
+TESSERACT_DIM = 2
+TESSERACT_DEP = 2
+BATCH_SIZE = 8
+SEQ_LENGTH = 8
+HIDDEN_SIZE = 8
+
+
+def check_equal(A, B):  # assert that tensors A and B agree within rtol=1e-5 / atol=1e-2
+    assert torch.allclose(A, B, rtol=1e-5, atol=1e-2)  # allclose already returns a bool; raises AssertionError on mismatch
--- a/tests/test_layers/test_2p5d/test.sh
+++ b/tests/test_layers/test_2p5d/test.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+python -m torch.distributed.launch --nproc_per_node 8 test_2p5d.py --host "$HOST" --port 29516 --world_size 8
--- a/tests/test_layers/test_2p5d/test_2p5d.py
+++ b/tests/test_layers/test_2p5d/test_2p5d.py
@@ -0,0 +1,41 @@
+import pytest
+
+from colossalai.core import global_context as gpc
+from colossalai.initialize import init_dist
+from test_layer import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer
+from test_operation import check_AB, check_ABT, check_ATB
+
+CONFIG = dict(  # handed to init_dist(config=CONFIG) by the test entry point in this file
+    parallel=dict(
+        pipeline=dict(size=1),
+        tensor=dict(size=8, mode='2.5d', depth=2),
+    ),
+)
+
+
+def check_operations():  # run the three operator checks imported from test_operation
+    check_AB()
+    check_ABT()
+    check_ATB()
+
+
+def check_layer():  # run every layer check imported from test_layer, in sequence
+    check_linear()
+    check_layernorm()
+    check_attention()
+    check_mlp()
+    check_transformerlayer()
+
+
+@pytest.mark.dist
+@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
+def test_2p5d():  # entry point for the 2.5D tensor-parallel checks; skipped under plain pytest
+    init_dist(config=CONFIG)
+    gpc.set_seed()
+    check_layer()  # layer checks run first here
+    check_operations()  # operator checks run afterwards
+    gpc.destroy()
+
+
+if __name__ == '__main__':  # direct-launch entry point; pytest itself skips the test (see the skip marker)
+    test_2p5d()
--- a/tests/test_layers/test_2p5d/test_layer.py
+++ b/tests/test_layers/test_2p5d/test_layer.py
@@ -0,0 +1,265 @@
+from torch.nn import Parameter
+
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.nn import (Linear2p5D, LayerNorm2p5D, TransformerSelfAttention2p5D, TransformerMLP2p5D,
+                           TransformerLayer2p5D)
+from colossalai.utils import get_current_device
+from colossalai.utils import print_rank_0
+from common import *
+
+
+def check_linear():
+    """Check Linear2p5D forward and backward against a dense reference.
+
+    Master tensors are broadcast from rank 0 so that every rank works on the
+    same data. Each rank keeps the chunk of the input and weight selected by
+    its column rank ``i`` (dim 0) and row rank ``j`` (last dim), runs the
+    parallel layer on it, and compares the output and the gradients with the
+    matching chunk of ``A_master @ W_master + B_master`` computed locally.
+
+    Raises:
+        AssertionError: via ``check_equal`` when a result differs from the
+            reference.
+    """
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    OUTPUT_SIZE = 2 * HIDDEN_SIZE
+
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)
+
+    layer = Linear2p5D(
+        INPUT_SIZE,
+        OUTPUT_SIZE,
+        dtype=dtype,
+        skip_bias_add=False)
+
+    # Input: broadcast from rank 0, then keep this rank's (i, j) chunk.
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, TESSERACT_DIM, dim=0)[i]
+    A = torch.chunk(A, TESSERACT_DIM, dim=-1)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    # Weight: chunked the same way as the input.
+    W_shape = (INPUT_SIZE, OUTPUT_SIZE)
+    W_master = torch.randn(W_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(W_master, src=0)
+    W = torch.chunk(W_master, TESSERACT_DIM, dim=0)[i]
+    W = torch.chunk(W, TESSERACT_DIM, dim=-1)[j]
+    W = W.clone()
+    W.requires_grad = True
+
+    # Bias: one-dimensional, chunked by the row rank only.
+    # The trailing comma makes this a real 1-tuple rather than a bare int.
+    B_shape = (OUTPUT_SIZE,)
+    B_master = torch.randn(B_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(B_master, src=0)
+    B = torch.chunk(B_master, TESSERACT_DIM, dim=0)[j]
+    B = B.clone()
+    B.requires_grad = True
+
+    # Overwrite the layer's parameters with the known chunks.
+    layer.weight = Parameter(W)
+    layer.bias = Parameter(B)
+    out = layer(A)
+
+    # Dense reference on fresh leaf copies so that .grad gets populated.
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    W_master = W_master.clone()
+    W_master.requires_grad = True
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    C_master = torch.matmul(A_master, W_master) + B_master
+    C = torch.chunk(C_master, TESSERACT_DIM, dim=0)[i]
+    C = torch.chunk(C, TESSERACT_DIM, dim=-1)[j]
+
+    check_equal(out, C)
+    print_rank_0('linear forward: pass')
+
+    # Upstream gradient: shared across ranks, chunked like the output.
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, TESSERACT_DIM, dim=0)[i]
+    grad = torch.chunk(grad, TESSERACT_DIM, dim=-1)[j]
+    out.backward(grad)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, TESSERACT_DIM, dim=0)[i]
+    A_grad = torch.chunk(A_grad, TESSERACT_DIM, dim=-1)[j]
+    check_equal(A_grad, A.grad)
+
+    W_grad = W_master.grad
+    W_grad = torch.chunk(W_grad, TESSERACT_DIM, dim=0)[i]
+    W_grad = torch.chunk(W_grad, TESSERACT_DIM, dim=-1)[j]
+    check_equal(W_grad, layer.weight.grad)
+
+    B_grad = B_master.grad
+    B_grad = torch.chunk(B_grad, TESSERACT_DIM, dim=0)[j]
+    # Only ranks with column rank 0 compare the bias gradient.
+    # NOTE(review): presumably the layer reduces the bias gradient onto
+    # those ranks — confirm against Linear2p5D.
+    if i == 0:
+        check_equal(B_grad, layer.bias.grad)
+
+    print_rank_0('linear backward: pass')
+
+
+def check_layernorm():
+    """Check LayerNorm2p5D forward and backward against a hand-written reference.
+
+    The input is broadcast from rank 0 and each rank keeps the chunk selected
+    by its column rank ``i`` (dim 0) and row rank ``j`` (last dim). The
+    reference normalises the full tensor over its last dimension using
+    ``mean`` and ``E[x^2] - mean^2`` with ``EPS`` added before the square
+    root; no scale or shift is applied in the reference.
+
+    Raises:
+        AssertionError: via ``check_equal`` when a result differs from the
+            reference.
+    """
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    EPS = 1e-12
+
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)
+
+    layernorm = LayerNorm2p5D(
+        INPUT_SIZE,
+        dtype=dtype)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, TESSERACT_DIM, dim=0)[i]
+    A = torch.chunk(A, TESSERACT_DIM, dim=-1)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    out = layernorm(A)
+
+    # Reference on a fresh leaf copy so that A_master.grad gets populated.
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    # Mean over the last dimension.
+    E_master = torch.sum(A_master, dim=-1, keepdim=True)
+    E_master /= INPUT_SIZE
+    # Variance as E[x^2] - E[x]^2, then turned into 1 / sqrt(var + EPS).
+    V_master = torch.sum(A_master * A_master, dim=-1, keepdim=True)
+    V_master /= INPUT_SIZE
+    V_master = V_master - E_master * E_master
+    V_master = 1.0 / torch.sqrt(V_master + EPS)
+    C_master = (A_master - E_master) * V_master
+    C = torch.chunk(C_master, TESSERACT_DIM, dim=0)[i]
+    C = torch.chunk(C, TESSERACT_DIM, dim=-1)[j]
+
+    check_equal(out, C)
+    print_rank_0('layer norm forward: pass')
+
+    # Upstream gradient: shared across ranks, chunked like the output.
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, TESSERACT_DIM, dim=0)[i]
+    grad = torch.chunk(grad, TESSERACT_DIM, dim=-1)[j]
+    out.backward(grad)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, TESSERACT_DIM, dim=0)[i]
+    A_grad = torch.chunk(A_grad, TESSERACT_DIM, dim=-1)[j]
+    check_equal(A_grad, A.grad)
+    print_rank_0('layer norm backward: pass')
+
+
+def check_attention():
+    """Smoke-test TransformerSelfAttention2p5D forward and backward shapes.
+
+    The layer is built with dropout probabilities of 0.5 and no reference
+    values are computed; only the shape of the output and of the input
+    gradient are asserted.
+
+    Raises:
+        AssertionError: when the output or the input gradient has an
+            unexpected shape.
+    """
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    NUM_ATTENTION_HEADS = 2
+
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)
+
+    layer = TransformerSelfAttention2p5D(
+        HIDDEN_SIZE, NUM_ATTENTION_HEADS,
+        attention_dropout_prob=0.5,
+        hidden_dropout_prob=0.5,
+        dtype=dtype,
+    )
+
+    # Input: broadcast from rank 0, then keep this rank's (i, j) chunk.
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, TESSERACT_DIM, dim=0)[i]
+    A = torch.chunk(A, TESSERACT_DIM, dim=-1)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    # All-zero mask sized for the local batch and local head count.
+    mask_shape = (BATCH_SIZE // TESSERACT_DIM, NUM_ATTENTION_HEADS // TESSERACT_DIM, SEQ_LENGTH, SEQ_LENGTH)
+    attention_mask = torch.zeros(mask_shape, dtype=dtype, device=device)
+
+    out = layer(A, attention_mask)
+    assert out.shape == (BATCH_SIZE // TESSERACT_DIM, SEQ_LENGTH, INPUT_SIZE // TESSERACT_DIM)
+    print_rank_0('self attention forward: pass')
+
+    # Random local upstream gradient; it is not broadcast between ranks.
+    grad_shape = out.shape
+    grad = torch.randn(grad_shape, dtype=dtype, device=device)
+
+    out.backward(grad)
+    assert A.grad.shape == A.shape
+    print_rank_0('self attention backward: pass')
+
+
+def check_mlp():
+    """Smoke-test TransformerMLP2p5D forward and backward shapes.
+
+    The layer is built with ``mlp_ratio=1``, a dropout probability of 0.5 and
+    the 'gelu' activation. No reference values are computed; only the shape
+    of the output and of the input gradient are asserted.
+
+    Raises:
+        AssertionError: when the output or the input gradient has an
+            unexpected shape.
+    """
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)
+
+    layer = TransformerMLP2p5D(
+        HIDDEN_SIZE,
+        mlp_ratio=1,
+        dropout_prob=0.5,
+        act_func='gelu',
+        dtype=dtype,
+    )
+
+    # Input: broadcast from rank 0, then keep this rank's (i, j) chunk.
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, TESSERACT_DIM, dim=0)[i]
+    A = torch.chunk(A, TESSERACT_DIM, dim=-1)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    out = layer(A)
+    assert out.shape == (BATCH_SIZE // TESSERACT_DIM, SEQ_LENGTH, INPUT_SIZE // TESSERACT_DIM)
+    print_rank_0('mlp forward: pass')
+
+    # Random local upstream gradient; it is not broadcast between ranks.
+    grad_shape = out.shape
+    grad = torch.randn(grad_shape, dtype=dtype, device=device)
+
+    out.backward(grad)
+    assert A.grad.shape == A.shape
+    print_rank_0('mlp backward: pass')
+
+
+def check_transformerlayer():
+    """Smoke-test TransformerLayer2p5D forward and backward shapes.
+
+    The layer is built with the 'gelu' activation and dropout probabilities
+    of 0.5. No reference values are computed; only the shape of the output
+    and of the input gradient are asserted.
+
+    Raises:
+        AssertionError: when the output or the input gradient has an
+            unexpected shape.
+    """
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    NUM_ATTENTION_HEADS = 2
+
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)
+
+    layer = TransformerLayer2p5D(
+        HIDDEN_SIZE,
+        NUM_ATTENTION_HEADS,
+        act_func='gelu',
+        attention_dropout_prob=0.5,
+        hidden_dropout_prob=0.5,
+        dtype=dtype,
+    )
+
+    # Input: broadcast from rank 0, then keep this rank's (i, j) chunk.
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, TESSERACT_DIM, dim=0)[i]
+    A = torch.chunk(A, TESSERACT_DIM, dim=-1)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    # All-zero mask sized for the local batch and local head count.
+    mask_shape = (BATCH_SIZE // TESSERACT_DIM, NUM_ATTENTION_HEADS // TESSERACT_DIM, SEQ_LENGTH, SEQ_LENGTH)
+    attention_mask = torch.zeros(mask_shape, dtype=dtype, device=device)
+
+    out = layer(A, attention_mask)
+    assert out.shape == (BATCH_SIZE // TESSERACT_DIM, SEQ_LENGTH, INPUT_SIZE // TESSERACT_DIM)
+    print_rank_0('transformerlayer forward: pass')
+
+    # Random local upstream gradient; it is not broadcast between ranks.
+    grad_shape = out.shape
+    grad = torch.randn(grad_shape, dtype=dtype, device=device)
+
+    out.backward(grad)
+    assert A.grad.shape == A.shape
+    print_rank_0('transformerlayer backward: pass')
--- a/tests/test_layers/test_2p5d/test_operation.py
+++ b/tests/test_layers/test_2p5d/test_operation.py
@@ -0,0 +1,239 @@
+import torch
+
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, \
+    Matmul_ATB_2p5D
+from colossalai.utils import get_current_device
+from colossalai.utils import print_rank_0
+from common import *
+
+
+def check_AB():
+    """Check Matmul_AB_2p5D (C = A @ B) forward and backward against torch.matmul.
+
+    ``A`` and ``B`` are broadcast from rank 0 and each rank keeps the chunk
+    selected by its column rank ``i`` (dim 0) and row rank ``j`` (last dim).
+    The parallel result and both input gradients are compared with the
+    matching chunks of the dense computation.
+
+    Raises:
+        AssertionError: via ``check_equal`` when a result differs from the
+            reference.
+    """
+    # Fall back to rank 0 / size 1 when a parallel mode is not initialised.
+    data_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.DATA) else gpc.get_local_rank(ParallelMode.DATA)
+    pipeline_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(
+        ParallelMode.PIPELINE)
+    pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(
+        ParallelMode.PIPELINE)
+    tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR)
+
+    dtype = torch.float
+    device = get_current_device()
+
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)
+    k = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_DEP)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, TESSERACT_DIM, dim=0)[i]
+    A = torch.chunk(A, TESSERACT_DIM, dim=-1)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    B_shape = (HIDDEN_SIZE, 4 * HIDDEN_SIZE)
+    B_master = torch.randn(B_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(B_master, src=0)
+    B = torch.chunk(B_master, TESSERACT_DIM, dim=0)[i]
+    B = torch.chunk(B, TESSERACT_DIM, dim=-1)[j]
+    B = B.clone()
+    B.requires_grad = True
+
+    # Shape of the local output chunk handed to the operation.
+    out_shape = (BATCH_SIZE // TESSERACT_DIM, SEQ_LENGTH, 4 * HIDDEN_SIZE // TESSERACT_DIM)
+    out = Matmul_AB_2p5D.apply(
+        A, B,
+        TESSERACT_DIM, TESSERACT_DEP, out_shape,
+        i, j, k,
+        ParallelMode.PARALLEL_2P5D_ROW,
+        ParallelMode.PARALLEL_2P5D_COL,
+        ParallelMode.PARALLEL_2P5D_DEP,
+        data_parallel_rank,
+        pipeline_parallel_rank,
+        pipeline_parallel_size,
+        tensor_parallel_size)
+
+    # Dense reference on fresh leaf copies so that .grad gets populated.
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    C_master = torch.matmul(A_master, B_master)
+    C = torch.chunk(C_master, TESSERACT_DIM, dim=0)[i]
+    C = torch.chunk(C, TESSERACT_DIM, dim=-1)[j]
+    # check forward correctness
+    check_equal(out, C)
+    print_rank_0('AB forward: pass')
+
+    # Upstream gradient: shared across ranks, chunked like the output.
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, TESSERACT_DIM, dim=0)[i]
+    grad = torch.chunk(grad, TESSERACT_DIM, dim=-1)[j]
+
+    out.backward(grad)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, TESSERACT_DIM, dim=0)[i]
+    A_grad = torch.chunk(A_grad, TESSERACT_DIM, dim=-1)[j]
+    # check backward correctness
+    check_equal(A_grad, A.grad)
+
+    B_grad = B_master.grad
+    B_grad = torch.chunk(B_grad, TESSERACT_DIM, dim=0)[i]
+    B_grad = torch.chunk(B_grad, TESSERACT_DIM, dim=-1)[j]
+    # check backward correctness
+    check_equal(B_grad, B.grad)
+    print_rank_0('AB backward: pass')
+
+
+def check_ABT():
+    """Check Matmul_ABT_2p5D (A = C @ B^T) forward and backward against torch.matmul.
+
+    ``C`` and ``B`` are broadcast from rank 0 and each rank keeps the chunk
+    selected by its column rank ``i`` (dim 0) and row rank ``j`` (last dim).
+    The parallel result and both input gradients are compared with the
+    matching chunks of the dense computation.
+
+    Raises:
+        AssertionError: via ``check_equal`` when a result differs from the
+            reference.
+    """
+    # Fall back to rank 0 / size 1 when a parallel mode is not initialised.
+    data_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.DATA) else gpc.get_local_rank(ParallelMode.DATA)
+    pipeline_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(
+        ParallelMode.PIPELINE)
+    pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(
+        ParallelMode.PIPELINE)
+    tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR)
+
+    dtype = torch.float
+    device = get_current_device()
+
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)
+    k = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_DEP)
+
+    C_shape = (BATCH_SIZE, SEQ_LENGTH, 4 * HIDDEN_SIZE)
+    C_master = torch.randn(C_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(C_master, src=0)
+    C = torch.chunk(C_master, TESSERACT_DIM, dim=0)[i]
+    C = torch.chunk(C, TESSERACT_DIM, dim=-1)[j]
+    C = C.clone()
+    C.requires_grad = True
+
+    B_shape = (HIDDEN_SIZE, 4 * HIDDEN_SIZE)
+    B_master = torch.randn(B_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(B_master, src=0)
+    B = torch.chunk(B_master, TESSERACT_DIM, dim=0)[i]
+    B = torch.chunk(B, TESSERACT_DIM, dim=-1)[j]
+    B = B.clone()
+    B.requires_grad = True
+
+    # The third positional group carries the shape of the local output chunk.
+    out = Matmul_ABT_2p5D.apply(
+        C, B,
+        TESSERACT_DIM, TESSERACT_DEP, (BATCH_SIZE // TESSERACT_DIM, SEQ_LENGTH, HIDDEN_SIZE // TESSERACT_DIM),
+        i, j, k,
+        ParallelMode.PARALLEL_2P5D_ROW,
+        ParallelMode.PARALLEL_2P5D_COL,
+        ParallelMode.PARALLEL_2P5D_DEP,
+        data_parallel_rank,
+        pipeline_parallel_rank,
+        pipeline_parallel_size,
+        tensor_parallel_size)
+
+    # Dense reference on fresh leaf copies so that .grad gets populated.
+    C_master = C_master.clone()
+    C_master.requires_grad = True
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    A_master = torch.matmul(C_master, B_master.transpose(0, 1))
+    A = torch.chunk(A_master, TESSERACT_DIM, dim=0)[i]
+    A = torch.chunk(A, TESSERACT_DIM, dim=-1)[j]
+    check_equal(out, A)
+    print_rank_0('ABT forward: pass')
+
+    # Upstream gradient: shared across ranks, chunked like the output.
+    grad_shape = A_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, TESSERACT_DIM, dim=0)[i]
+    grad = torch.chunk(grad, TESSERACT_DIM, dim=-1)[j]
+
+    # backward
+    out.backward(grad)
+
+    A_master.backward(grad_master)
+    C_grad = C_master.grad
+    C_grad = torch.chunk(C_grad, TESSERACT_DIM, dim=0)[i]
+    C_grad = torch.chunk(C_grad, TESSERACT_DIM, dim=-1)[j]
+    check_equal(C_grad, C.grad)
+
+    B_grad = B_master.grad
+    B_grad = torch.chunk(B_grad, TESSERACT_DIM, dim=0)[i]
+    B_grad = torch.chunk(B_grad, TESSERACT_DIM, dim=-1)[j]
+    check_equal(B_grad, B.grad)
+    print_rank_0('ABT backward: pass')
+
+
+def check_ATB():
+    """Check Matmul_ATB_2p5D (B = A^T @ C) forward and backward against torch.matmul.
+
+    ``A`` and ``C`` are broadcast from rank 0 and each rank keeps the chunk
+    selected by its column rank ``i`` (dim 0) and row rank ``j`` (last dim).
+    The reference flattens both tensors to two dimensions before multiplying.
+    The parallel result and both input gradients are compared with the
+    matching chunks of the dense computation.
+
+    Raises:
+        AssertionError: via ``check_equal`` when a result differs from the
+            reference.
+    """
+    # Fall back to rank 0 / size 1 when a parallel mode is not initialised.
+    data_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.DATA) else gpc.get_local_rank(ParallelMode.DATA)
+    pipeline_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(
+        ParallelMode.PIPELINE)
+    pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(
+        ParallelMode.PIPELINE)
+    tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR)
+
+    device = get_current_device()
+    dtype = torch.float
+
+    i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)
+    j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)
+    k = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_DEP)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, TESSERACT_DIM, dim=0)[i]
+    A = torch.chunk(A, TESSERACT_DIM, dim=-1)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    C_shape = (BATCH_SIZE, SEQ_LENGTH, 4 * HIDDEN_SIZE)
+    C_master = torch.randn(C_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(C_master, src=0)
+    C = torch.chunk(C_master, TESSERACT_DIM, dim=0)[i]
+    C = torch.chunk(C, TESSERACT_DIM, dim=-1)[j]
+    C = C.clone()
+    C.requires_grad = True
+
+    # The third positional group carries the shape of the local output chunk.
+    out = Matmul_ATB_2p5D.apply(
+        A, C,
+        TESSERACT_DIM, TESSERACT_DEP, (HIDDEN_SIZE // TESSERACT_DIM, 4 * HIDDEN_SIZE // TESSERACT_DIM),
+        i, j, k,
+        ParallelMode.PARALLEL_2P5D_ROW,
+        ParallelMode.PARALLEL_2P5D_COL,
+        ParallelMode.PARALLEL_2P5D_DEP,
+        data_parallel_rank,
+        pipeline_parallel_rank,
+        pipeline_parallel_size,
+        tensor_parallel_size)
+
+    # Dense reference on fresh leaf copies so that .grad gets populated.
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    C_master = C_master.clone()
+    C_master.requires_grad = True
+    # Collapse the leading dimensions so that A^T @ C is a plain 2-D product.
+    B_master = torch.matmul(
+        A_master.view(-1, A_master.shape[-1]).transpose(0, 1),
+        C_master.view(-1, C_master.shape[-1]))
+    B = torch.chunk(B_master, TESSERACT_DIM, dim=0)[i]
+    B = torch.chunk(B, TESSERACT_DIM, dim=-1)[j]
+    check_equal(out, B)
+    print_rank_0('ATB forward: pass')
+
+    # Upstream gradient: shared across ranks, chunked like the output.
+    grad_shape = B_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, TESSERACT_DIM, dim=0)[i]
+    grad = torch.chunk(grad, TESSERACT_DIM, dim=-1)[j]
+
+    out.backward(grad)
+
+    B_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, TESSERACT_DIM, dim=0)[i]
+    A_grad = torch.chunk(A_grad, TESSERACT_DIM, dim=-1)[j]
+    check_equal(A_grad, A.grad)
+
+    C_grad = C_master.grad
+    C_grad = torch.chunk(C_grad, TESSERACT_DIM, dim=0)[i]
+    C_grad = torch.chunk(C_grad, TESSERACT_DIM, dim=-1)[j]
+    check_equal(C_grad, C.grad)
+    print_rank_0('ATB backward: pass')
--- a/tests/test_layers/test_3d/common.py
+++ b/tests/test_layers/test_3d/common.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch
+
+# Shared sizes for the 3D tensor-parallel tests.
+# Number of chunks the tests split tensors into along each sharded dimension.
+DEPTH = 2
+BATCH_SIZE = 512  # leading dimension of the test tensors
+SEQ_LENGTH = 128  # middle dimension of the test tensors
+HIDDEN_SIZE = 512  # trailing (feature) dimension of the test tensors
+NUM_CLASSES = 10  # presumably the classifier head's output size — TODO confirm
+NUM_BLOCKS = 6  # multiplier applied to per-block timings in the ViT estimate
+IMG_SIZE = 32  # presumably the input image side length — TODO confirm
+
+def check_equal(A, B):
+    """Return whether ``A`` and ``B`` are close under rtol=1e-5 and atol=1e-2."""
+    is_close = torch.allclose(A, B, rtol=1e-5, atol=1e-2)
+    return is_close
--- a/tests/test_layers/test_3d/test.sh
+++ b/tests/test_layers/test_3d/test.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Launch the 3D layer tests on 8 processes. Launcher options such as
+# --nproc_per_node must come BEFORE the script name; everything after the
+# script name is forwarded to test_3d.py as its own arguments.
+python -m torch.distributed.launch --nproc_per_node 8 test_3d.py --host $HOST --port 29516 --world_size 8
+
+# expected test output
+#  distributed environment initialized
+#  AB forward: pass
+#  AB backward: pass
+#  ABT forward: pass
+#  ABT backward: pass
+#  ATB forward: pass
+#  ATB backward: pass
+#  linear forward: pass
+#  linear backward: pass
+#  layer norm forward: pass
+#  layer norm backward: pass
+#  self attention forward: pass
+#  self attention backward: pass
+#  mlp forward: pass
+#  mlp backward: pass
+#  transformerlayer forward: pass
+#  transformerlayer backward: pass
--- a/tests/test_layers/test_3d/test_3d.py
+++ b/tests/test_layers/test_3d/test_3d.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from colossalai.initialize import init_dist
+
+from test_layer import *
+from test_operation import *
+
+# Configuration handed to init_dist: pipeline set to 1, a tensor parallel
+# group of size 8 in '3d' mode, and seed 0.
+CONFIG = dict(parallel=dict(pipeline=1, tensor=dict(mode='3d', size=8)),
+              seed=0)
+
+
+def check_operations():
+    """Run the enabled 3D operation checks in sequence."""
+    # check_pooler is intentionally left out: it is disabled for now.
+    enabled_checks = (
+        check_AB,
+        check_ABT,
+        check_ATB,
+        check_add,
+        check_mul,
+        check_sum,
+    )
+    for run_check in enabled_checks:
+        run_check()
+
+
+def check_layer():
+    """Run every 3D layer check and log an estimated ViT forward/backward time.
+
+    Each check returns a ``(forward, backward)`` pair of timings. One block
+    is costed as norm + attention + norm + mlp, and the full estimate is
+    embed + NUM_BLOCKS blocks + norm + head + loss. The linear timings are
+    measured but do not enter the estimate.
+    """
+    logger = get_global_dist_logger()
+
+    linear_fwd, linear_bwd = check_linear()
+    norm_fwd, norm_bwd = check_layernorm()
+    attn_fwd, attn_bwd = check_attention()
+    mlp_fwd, mlp_bwd = check_mlp()
+    head_fwd, head_bwd = check_head()
+    embed_fwd, embed_bwd = check_embed()
+    loss_fwd, loss_bwd = check_loss()
+
+    # Forward estimate.
+    block_fwd = norm_fwd + attn_fwd + norm_fwd + mlp_fwd
+    fwd_time = embed_fwd + NUM_BLOCKS * block_fwd + norm_fwd + head_fwd + loss_fwd
+
+    # Backward estimate, built the same way.
+    block_bwd = norm_bwd + attn_bwd + norm_bwd + mlp_bwd
+    bwd_time = embed_bwd + NUM_BLOCKS * block_bwd + norm_bwd + head_bwd + loss_bwd
+
+    summary = 'ViT forward time: {:.3f} s | backward time: {:.3f} s'.format(
+        fwd_time, bwd_time)
+    logger.info(summary, ranks=[0])
+
+
+def _test_main():
+    """Initialise the distributed context, then run the operation and layer checks."""
+    # init dist
+    init_dist(CONFIG)
+    logger = get_global_dist_logger()
+    logger.info('Distributed environment is initialized.', ranks=[0])
+
+    global_context.set_seed()
+    # Turn on cuDNN benchmark mode for the checks that follow.
+    torch.backends.cudnn.benchmark = True
+
+    # check operation
+    check_operations()
+
+    # check layers
+    check_layer()
+
+
+# Script entry point used by the launcher.
+if __name__ == '__main__':
+    _test_main()
--- a/tests/test_layers/test_3d/test_conn.py
+++ b/tests/test_layers/test_3d/test_conn.py
@@ -0,0 +1,19 @@
+import torch
+import torch.distributed as dist
+
+from colossalai.initialize import parse_args
+from colossalai.utils import get_current_device
+
+# Read the world size and this process's rank from the parsed arguments.
+ARGS = parse_args()
+size = ARGS.world_size
+rank = ARGS.local_rank
+
+# Join the NCCL process group through a TCP rendezvous at host:port.
+init_method = f'tcp://{ARGS.host}:{ARGS.port}'
+dist.init_process_group(backend='nccl', rank=rank, world_size=size, init_method=init_method)
+print('Rank {} / {}'.format(dist.get_rank(), dist.get_world_size()))
+
+# Connectivity smoke test: all-reduce a small random tensor and print the
+# result held by this rank.
+SIZE = 8
+tensor = torch.randn(SIZE)
+tensor = tensor.to(get_current_device())
+dist.all_reduce(tensor)
+print('Rank {0}: {1}'.format(rank, tensor.detach().cpu().numpy().tolist()))
--- a/tests/test_layers/test_3d/test_layer.py
+++ b/tests/test_layers/test_3d/test_layer.py
@@ -0,0 +1,640 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import math
+import time
+
+import numpy as np
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context
+from colossalai.logging import get_global_dist_logger
+from colossalai.registry import LAYERS, LOSSES
+from colossalai.utils import get_current_device, print_rank_0
+
+from common import *
+
+
+def check_linear():
+    """Compare Linear3D with torch.nn.Linear and time its forward/backward pass.
+
+    Both layers get an all-ones weight and an all-zeros bias. The input is
+    broadcast from rank 0 and chunked by the local ranks ``i`` (weight
+    group), ``k`` (output group) and ``j`` (input group). Comparison results
+    are logged as booleans per rank; nothing is asserted.
+
+    Returns:
+        tuple: ``(forward_seconds, backward_seconds)`` measured with
+        ``time.time()`` around the parallel layer's forward and backward.
+    """
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    OUTPUT_SIZE = 2 * HIDDEN_SIZE
+
+    j = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+
+    layer = LAYERS.get_module('Linear3D')(INPUT_SIZE,
+                                          OUTPUT_SIZE,
+                                          ParallelMode.PARALLEL_3D_INPUT,
+                                          ParallelMode.PARALLEL_3D_WEIGHT,
+                                          dtype=dtype,
+                                          bias=True)
+    torch.nn.init.zeros_(layer.bias)
+    torch.nn.init.ones_(layer.weight)
+    layer = layer.to(device)
+    # Dense reference layer with the same deterministic initialisation.
+    layer_master = torch.nn.Linear(INPUT_SIZE, OUTPUT_SIZE)
+    torch.nn.init.zeros_(layer_master.bias)
+    torch.nn.init.ones_(layer_master.weight)
+    layer_master = layer_master.to(device)
+
+    # Input: broadcast from rank 0, then chunked on dim 0, last dim, dim 0.
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    fwd_start = time.time()
+    out = layer(A)
+    fwd_end = time.time()
+    print_rank_0(
+        'linear forward: {0} --> {1} | {2:.3f} s'.format(
+            tuple(A.shape), tuple(out.shape), fwd_end - fwd_start), logger)
+    # Reference forward on a fresh leaf copy so that .grad gets populated.
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    C_master = layer_master(A_master)
+    # The output uses j on the last dim and k on the second dim-0 split.
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[j]
+    C = torch.chunk(C, DEPTH, dim=0)[k]
+    logger.info('Rank {} linear forward: {}'.format(rank, check_equal(out, C)))
+
+    # Upstream gradient: shared across ranks, chunked like the output.
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape,
+                              dtype=dtype,
+                              device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[j]
+    grad = torch.chunk(grad, DEPTH, dim=0)[k]
+
+    bwd_start = time.time()
+    out.backward(grad)
+    bwd_end = time.time()
+    print_rank_0('linear backward: {:.3f} s'.format(bwd_end - bwd_start),
+                 logger)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[k]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[j]
+    logger.info('Rank {} linear backward (input_grad): {}'.format(
+        rank, check_equal(A_grad, A.grad)))
+
+    # torch.nn.Linear stores its weight as (out, in); transpose it before
+    # chunking to compare with the parallel layer's weight gradient.
+    B_grad = layer_master.weight.grad.transpose(0, 1)
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[k]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[j]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[i]
+    logger.info('Rank {} linear backward (weight_grad): {}'.format(
+        rank, check_equal(B_grad, layer.weight.grad)))
+
+    # Ranks with j == k compare the bias gradient; all others only report
+    # whether their bias gradient is None.
+    if j == k:
+        bias_grad = layer_master.bias.grad
+        bias_grad = torch.chunk(bias_grad, DEPTH)[j]
+        bias_grad = torch.chunk(bias_grad, DEPTH)[i]
+        logger.info('Rank {} linear backward (bias_grad): {}'.format(
+            rank, check_equal(bias_grad, layer.bias.grad)))
+    else:
+        logger.info('Rank {} linear backward (bias_grad): {}'.format(
+            rank,
+            layer.bias.grad is None))
+
+    return fwd_end - fwd_start, bwd_end - bwd_start
+
+
+def check_layernorm():
+    """Compare LayerNorm3D with torch.nn.LayerNorm and time its forward/backward pass.
+
+    Both layers use ``eps=1e-6``. The input is broadcast from rank 0 and
+    chunked by the local ranks ``i`` (weight group), ``k`` (output group)
+    and ``j`` (input group). Comparison results are logged as booleans per
+    rank; nothing is asserted.
+
+    Returns:
+        tuple: ``(forward_seconds, backward_seconds)`` measured with
+        ``time.time()`` around the parallel layer's forward and backward.
+    """
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+
+    j = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+
+    norm = LAYERS.get_module('LayerNorm3D')(INPUT_SIZE,
+                                            ParallelMode.PARALLEL_3D_INPUT,
+                                            ParallelMode.PARALLEL_3D_WEIGHT,
+                                            eps=1e-6,
+                                            dtype=dtype)
+    norm = norm.to(device)
+    # Dense reference layer.
+    norm_master = torch.nn.LayerNorm(INPUT_SIZE, eps=1e-6)
+    norm_master = norm_master.to(device)
+
+    # Input: broadcast from rank 0, then chunked on dim 0, last dim, dim 0.
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    fwd_start = time.time()
+    out = norm(A)
+    fwd_end = time.time()
+    print_rank_0(
+        'layer norm forward: pass | {0} --> {1} | {2:.3f} s'.format(
+            tuple(A.shape), tuple(out.shape), fwd_end - fwd_start), logger)
+
+    # Reference forward on a fresh leaf copy so that .grad gets populated.
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    C_master = norm_master(A_master)
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[k]
+    C = torch.chunk(C, DEPTH, dim=0)[j]
+    logger.info('Rank {} layernorm forward: {}'.format(rank,
+                                                       check_equal(out, C)))
+
+    # Upstream gradient: shared across ranks, chunked like the output.
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[k]
+    grad = torch.chunk(grad, DEPTH, dim=0)[j]
+
+    bwd_start = time.time()
+    out.backward(grad)
+    bwd_end = time.time()
+    print_rank_0(
+        'layer norm backward: pass | {:.3f} s'.format(bwd_end - bwd_start),
+        logger)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[k]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[j]
+    logger.info('Rank {} layernorm backward (input_grad): {}'.format(
+        rank, check_equal(A_grad, A.grad)))
+
+    # Ranks with j == k compare the parameter gradients; all others only
+    # report whether their gradients are None. The messages now name the
+    # layer under test instead of the copy-pasted 'linear'.
+    if j == k:
+        weight_grad = norm_master.weight.grad
+        weight_grad = torch.chunk(weight_grad, DEPTH)[j]
+        weight_grad = torch.chunk(weight_grad, DEPTH)[i]
+        logger.info('Rank {} layernorm backward (weight_grad): {}'.format(
+            rank, check_equal(weight_grad, norm.weight.grad)))
+
+        bias_grad = norm_master.bias.grad
+        bias_grad = torch.chunk(bias_grad, DEPTH)[j]
+        bias_grad = torch.chunk(bias_grad, DEPTH)[i]
+        logger.info('Rank {} layernorm backward (bias_grad): {}'.format(
+            rank, check_equal(bias_grad, norm.bias.grad)))
+    else:
+        logger.info('Rank {} layernorm backward (weight_grad): {}'.format(
+            rank,
+            norm.weight.grad is None))
+        logger.info('Rank {} layernorm backward (bias_grad): {}'.format(
+            rank,
+            norm.bias.grad is None))
+
+    return fwd_end - fwd_start, bwd_end - bwd_start
+
+
+def check_attention():
+    """Time the forward and backward pass of ViTSelfAttention3D.
+
+    The input is broadcast from rank 0 and chunked by the local ranks ``i``
+    (weight group), ``k`` (output group) and ``j`` (input group). No
+    reference values are computed and nothing is asserted; the function only
+    checks that both passes run and reports their duration.
+
+    Returns:
+        tuple: ``(forward_seconds, backward_seconds)`` measured with
+        ``time.time()`` around the layer's forward and backward.
+    """
+    device = get_current_device()
+    logger = get_global_dist_logger()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+    NUM_ATTENTION_HEADS = 2
+
+    j = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+
+    layer = LAYERS.get_module('ViTSelfAttention3D')(HIDDEN_SIZE,
+                                                    NUM_ATTENTION_HEADS,
+                                                    0.,
+                                                    0.1,
+                                                    dtype=dtype,
+                                                    bias=True)
+    layer = layer.to(device)
+
+    # Input: broadcast from rank 0, then chunked on dim 0, last dim, dim 0.
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    # The layer is called with the input only, so no attention mask is built.
+    fwd_start = time.time()
+    out = layer(A)
+    fwd_end = time.time()
+    print_rank_0(
+        'self attention forward: pass | {0} --> {1} | {2:.3f} s'.format(
+            tuple(A.shape), tuple(out.shape), fwd_end - fwd_start), logger)
+
+    # Random local upstream gradient; it is not broadcast between ranks.
+    grad_shape = out.shape
+    grad = torch.randn(grad_shape, dtype=dtype, device=device)
+
+    bwd_start = time.time()
+    out.backward(grad)
+    bwd_end = time.time()
+    print_rank_0(
+        'self attention backward: pass | {:.3f} s'.format(bwd_end - bwd_start),
+        logger)
+
+    return fwd_end - fwd_start, bwd_end - bwd_start
+
+
+def check_mlp():
+    rank = torch.distributed.get_rank()
+    device = get_current_device()
+    logger = get_global_dist_logger()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+
+    j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = B_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = C_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+
+    layer = LAYERS.get_module('ViTMLP3D')(HIDDEN_SIZE,
+                                          1,
+                                          0.1,
+                                          'gelu',
+                                          dtype=dtype,
+                                          bias=True)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    fwd_start = time.time()
+    out = layer(A)
+    fwd_end = time.time()
+    print_rank_0(
+        'mlp forward: pass | {0} --> {1} | {2:.3f} s'.format(
+            tuple(A.shape), tuple(out.shape), fwd_end - fwd_start), logger)
+
+    grad_shape = out.shape
+    grad = torch.randn(grad_shape, dtype=dtype, device=device)
+
+    bwd_start = time.time()
+    out.backward(grad)
+    bwd_end = time.time()
+    print_rank_0('mlp backward: pass | {:.3f} s'.format(bwd_end - bwd_start),
+                 logger)
+
+    return fwd_end - fwd_start, bwd_end - bwd_start
+
+
+class Testvithead(torch.nn.Module):
+    def __init__(self, in_features, out_features, bias=True):
+        super().__init__()
+        self.linear = torch.nn.Linear(in_features, out_features, bias=bias)
+
+    def forward(self, x):
+        x = x[:, 0]
+        x = self.linear(x)
+        return x
+
+
+def check_head():
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    device = get_current_device()
+    dtype = torch.float32
+    INPUT_SIZE = HIDDEN_SIZE
+
+    j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = B_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = C_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+
+    head = LAYERS.get_module('ViTHead3D')(INPUT_SIZE,
+                                          NUM_CLASSES,
+                                          dtype=dtype,
+                                          bias=True)
+    torch.nn.init.zeros_(head.linear.bias)
+    torch.nn.init.ones_(head.linear.weight)
+    head = head.to(device)
+
+    layer = Testvithead(INPUT_SIZE, NUM_CLASSES, bias=True)
+    torch.nn.init.zeros_(layer.linear.bias)
+    torch.nn.init.ones_(layer.linear.weight)
+    layer = layer.to(device)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    fwd_start = time.time()
+    out = head(A)
+    fwd_end = time.time()
+    print_rank_0(
+        'head forward: pass | {0} --> {1} | {2:.3f} s'.format(
+            tuple(A.shape), tuple(out.shape), fwd_end - fwd_start), logger)
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    C_master = layer(A_master)
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[j]
+    C = torch.chunk(C, DEPTH, dim=0)[k]
+    logger.info('Rank {} head forward: {}'.format(rank, check_equal(out, C)))
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape,
+                              dtype=dtype,
+                              device=get_current_device())
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[j]
+    grad = torch.chunk(grad, DEPTH, dim=0)[k]
+
+    bwd_start = time.time()
+    out.backward(grad)
+    bwd_end = time.time()
+    print_rank_0('head backward: pass | {:.3f} s'.format(bwd_end - bwd_start),
+                 logger)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[k]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[j]
+    # if j == 0:
+    logger.info('Rank {} head backward (input_grad): {}'.format(
+        rank, check_equal(A_grad, A.grad)))
+    # else:
+    #     logger.info('Rank {} head backward (input_grad): {}'.format(
+    #         # rank, check_equal(A_grad, A.grad)))
+    #         rank,
+    #         A.grad is None))
+
+    B_grad = layer.linear.weight.grad.transpose(0, 1)
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[k]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[j]
+    pad_shape = (B_grad.shape[0], math.ceil(B_grad.shape[-1] / DEPTH) * DEPTH -
+                 B_grad.shape[-1])
+    B_grad = torch.cat(
+        [B_grad, torch.zeros(pad_shape, dtype=dtype, device=device)], dim=-1)
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[i]
+    logger.info('Rank {} head backward (weight_grad): {}'.format(
+        rank, check_equal(B_grad, head.linear.weight.grad)))
+
+    if j == k:
+        bias_grad = layer.linear.bias.grad
+        bias_grad = torch.chunk(bias_grad, DEPTH)[j]
+        pad_shape = (math.ceil(bias_grad.shape[0] / DEPTH) * DEPTH -
+                     bias_grad.shape[0], )
+        bias_grad = torch.cat(
+            [bias_grad,
+             torch.zeros(pad_shape, dtype=dtype, device=device)])
+        bias_grad = torch.chunk(bias_grad, DEPTH)[i]
+        logger.info('Rank {} head backward (bias_grad): {}'.format(
+            rank, check_equal(bias_grad, head.linear.bias.grad)))
+    else:
+        logger.info('Rank {} head backward (bias_grad): {}'.format(
+            rank,
+            # np.count_nonzero(
+            #     head.linear.bias.grad.detach().cpu().numpy()) == 0))
+            head.linear.bias.grad is None))
+
+    return fwd_end - fwd_start, bwd_end - bwd_start
+
+
+class Testvitembed(torch.nn.Module):
+    def __init__(self, img_size: int, patch_size: int, in_chans: int,
+                 embed_size: int, drop_prob: float) -> None:
+        super().__init__()
+        self.proj = torch.nn.Conv2d(in_chans,
+                                    embed_size,
+                                    kernel_size=patch_size,
+                                    stride=patch_size)
+        num_patches = (img_size // patch_size)**2
+        self.cls_token = torch.nn.Parameter(torch.zeros(1, 1, embed_size))
+        self.pos_embed = torch.nn.Parameter(
+            torch.zeros(1, num_patches + 1, embed_size))
+        self.pos_drop = torch.nn.Dropout(drop_prob)
+
+    def forward(self, x):
+        x = self.proj(x)
+        x = x.flatten(2).transpose(1, 2)
+        cls_token = self.cls_token.expand(x.shape[0], -1, -1)
+        x = torch.cat((cls_token, x), dim=1)
+        x = self.pos_drop(x + self.pos_embed)
+        return x
+
+
+def check_embed():
+    rank = torch.distributed.get_rank()
+    device = get_current_device()
+    logger = get_global_dist_logger()
+    dtype = torch.float32
+
+    j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = B_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = C_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+
+    layer = LAYERS.get_module('ViTPatchEmbedding3D')(IMG_SIZE, 4, 3,
+                                                     HIDDEN_SIZE, 0.)
+    torch.nn.init.zeros_(layer.proj.bias)
+    torch.nn.init.ones_(layer.proj.weight)
+    torch.nn.init.ones_(layer.cls_token)
+    torch.nn.init.ones_(layer.pos_embed)
+    layer = layer.to(device)
+
+    layer_master = Testvitembed(IMG_SIZE, 4, 3, HIDDEN_SIZE, 0.)
+    torch.nn.init.zeros_(layer_master.proj.bias)
+    torch.nn.init.ones_(layer_master.proj.weight)
+    torch.nn.init.ones_(layer_master.cls_token)
+    torch.nn.init.ones_(layer_master.pos_embed)
+    layer_master = layer_master.to(device)
+
+    A_shape = (BATCH_SIZE, 3, IMG_SIZE, IMG_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = A_master.clone()
+    A.requires_grad = True
+
+    fwd_start = time.time()
+    out = layer(A)
+    fwd_end = time.time()
+    print_rank_0(
+        'embedding forward: pass | {0} --> {1} | {2:.3f} s'.format(
+            tuple(A.shape), tuple(out.shape), fwd_end - fwd_start), logger)
+    # out_cls = out[:, 0]
+    # out_tensor = out[:, 1:]
+
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    C_master = layer_master(A_master)
+    # if j == 0:
+    #     C_cls = C_master[:, 0]
+    #     C_cls = torch.chunk(C_cls, DEPTH, dim=0)[i]
+    #     C_cls = torch.chunk(C_cls, DEPTH, dim=-1)[k]
+    #     logger.info('Rank {} embed forward (cls): {}'.format(
+    #         rank, check_equal(out_cls, C_cls)))
+    # C = C_master[:, 1:]
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[k]
+    C = torch.chunk(C, DEPTH, dim=0)[j]
+    logger.info('Rank {} embed forward: {}'.format(rank, check_equal(out, C)))
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape,
+                              dtype=dtype,
+                              device=get_current_device())
+    torch.distributed.broadcast(grad_master, src=0)
+    # cls_grad = grad_master[:, 0]
+    # cls_grad = torch.chunk(cls_grad, DEPTH, dim=0)[i]
+    # cls_grad = torch.chunk(cls_grad, DEPTH, dim=-1)[k]
+    # grad = grad_master[:, 1:]
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[k]
+    grad = torch.chunk(grad, DEPTH, dim=0)[j]
+    # grad = torch.cat((torch.unsqueeze(cls_grad, 1), grad), dim=1)
+    bwd_start = time.time()
+    out.backward(grad)
+    bwd_end = time.time()
+    print_rank_0(
+        'embedding backward: pass | {:.3f} s'.format(bwd_end - bwd_start),
+        logger)
+
+    C_master.backward(grad_master)
+    # A_grad = A_master.grad
+    # logger.info('Rank {} embed backward (input_grad): {}'.format(
+    #     rank, check_equal(A_grad, A.grad)))
+    # time.sleep(0.1 * rank)
+    # logger.info(
+    #     'Rank {0} master:\n{1}\nRank {0} out:\n{2}\nRank {0} true:\n{3}\n'.
+    #     format(rank,
+    #            A_master.grad.detach().cpu().numpy().tolist(),
+    #            A.grad.detach().cpu().numpy().tolist(),
+    #            A_grad.detach().cpu().numpy().tolist()), ranks=[0])
+
+    cls_grad_master = layer_master.cls_token.grad
+    cls_grad = torch.chunk(cls_grad_master, DEPTH, dim=-1)[k]
+    # if j == 0:
+    logger.info('Rank {} embed backward (cls_grad): {}'.format(
+        rank, check_equal(cls_grad, layer.cls_token.grad)))
+    # else:.
+    #     logger.info('Rank {} embed backward (cls_grad): {}'.format(
+    #         rank,
+    #         layer.cls_token.grad is None or np.count_nonzero(
+    #             layer.cls_token.grad.detach().cpu().numpy()) == 0))
+
+    pos_grad_master = layer_master.pos_embed.grad
+    pos_grad = torch.chunk(pos_grad_master, DEPTH, dim=-1)[k]
+    logger.info('Rank {} embed backward (pos_embed_grad): {}'.format(
+        rank, check_equal(pos_grad, layer.pos_embed.grad)))
+    # if i == 0:
+    #     pos_cls_grad = pos_grad[:, 0]
+    #     pos_tensor_grad = pos_grad[:, 1:]
+    #     pos_tensor_grad = torch.chunk(pos_tensor_grad, DEPTH, dim=1)[j]
+    #     if j == 0:
+    #         logger.info('Rank {} embed backward (pos_embed_grad): {}'.format(
+    #             rank,
+    #             check_equal(
+    #                 torch.cat(
+    #                     (torch.unsqueeze(pos_cls_grad, 1), pos_tensor_grad),
+    #                     dim=1), layer.pos_embed.grad)))
+    #     else:
+    #         logger.info('Rank {} embed backward (pos_embed_grad): {}'.format(
+    #             rank, check_equal(pos_tensor_grad, layer.pos_embed.grad[:,
+    #                                                                     1:])))
+    # else:
+    #     logger.info('Rank {} embed backward (pos_embed_grad): {}'.format(
+    #         rank, layer.pos_embed.grad is None))
+
+    B_grad = layer_master.proj.weight.grad
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[k]
+    logger.info('Rank {} embed backward (proj_weight_grad): {}'.format(
+        rank, check_equal(B_grad, layer.proj.weight.grad)))
+
+    bias_grad = layer_master.proj.bias.grad
+    bias_grad = torch.chunk(bias_grad, DEPTH)[k]
+    logger.info('Rank {} embed backward (proj_bias_grad): {}'.format(
+        rank, check_equal(bias_grad, layer.proj.bias.grad)))
+
+    return fwd_end - fwd_start, bwd_end - bwd_start
+
+
+def check_loss():
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    device = get_current_device()
+    dtype = torch.float32
+
+    j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = B_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = C_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+
+    criterion = LOSSES.get_module('CrossEntropyLoss3D')(
+        ParallelMode.PARALLEL_3D_INPUT, ParallelMode.PARALLEL_3D_WEIGHT)
+    criterion_master = torch.nn.CrossEntropyLoss()
+
+    out_shape = (BATCH_SIZE, NUM_CLASSES)
+    out_master = torch.randn(out_shape, dtype=dtype, device=device)
+    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ),
+                                  dtype=torch.long,
+                                  device=device)
+    torch.distributed.broadcast(out_master, src=0)
+    torch.distributed.broadcast(target_master, src=0)
+    out = torch.chunk(out_master, DEPTH, dim=0)[i]
+    out = torch.chunk(out, DEPTH, dim=-1)[k]
+    out = torch.chunk(out, DEPTH, dim=0)[j]
+    out = out.clone()
+    out.requires_grad = True
+
+    fwd_start = time.time()
+    loss = criterion(out, target_master)
+    fwd_end = time.time()
+    print_rank_0(
+        'loss forward: pass | {0} --> {1} | {2:.3f} s'.format(
+            tuple(out.shape), tuple(loss.shape), fwd_end - fwd_start), logger)
+
+    out_master = out_master.clone()
+    out_master.requires_grad = True
+    loss_master = criterion_master(out_master, target_master)
+    logger.info('Rank {} CrossEntropyLoss forward: {}'.format(
+        rank, check_equal(loss, loss_master)))
+
+    bwd_start = time.time()
+    loss.backward()
+    bwd_end = time.time()
+    print_rank_0('loss backward: pass | {:.3f} s'.format(bwd_end - bwd_start),
+                 logger)
+
+    loss_master.backward()
+    out_grad = out_master.grad
+    out_grad = torch.chunk(out_grad, DEPTH, dim=0)[i]
+    out_grad = torch.chunk(out_grad, DEPTH, dim=-1)[k]
+    out_grad = torch.chunk(out_grad, DEPTH, dim=0)[j]
+    logger.info('Rank {} CrossEntropyLoss backward: {}'.format(
+        rank, check_equal(out_grad, out.grad)))
+
+    return fwd_end - fwd_start, bwd_end - bwd_start
--- a/tests/test_layers/test_3d/test_operation.py
+++ b/tests/test_layers/test_3d/test_operation.py
@@ -0,0 +1,465 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from colossalai.context import ParallelMode
+from colossalai.core import global_context
+from colossalai.logging import get_global_dist_logger
+from colossalai.nn.layer.parallel_3d._operation import *
+from colossalai.utils import get_current_device
+
+from common import *
+
+
+def check_AB():
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    dtype = torch.float
+    j = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=get_current_device())
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    B_shape = (HIDDEN_SIZE, 4 * HIDDEN_SIZE)
+    B_master = torch.randn(B_shape, dtype=dtype, device=get_current_device())
+    torch.distributed.broadcast(B_master, src=0)
+    B = torch.chunk(B_master, DEPTH, dim=0)[k]
+    B = torch.chunk(B, DEPTH, dim=-1)[j]
+    B = torch.chunk(B, DEPTH, dim=-1)[i]
+    B = B.clone()
+    B.requires_grad = True
+
+    out = Matmul_AB_3D.apply(A, B, DEPTH, ParallelMode.PARALLEL_3D_INPUT,
+                             ParallelMode.PARALLEL_3D_WEIGHT,
+                             ParallelMode.PARALLEL_3D_OUTPUT)
+
+    C_shape = (BATCH_SIZE, SEQ_LENGTH, 4 * HIDDEN_SIZE)
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    C_master = torch.matmul(A_master, B_master)
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[j]
+    C = torch.chunk(C, DEPTH, dim=0)[k]
+    # check forward correctness
+    logger.info('Rank {} AB forward: {}'.format(rank, check_equal(out, C)))
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape,
+                              dtype=dtype,
+                              device=get_current_device())
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[j]
+    grad = torch.chunk(grad, DEPTH, dim=0)[k]
+
+    out.backward(grad)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[k]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[j]
+    # check backward correctness
+    logger.info('Rank {} AB backward (A_grad): {}'.format(
+        rank, check_equal(A_grad, A.grad)))
+
+    B_grad = B_master.grad
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[k]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[j]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[i]
+    # check backward correctness
+    logger.info('Rank {} AB backward (B_grad): {}'.format(
+        rank, check_equal(B_grad, B.grad)))
+
+
+def check_ABT():
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    dtype = torch.float
+
+    j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = B_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = C_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+    device = get_current_device()
+
+    C_shape = (BATCH_SIZE, SEQ_LENGTH, 4 * HIDDEN_SIZE)
+    C_master = torch.randn(C_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(C_master, src=0)
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[j]
+    C = torch.chunk(C, DEPTH, dim=0)[k]
+    C = C.clone()
+    C.requires_grad = True
+
+    B_shape = (HIDDEN_SIZE, 4 * HIDDEN_SIZE)
+    B_master = torch.randn(B_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(B_master, src=0)
+    B = torch.chunk(B_master, DEPTH, dim=0)[k]
+    B = torch.chunk(B, DEPTH, dim=-1)[j]
+    B = torch.chunk(B, DEPTH, dim=-1)[i]
+    B = B.clone()
+    B.requires_grad = True
+
+    out = Matmul_ABT_3D.apply(C, B, DEPTH, ParallelMode.PARALLEL_3D_OUTPUT,
+                              ParallelMode.PARALLEL_3D_WEIGHT,
+                              ParallelMode.PARALLEL_3D_INPUT)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    C_master = C_master.clone()
+    C_master.requires_grad = True
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    A_master = torch.matmul(C_master, B_master.transpose(0, 1))
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    logger.info('Rank {} ABT forward: {}'.format(rank, check_equal(out, A)))
+
+    grad_shape = A_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[k]
+    grad = torch.chunk(grad, DEPTH, dim=0)[j]
+
+    # backward
+    out.backward(grad)
+
+    A_master.backward(grad_master)
+    C_grad = C_master.grad
+    C_grad = torch.chunk(C_grad, DEPTH, dim=0)[i]
+    C_grad = torch.chunk(C_grad, DEPTH, dim=-1)[j]
+    C_grad = torch.chunk(C_grad, DEPTH, dim=0)[k]
+    logger.info('Rank {} ABT backward (A_grad): {}'.format(
+        rank, check_equal(C_grad, C.grad)))
+
+    B_grad = B_master.grad
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[k]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[j]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[i]
+    logger.info('Rank {} ABT backward (B_grad): {}'.format(
+        rank, check_equal(B_grad, B.grad)))
+
+
+def check_ATB():
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    device = get_current_device()
+    dtype = torch.float
+
+    j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = B_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = C_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    C_shape = (BATCH_SIZE, SEQ_LENGTH, 4 * HIDDEN_SIZE)
+    C_master = torch.randn(C_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(C_master, src=0)
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[j]
+    C = torch.chunk(C, DEPTH, dim=0)[k]
+    C = C.clone()
+    C.requires_grad = True
+
+    out = Matmul_ATB_3D.apply(A, C, DEPTH, ParallelMode.PARALLEL_3D_INPUT,
+                              ParallelMode.PARALLEL_3D_OUTPUT,
+                              ParallelMode.PARALLEL_3D_WEIGHT)
+
+    B_shape = (HIDDEN_SIZE, 4 * HIDDEN_SIZE)
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    C_master = C_master.clone()
+    C_master.requires_grad = True
+    B_master = torch.matmul(
+        A_master.view(-1, A_master.shape[-1]).transpose(0, 1),
+        C_master.view(-1, C_master.shape[-1]))
+    B = torch.chunk(B_master, DEPTH, dim=0)[k]
+    B = torch.chunk(B, DEPTH, dim=-1)[j]
+    B = torch.chunk(B, DEPTH, dim=-1)[i]
+    logger.info('Rank {} ATB forward: {}'.format(rank, check_equal(out, B)))
+
+    grad_shape = B_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[k]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[j]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[i]
+
+    out.backward(grad)
+
+    B_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[k]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[j]
+    logger.info('Rank {} ATB backward (A_grad): {}'.format(
+        rank, check_equal(A_grad, A.grad)))
+
+    C_grad = C_master.grad
+    C_grad = torch.chunk(C_grad, DEPTH, dim=0)[i]
+    C_grad = torch.chunk(C_grad, DEPTH, dim=-1)[j]
+    C_grad = torch.chunk(C_grad, DEPTH, dim=0)[k]
+    logger.info('Rank {} ATB backward (B_grad): {}'.format(
+        rank, check_equal(C_grad, C.grad)))
+
+
+def check_add():
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    dtype = torch.float
+
+    j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = B_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = C_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+    device = get_current_device()
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=get_current_device())
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    bias_shape = (HIDDEN_SIZE, )
+    bias_master = torch.randn(bias_shape,
+                              dtype=dtype,
+                              device=get_current_device())
+    torch.distributed.broadcast(bias_master, src=0)
+    bias = torch.chunk(bias_master, DEPTH)[j]
+    bias = torch.chunk(bias, DEPTH)[i]
+    bias = bias.clone()
+    bias.requires_grad = True
+
+    out = Add_3D.apply(A, bias, DEPTH, ParallelMode.PARALLEL_3D_INPUT,
+                       ParallelMode.PARALLEL_3D_WEIGHT,
+                       ParallelMode.PARALLEL_3D_OUTPUT)
+
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    bias_master = bias_master.clone()
+    bias_master.requires_grad = True
+    C_master = A_master + bias_master
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[k]
+    C = torch.chunk(C, DEPTH, dim=0)[j]
+
+    logger.info('Rank {} Add forward: {}'.format(rank, check_equal(out, C)))
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[k]
+    grad = torch.chunk(grad, DEPTH, dim=0)[j]
+
+    out.backward(grad)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[k]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[j]
+    logger.info('Rank {} Add backward (A_grad): {}'.format(
+        rank, check_equal(A_grad, A.grad)))
+
+    if j == k:
+        bias_grad = bias_master.grad
+        bias_grad = torch.chunk(bias_grad, DEPTH)[j]
+        bias_grad = torch.chunk(bias_grad, DEPTH)[i]
+        logger.info('Rank {} Add backward (b_grad): {}'.format(
+            rank, check_equal(bias_grad, bias.grad)))
+    else:
+        logger.info('Rank {} Add backward (b_grad): {}'.format(
+            rank,
+            # np.count_nonzero(bias.grad.detach().cpu().numpy()) == 0))
+            bias.grad is None))
+
+
+def check_mul():
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    dtype = torch.float
+
+    j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = B_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = C_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+    device = get_current_device()
+
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=get_current_device())
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    bias_shape = (HIDDEN_SIZE, )
+    bias_master = torch.randn(bias_shape,
+                              dtype=dtype,
+                              device=get_current_device())
+    torch.distributed.broadcast(bias_master, src=0)
+    bias = torch.chunk(bias_master, DEPTH)[j]
+    bias = torch.chunk(bias, DEPTH)[i]
+    bias = bias.clone()
+    bias.requires_grad = True
+
+    out = Mul_3D.apply(A, bias, DEPTH, ParallelMode.PARALLEL_3D_INPUT,
+                       ParallelMode.PARALLEL_3D_WEIGHT,
+                       ParallelMode.PARALLEL_3D_OUTPUT)
+
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    bias_master = bias_master.clone()
+    bias_master.requires_grad = True
+    C_master = torch.mul(A_master, bias_master)
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=-1)[k]
+    C = torch.chunk(C, DEPTH, dim=0)[j]
+
+    logger.info('Rank {} Mul forward: {}'.format(rank, check_equal(out, C)))
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=-1)[k]
+    grad = torch.chunk(grad, DEPTH, dim=0)[j]
+
+    out.backward(grad)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[k]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[j]
+    logger.info('Rank {} Mul backward (A_grad): {}'.format(
+        rank, check_equal(A_grad, A.grad)))
+
+    if j == k:
+        bias_grad = bias_master.grad
+        bias_grad = torch.chunk(bias_grad, DEPTH)[j]
+        bias_grad = torch.chunk(bias_grad, DEPTH)[i]
+        logger.info('Rank {} Mul backward (b_grad): {}'.format(
+            rank, check_equal(bias_grad, bias.grad)))
+    else:
+        logger.info('Rank {} Mul backward (b_grad): {}'.format(
+            rank,
+            # np.count_nonzero(bias.grad.detach().cpu().numpy()) == 0))
+            bias.grad is None))
+
+
+def check_sum():
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    dtype = torch.float
+
+    j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = B_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = C_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+    device = get_current_device()
+
+    # tensor
+    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    A_master = torch.randn(A_shape, dtype=dtype, device=get_current_device())
+    torch.distributed.broadcast(A_master, src=0)
+    A = torch.chunk(A_master, DEPTH, dim=0)[i]
+    A = torch.chunk(A, DEPTH, dim=-1)[k]
+    A = torch.chunk(A, DEPTH, dim=0)[j]
+    A = A.clone()
+    A.requires_grad = True
+
+    out_tensor = Sum_3D.apply(A, -1, DEPTH, ParallelMode.PARALLEL_3D_OUTPUT)
+
+    A_master = A_master.clone()
+    A_master.requires_grad = True
+    C_master = torch.sum(A_master, dim=-1)
+    C = torch.chunk(C_master, DEPTH, dim=0)[i]
+    C = torch.chunk(C, DEPTH, dim=0)[j]
+    logger.info('Rank {} Sum forward: {}'.format(rank,
+                                                 check_equal(out_tensor, C)))
+
+    grad_shape = C_master.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+    grad = torch.chunk(grad_master, DEPTH, dim=0)[i]
+    grad = torch.chunk(grad, DEPTH, dim=0)[j]
+
+    out_tensor.backward(grad / DEPTH)
+
+    C_master.backward(grad_master)
+    A_grad = A_master.grad
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[k]
+    A_grad = torch.chunk(A_grad, DEPTH, dim=0)[j]
+    logger.info('Rank {} Sum backward: {}'.format(rank,
+                                                  check_equal(A_grad, A.grad)))
+
+
+def check_reduce():
+    rank = torch.distributed.get_rank()
+    logger = get_global_dist_logger()
+    dtype = torch.float
+
+    j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)
+    i = B_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)
+    k = C_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)
+    device = get_current_device()
+
+    # scaler
+    B_shape = (DEPTH * DEPTH, DEPTH)
+    B_master = torch.randn(B_shape, dtype=dtype, device=get_current_device())
+    torch.distributed.broadcast(B_master, src=0)
+    B = torch.chunk(B_master, DEPTH, dim=0)[i]
+    B = torch.chunk(B, DEPTH, dim=-1)[k]
+    B = torch.chunk(B, DEPTH, dim=0)[j]
+    B = torch.squeeze(B)
+    B = B.clone()
+    B.requires_grad = True
+
+    out_scaler = Reduce_3D.apply(B, 0, DEPTH, ParallelMode.PARALLEL_3D_OUTPUT)
+    out_scaler = Reduce_3D.apply(out_scaler, 0, DEPTH,
+                                 ParallelMode.PARALLEL_3D_INPUT)
+    out_scaler = Reduce_3D.apply(out_scaler, 0, DEPTH,
+                                 ParallelMode.PARALLEL_3D_WEIGHT)
+
+    B_master = B_master.clone()
+    B_master.requires_grad = True
+    D = torch.sum(B_master)
+    logger.info('Rank {} Reduce forward: {}'.format(rank,
+                                                    check_equal(out_scaler,
+                                                                D)))
+
+    grad_shape = D.shape
+    grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
+    torch.distributed.broadcast(grad_master, src=0)
+
+    out_scaler.backward(grad_master)
+
+    D.backward(grad_master)
+    B_grad = B_master.grad
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[i]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[k]
+    B_grad = torch.chunk(B_grad, DEPTH, dim=0)[j]
+    B_grad = torch.squeeze(B_grad)
+    logger.info('Rank {} Reduce backward: {}'.format(
+        rank, check_equal(B_grad, B.grad)))
--- a/tests/test_layers/test_sequence/test_layer.py
+++ b/tests/test_layers/test_sequence/test_layer.py
@@ -0,0 +1,26 @@
+import torch
+
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.nn import TransformerSelfAttentionRing
+from colossalai.utils import get_current_device
+
+
+def check_selfattention():
+    WORLD_SIZE = gpc.get_world_size(ParallelMode.SEQUENCE)
+    SUB_SEQ_LENGTH = 8
+    BATCH = 4
+    HIDDEN_SIZE = 16
+
+    layer = TransformerSelfAttentionRing(
+        16,
+        8,
+        8,
+        0.1
+    )
+    layer = layer.to(get_current_device())
+
+    hidden_states = torch.rand(SUB_SEQ_LENGTH, BATCH, HIDDEN_SIZE).to(get_current_device())
+    attention_mask = torch.randint(low=0, high=2, size=(BATCH, 1, 1, 1, SUB_SEQ_LENGTH * WORLD_SIZE)).to(
+        get_current_device())
+    out = layer(hidden_states, attention_mask)
--- a/tests/test_layers/test_sequence/test_sequence.py
+++ b/tests/test_layers/test_sequence/test_sequence.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from colossalai.initialize import init_dist
+from colossalai.logging import get_global_dist_logger
+from test_layer import *
+
+CONFIG = dict(
+    parallel=dict(
+        pipeline=1,
+        tensor=dict(mode='sequence', size=4)
+    )
+)
+
+
+def check_layer():
+    """Run the layer checks of this suite (currently only self-attention)."""
+    check_selfattention()
+
+
+def _test_main():
+    # init dist
+    init_dist(CONFIG)
+    logger = get_global_dist_logger()
+    logger.info('Distributed environment is initialzied.', ranks=[0])
+
+    gpc.set_seed()
+    torch.backends.cudnn.benchmark = True
+
+    # check layers
+    check_layer()
+
+
+# Script entry point: run the checks when this file is executed directly.
+if __name__ == '__main__':
+    _test_main()