Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-15 14:12:02 +00:00
[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
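Most of the hunks below are mechanical style changes: single-quoted strings become double-quoted, long calls are exploded to one argument per line with a trailing comma, redundant parentheses are dropped, and assignments to variables that are never read are reduced to bare expressions (which is why lines such as (BATCH_SIZE, SEQ_LENGTH, 4 * HIDDEN_SIZE) appear on their own in the new code). The commit title indicates the hooks were run over the whole tree, for example with "pre-commit run --all-files". A minimal sketch of reproducing the quote normalization locally, assuming the formatter hook is black (the updated hook list itself is not part of this excerpt):

import black  # assumption: the formatter behind these changes is black

src = "print_rank_0('linear forward: pass')\n"
print(black.format_str(src, mode=black.FileMode()))
# -> print_rank_0("linear forward: pass")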
@@ -48,7 +48,7 @@ def check_linear():
     W = W.clone()
     W.requires_grad = True
 
-    B_shape = (OUTPUT_SIZE)
+    B_shape = OUTPUT_SIZE
     B_master = torch.randn(B_shape, dtype=dtype, device=device)
     torch.distributed.broadcast(B_master, src=0)
     B = torch.chunk(B_master, DEPTH, dim=-1)[j]
@@ -71,7 +71,7 @@ def check_linear():
     C = torch.chunk(C, DEPTH, dim=-1)[j]
 
     check_equal(out, C)
-    print_rank_0('linear forward: pass')
+    print_rank_0("linear forward: pass")
 
     grad_shape = C_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=get_current_device())
@@ -99,7 +99,7 @@ def check_linear():
     # if i == 0:
     check_equal(B_grad, layer.bias.grad)
 
-    print_rank_0('linear backward: pass')
+    print_rank_0("linear backward: pass")
 
 
 def check_layernorm():
@@ -136,7 +136,7 @@ def check_layernorm():
     C = torch.chunk(C, DEPTH, dim=-1)[j]
 
     check_equal(out, C)
-    print_rank_0('layer norm forward: pass')
+    print_rank_0("layer norm forward: pass")
 
     grad_shape = C_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=get_current_device())
@@ -150,7 +150,7 @@ def check_layernorm():
     A_grad = torch.chunk(A_grad, DEPTH, dim=0)[i]
     A_grad = torch.chunk(A_grad, DEPTH, dim=-1)[j]
     check_equal(A_grad, A.grad)
-    print_rank_0('layer norm backward: pass')
+    print_rank_0("layer norm backward: pass")
 
 
 def check_embed():
@@ -181,7 +181,7 @@ def check_embed():
     C = torch.chunk(C_master, DEPTH, dim=0)[i]
     C = torch.chunk(C, DEPTH, dim=-1)[j]
     check_equal(out, C)
-    print_rank_0('embed forward: pass')
+    print_rank_0("embed forward: pass")
 
     grad_shape = C_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
@@ -197,7 +197,7 @@ def check_embed():
     B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[j]
     B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[i]
     check_equal(B_grad, embed.weight.grad)
-    print_rank_0('embed backward: pass')
+    print_rank_0("embed backward: pass")
 
 
 def check_patch_embed():
@@ -238,7 +238,7 @@ def check_patch_embed():
     C = torch.chunk(C_master, DEPTH, dim=0)[i]
     C = torch.chunk(C, DEPTH, dim=-1)[j]
     check_equal(out, C)
-    print_rank_0('patch embed forward: pass')
+    print_rank_0("patch embed forward: pass")
 
     grad_shape = C_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
@@ -270,7 +270,7 @@ def check_patch_embed():
     bias_grad = torch.chunk(bias_grad, DEPTH)[j]
     bias_grad = torch.chunk(bias_grad, DEPTH)[i]
     check_equal(bias_grad, layer.bias.grad)
-    print_rank_0('patch embed backward: pass')
+    print_rank_0("patch embed backward: pass")
 
 
 def check_vocab_parallel_embed():
@@ -301,7 +301,7 @@ def check_vocab_parallel_embed():
     C = torch.chunk(C_master, DEPTH, dim=0)[i]
     C = torch.chunk(C, DEPTH, dim=-1)[j]
     check_equal(out, C)
-    print_rank_0('vocab parallel embed forward: pass')
+    print_rank_0("vocab parallel embed forward: pass")
 
     grad_shape = C_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
@@ -317,7 +317,7 @@ def check_vocab_parallel_embed():
     B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[j]
     B_grad = torch.chunk(B_grad, DEPTH, dim=0)[i]
     check_equal(B_grad, embed.weight.grad)
-    print_rank_0('vocab parallel embed backward: pass')
+    print_rank_0("vocab parallel embed backward: pass")
 
 
 def check_classifier_no_given_weight():
@@ -368,7 +368,7 @@ def check_classifier_no_given_weight():
     # C = torch.chunk(C, DEPTH, dim=-1)[j]
 
     check_equal(out, C)
-    print_rank_0('classifier (no given weight) forward: pass')
+    print_rank_0("classifier (no given weight) forward: pass")
 
     grad_shape = C_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=get_current_device())
@@ -395,7 +395,7 @@ def check_classifier_no_given_weight():
     # if i == 0:
     check_equal(B_grad, layer.bias.grad)
 
-    print_rank_0('classifier (no given weight) backward: pass')
+    print_rank_0("classifier (no given weight) backward: pass")
 
 
 def check_vocab_parallel_classifier_no_given_weight():
@@ -437,7 +437,7 @@ def check_vocab_parallel_classifier_no_given_weight():
     C = torch.chunk(C_master, DEPTH, dim=0)[i]
     C = torch.chunk(C, DEPTH, dim=-1)[j]
     check_equal(out, C)
-    print_rank_0('vocab parallel classifier (no given weight) forward: pass')
+    print_rank_0("vocab parallel classifier (no given weight) forward: pass")
 
     grad_shape = C_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
@@ -463,7 +463,7 @@ def check_vocab_parallel_classifier_no_given_weight():
     B_grad = torch.chunk(B_grad, DEPTH)[j]
     B_grad = torch.chunk(B_grad, DEPTH)[i]
     check_equal(B_grad, layer.bias.grad)
-    print_rank_0('vocab parallel classifier (no given weight) backward: pass')
+    print_rank_0("vocab parallel classifier (no given weight) backward: pass")
 
 
 def check_classifier_given_embed_weight():
@@ -499,7 +499,7 @@ def check_classifier_given_embed_weight():
     C_master = layer_master(embed_master(A_master))
     C = torch.chunk(C_master, DEPTH, dim=0)[i]
     check_equal(out, C)
-    print_rank_0('classifier (given embed weight) forward: pass')
+    print_rank_0("classifier (given embed weight) forward: pass")
 
     grad_shape = C_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
@@ -515,7 +515,7 @@ def check_classifier_given_embed_weight():
     W_grad = torch.chunk(W_grad, DEPTH, dim=-1)[j]
     W_grad = torch.chunk(W_grad, DEPTH, dim=-1)[i]
     check_equal(W_grad, embed.weight.grad)
-    print_rank_0('classifier (given embed weight) backward: pass')
+    print_rank_0("classifier (given embed weight) backward: pass")
 
 
 def check_vocab_parallel_classifier_given_embed_weight():
@@ -552,7 +552,7 @@ def check_vocab_parallel_classifier_given_embed_weight():
     C = torch.chunk(C_master, DEPTH, dim=0)[i]
     C = torch.chunk(C, DEPTH, dim=-1)[j]
     check_equal(out, C)
-    print_rank_0('vocab parallel classifier (given embed weight) forward: pass')
+    print_rank_0("vocab parallel classifier (given embed weight) forward: pass")
 
     grad_shape = C_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
@@ -569,14 +569,14 @@ def check_vocab_parallel_classifier_given_embed_weight():
     W_grad = torch.chunk(W_grad, DEPTH, dim=-1)[j]
     W_grad = torch.chunk(W_grad, DEPTH, dim=0)[i]
     check_equal(W_grad, embed.weight.grad)
-    print_rank_0('vocab parallel classifier (given embed weight) backward: pass')
+    print_rank_0("vocab parallel classifier (given embed weight) backward: pass")
 
 
 def check_loss():
     device = get_current_device()
     dtype = torch.float32
 
-    j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)
+    gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)
     i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)
 
     criterion = CrossEntropyLoss2D()
@@ -596,7 +596,7 @@ def check_loss():
     out_master.requires_grad = True
     loss_master = criterion_master(out_master, target_master)
     check_equal(loss, loss_master)
-    print_rank_0('cross entropy loss forward: pass')
+    print_rank_0("cross entropy loss forward: pass")
 
     loss.backward()
     loss_master.backward()
@@ -604,7 +604,7 @@ def check_loss():
     out_grad = out_master.grad
     out_grad = torch.chunk(out_grad, DEPTH, dim=0)[i]
     check_equal(out_grad, out.grad)
-    print_rank_0('cross entropy loss backward: pass')
+    print_rank_0("cross entropy loss backward: pass")
 
 
 def check_vocab_parallel_loss():
@@ -632,7 +632,7 @@ def check_vocab_parallel_loss():
     out_master.requires_grad = True
     loss_master = criterion_master(out_master, target_master)
    check_equal(loss, loss_master)
-    print_rank_0('vocab parallel cross entropy loss forward: pass')
+    print_rank_0("vocab parallel cross entropy loss forward: pass")
 
     loss.backward()
     loss_master.backward()
@@ -641,7 +641,7 @@ def check_vocab_parallel_loss():
     out_grad = torch.chunk(out_grad, DEPTH, dim=0)[i]
     out_grad = torch.chunk(out_grad, DEPTH, dim=-1)[j]
     check_equal(out_grad, out.grad)
-    print_rank_0('vocab parallel cross entropy loss backward: pass')
+    print_rank_0("vocab parallel cross entropy loss backward: pass")
 
 
 # def check_attention():
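The checks above all follow the same shard-verification pattern: the full ("master") output is computed on replicated data, split along dim 0 by the row rank i and along the last dim by the column rank j, and compared against the local result with check_equal. A standalone single-process sketch of that pattern, not part of the diff (DEPTH, the ranks, shapes, and tolerance are illustrative assumptions; the real tests obtain i and j from gpc and produce out with the parallel layer under torch.distributed):

import torch

DEPTH = 2        # side of the 2D process grid
i, j = 1, 0      # row rank and column rank of one device

A_master = torch.randn(4, 8)
W_master = torch.randn(8, 6)
C_master = A_master @ W_master        # reference output computed on the full data

C = torch.chunk(C_master, DEPTH, dim=0)[i]
C = torch.chunk(C, DEPTH, dim=-1)[j]  # the (i, j) block this device should hold

out = C.clone()                       # stand-in for the parallel layer's local output
assert torch.allclose(out, C, rtol=1e-5, atol=1e-5)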
@@ -14,10 +14,12 @@ from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, SEQ_LENGTH, check_equal
 
 def check_AB():
     data_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.DATA) else gpc.get_local_rank(ParallelMode.DATA)
-    pipeline_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(
-        ParallelMode.PIPELINE)
-    pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(
-        ParallelMode.PIPELINE)
+    pipeline_parallel_rank = (
+        0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(ParallelMode.PIPELINE)
+    )
+    pipeline_parallel_size = (
+        1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(ParallelMode.PIPELINE)
+    )
     tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR)
 
     dtype = torch.float
@@ -42,10 +44,22 @@ def check_AB():
 
     out_shape = (BATCH_SIZE // DEPTH, SEQ_LENGTH, 4 * HIDDEN_SIZE // DEPTH)
 
-    out = Matmul_AB_2D.apply(A, B, DEPTH, out_shape, i, j, ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL,
-                             data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size, tensor_parallel_size)
+    out = Matmul_AB_2D.apply(
+        A,
+        B,
+        DEPTH,
+        out_shape,
+        i,
+        j,
+        ParallelMode.PARALLEL_2D_ROW,
+        ParallelMode.PARALLEL_2D_COL,
+        data_parallel_rank,
+        pipeline_parallel_rank,
+        pipeline_parallel_size,
+        tensor_parallel_size,
+    )
 
-    C_shape = (BATCH_SIZE, SEQ_LENGTH, 4 * HIDDEN_SIZE)
+    (BATCH_SIZE, SEQ_LENGTH, 4 * HIDDEN_SIZE)
     A_master = A_master.clone()
     A_master.requires_grad = True
     B_master = B_master.clone()
@@ -55,7 +69,7 @@ def check_AB():
     C = torch.chunk(C, DEPTH, dim=-1)[j]
     # check forward correctness
     check_equal(out, C)
-    print_rank_0('AB forward: pass')
+    print_rank_0("AB forward: pass")
 
     grad_shape = C_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=get_current_device())
@@ -77,15 +91,17 @@ def check_AB():
     B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[j]
     # check backward correctness
     check_equal(B_grad, B.grad)
-    print_rank_0('AB backward: pass')
+    print_rank_0("AB backward: pass")
 
 
 def check_ABT():
     data_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.DATA) else gpc.get_local_rank(ParallelMode.DATA)
-    pipeline_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(
-        ParallelMode.PIPELINE)
-    pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(
-        ParallelMode.PIPELINE)
+    pipeline_parallel_rank = (
+        0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(ParallelMode.PIPELINE)
+    )
+    pipeline_parallel_size = (
+        1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(ParallelMode.PIPELINE)
+    )
     tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR)
 
     dtype = torch.float
@@ -110,11 +126,22 @@ def check_ABT():
     B = B.clone()
     B.requires_grad = True
 
-    out = Matmul_ABT_2D.apply(C, B, DEPTH, (BATCH_SIZE // DEPTH, SEQ_LENGTH, HIDDEN_SIZE // DEPTH), i, j,
-                              ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL, data_parallel_rank,
-                              pipeline_parallel_rank, pipeline_parallel_size, tensor_parallel_size)
+    out = Matmul_ABT_2D.apply(
+        C,
+        B,
+        DEPTH,
+        (BATCH_SIZE // DEPTH, SEQ_LENGTH, HIDDEN_SIZE // DEPTH),
+        i,
+        j,
+        ParallelMode.PARALLEL_2D_ROW,
+        ParallelMode.PARALLEL_2D_COL,
+        data_parallel_rank,
+        pipeline_parallel_rank,
+        pipeline_parallel_size,
+        tensor_parallel_size,
+    )
 
-    A_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
+    (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
     C_master = C_master.clone()
     C_master.requires_grad = True
     B_master = B_master.clone()
@@ -123,7 +150,7 @@ def check_ABT():
     A = torch.chunk(A_master, DEPTH, dim=0)[i]
     A = torch.chunk(A, DEPTH, dim=-1)[j]
     check_equal(out, A)
-    print_rank_0('ABT forward: pass')
+    print_rank_0("ABT forward: pass")
 
     grad_shape = A_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
@@ -144,15 +171,17 @@ def check_ABT():
     B_grad = torch.chunk(B_grad, DEPTH, dim=0)[i]
     B_grad = torch.chunk(B_grad, DEPTH, dim=-1)[j]
     check_equal(B_grad, B.grad)
-    print_rank_0('ABT backward: pass')
+    print_rank_0("ABT backward: pass")
 
 
 def check_ATB():
     data_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.DATA) else gpc.get_local_rank(ParallelMode.DATA)
-    pipeline_parallel_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(
-        ParallelMode.PIPELINE)
-    pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(
-        ParallelMode.PIPELINE)
+    pipeline_parallel_rank = (
+        0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(ParallelMode.PIPELINE)
+    )
+    pipeline_parallel_size = (
+        1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size(ParallelMode.PIPELINE)
+    )
     tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR)
 
     device = get_current_device()
@@ -177,21 +206,33 @@ def check_ATB():
     C = C.clone()
     C.requires_grad = True
 
-    out = Matmul_ATB_2D.apply(A, C, DEPTH, (HIDDEN_SIZE // DEPTH, 4 * HIDDEN_SIZE // DEPTH), i, j,
-                              ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL, data_parallel_rank,
-                              pipeline_parallel_rank, pipeline_parallel_size, tensor_parallel_size)
+    out = Matmul_ATB_2D.apply(
+        A,
+        C,
+        DEPTH,
+        (HIDDEN_SIZE // DEPTH, 4 * HIDDEN_SIZE // DEPTH),
+        i,
+        j,
+        ParallelMode.PARALLEL_2D_ROW,
+        ParallelMode.PARALLEL_2D_COL,
+        data_parallel_rank,
+        pipeline_parallel_rank,
+        pipeline_parallel_size,
+        tensor_parallel_size,
+    )
 
-    B_shape = (HIDDEN_SIZE, 4 * HIDDEN_SIZE)
+    (HIDDEN_SIZE, 4 * HIDDEN_SIZE)
     A_master = A_master.clone()
     A_master.requires_grad = True
     C_master = C_master.clone()
     C_master.requires_grad = True
     B_master = torch.matmul(
-        A_master.view(-1, A_master.shape[-1]).transpose(0, 1), C_master.view(-1, C_master.shape[-1]))
+        A_master.view(-1, A_master.shape[-1]).transpose(0, 1), C_master.view(-1, C_master.shape[-1])
+    )
     B = torch.chunk(B_master, DEPTH, dim=0)[i]
     B = torch.chunk(B, DEPTH, dim=-1)[j]
     check_equal(out, B)
-    print_rank_0('ATB forward: pass')
+    print_rank_0("ATB forward: pass")
 
     grad_shape = B_master.shape
     grad_master = torch.randn(grad_shape, dtype=dtype, device=device)
@@ -211,4 +252,4 @@ def check_ATB():
     C_grad = torch.chunk(C_grad, DEPTH, dim=0)[i]
     C_grad = torch.chunk(C_grad, DEPTH, dim=-1)[j]
     check_equal(C_grad, C.grad)
-    print_rank_0('ATB backward: pass')
+    print_rank_0("ATB backward: pass")
@@ -23,7 +23,9 @@ from colossalai.legacy.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 
-CONFIG = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=4, mode='2d')),)
+CONFIG = dict(
+    parallel=dict(pipeline=dict(size=1), tensor=dict(size=4, mode="2d")),
+)
 
 
 def check_operations():
@@ -48,7 +50,7 @@ def check_layer():
 
 def check_layer_and_operation(rank, world_size, port):
     disable_existing_loggers()
-    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 
     torch.backends.cuda.matmul.allow_tf32 = False
     torch.backends.cudnn.allow_tf32 = False
@@ -65,5 +67,5 @@ def test_2d():
     spawn(check_layer_and_operation, 4)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_2d()
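For context on the test driver above: CONFIG requests tensor=dict(size=4, mode="2d") and the checks are spawned across 4 processes. Under the usual square-grid convention for the 2D mode, that gives a 2 x 2 grid, which is the DEPTH the checks chunk by; treat the exact relationship as an assumption, since the common module defining DEPTH is not shown in this excerpt.

import math

tensor_parallel_size = 4                      # from CONFIG: tensor=dict(size=4, mode="2d")
depth = math.isqrt(tensor_parallel_size)      # side of the square 2D process grid
assert depth * depth == tensor_parallel_size  # a "2d" layout needs a perfect-square size
print(depth)                                  # 2 -> ranks indexed by (row i, col j) in {0, 1}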