Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-01 17:17:05 +00:00
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
@@ -7,12 +7,14 @@ from colossalai.logging import disable_existing_loggers
 from colossalai.shardformer.layer import cross_entropy_1d
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 
-CONFIG = dict(parallel=dict(data=1, pipeline=1, tensor=dict(size=2, mode='1d')),)
+CONFIG = dict(
+    parallel=dict(data=1, pipeline=1, tensor=dict(size=2, mode="1d")),
+)
 
 
 def check_dist_crossentropy(rank, world_size, port, ignore_index):
     disable_existing_loggers()
-    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, port=port, host='localhost', backend='nccl')
+    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, port=port, host="localhost", backend="nccl")
 
     # prepare data
     pred = torch.randn(2, 4, 8, requires_grad=True)
@@ -25,10 +27,11 @@ def check_dist_crossentropy(rank, world_size, port, ignore_index):
     org_loss = F.cross_entropy(org_pred, org_labels)
 
     dist_pred = pred.chunk(world_size, -1)[rank]
-    dist_loss = cross_entropy_1d(dist_pred.to('cuda'), labels.to('cuda'), ignore_index=ignore_index)
+    dist_loss = cross_entropy_1d(dist_pred.to("cuda"), labels.to("cuda"), ignore_index=ignore_index)
 
-    assert torch.allclose(org_loss, dist_loss,
-                          atol=1e-5), f"dist cross entropy loss is not equal to orgin loss\n{org_loss}\n{dist_loss}"
+    assert torch.allclose(
+        org_loss, dist_loss, atol=1e-5
+    ), f"dist cross entropy loss is not equal to orgin loss\n{org_loss}\n{dist_loss}"
 
 
 @pytest.mark.dist
@@ -38,5 +41,5 @@ def test_dist_crossentropy():
     spawn(check_dist_crossentropy, 2, ignore_index=ignore_index)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_dist_crossentropy()
@@ -56,7 +56,7 @@ def check_dropout_replicated_input():
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     check_dropout_parallel_input()
     check_dropout_replicated_input()
 
@@ -66,5 +66,5 @@ def test_dropout():
     spawn(run_dist, nprocs=2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_dropout()
@@ -11,7 +11,7 @@ from colossalai.shardformer.layer import Embedding1D
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 
 
-@parameterize('lazy_init', [False, True])
+@parameterize("lazy_init", [False, True])
 def check_embedding_1d(lazy_init: bool):
     ctx = LazyInitContext() if lazy_init else nullcontext()
 
@@ -43,7 +43,7 @@ def check_embedding_1d(lazy_init: bool):
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     check_embedding_1d()
 
 
@@ -52,5 +52,5 @@ def test_embedding_1d():
     spawn(run_dist, nprocs=2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_embedding_1d()
@@ -58,12 +58,9 @@ def check_linear_conv_1d_col(lazy_init: bool, seq_parallel: bool, overlap: bool)
     linear = Conv1D(192, 48).cuda()
     with ctx:
         linear_copy = Conv1D(192, 48).cuda()
-    linear_conv_col = GPT2FusedLinearConv1D_Col.from_native_module(linear_copy,
-                                                                   process_group=None,
-                                                                   gather_output=True,
-                                                                   seq_parallel=seq_parallel,
-                                                                   n_fused=3,
-                                                                   overlap=overlap)
+    linear_conv_col = GPT2FusedLinearConv1D_Col.from_native_module(
+        linear_copy, process_group=None, gather_output=True, seq_parallel=seq_parallel, n_fused=3, overlap=overlap
+    )
 
     assert linear.weight.shape == torch.Size([48, 192])
     assert linear.bias.shape == torch.Size([192])
@@ -97,10 +94,9 @@ def check_linear_conv_1d_row(lazy_init: bool, seq_parallel: bool):
     linear = Conv1D(192, 48).cuda()
     with ctx:
         linear_copy = Conv1D(192, 48).cuda()
-    linear_row = GPT2FusedLinearConv1D_Row.from_native_module(linear_copy,
-                                                              process_group=None,
-                                                              parallel_input=False,
-                                                              seq_parallel=seq_parallel)
+    linear_row = GPT2FusedLinearConv1D_Row.from_native_module(
+        linear_copy, process_group=None, parallel_input=False, seq_parallel=seq_parallel
+    )
 
     assert linear.weight.shape == torch.Size([48, 192])
     assert linear_row.weight.shape == torch.Size([24, 192])
@@ -128,16 +124,16 @@ def check_linear_conv_1d_row(lazy_init: bool, seq_parallel: bool):
     assert_close(target_grad, linear_row.weight.grad)
 
 
-@parameterize('lazy_init', [False, True])
-@parameterize('seq_parallel', [False, True])
-@parameterize('overlap', [True])
+@parameterize("lazy_init", [False, True])
+@parameterize("seq_parallel", [False, True])
+@parameterize("overlap", [True])
 def check_gpt2_qkv_fused_linear_1d(lazy_init: bool, seq_parallel: bool, overlap: bool):
     check_linear_conv_1d_col(lazy_init, seq_parallel, overlap)
     check_linear_conv_1d_row(lazy_init, seq_parallel)
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 
     # test for linear conv
     check_gpt2_qkv_fused_linear_1d()
@@ -148,5 +144,5 @@ def test_linearconv():
     spawn(run_dist, nprocs=2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_linearconv()
@@ -10,7 +10,7 @@ from colossalai.shardformer.layer import FusedLayerNorm
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 
 
-@parameterize('lazy_init', [False, True])
+@parameterize("lazy_init", [False, True])
 def check_layernorm(lazy_init: bool):
     ctx = LazyInitContext() if lazy_init else nullcontext()
 
@@ -41,7 +41,7 @@ def check_layernorm(lazy_init: bool):
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     check_layernorm()
 
 
@@ -50,5 +50,5 @@ def test_layernorm():
     spawn(run_dist, nprocs=2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_layernorm()
@@ -17,11 +17,9 @@ def check_linear_1d_col(lazy_init: bool, seq_parallel: bool, overlap: bool):
     linear = nn.Linear(32, 128).cuda()
     with ctx:
         linear_copy = nn.Linear(32, 128).cuda()
-    linear_col = Linear1D_Col.from_native_module(linear_copy,
-                                                 process_group=None,
-                                                 gather_output=True,
-                                                 seq_parallel=seq_parallel,
-                                                 overlap=overlap)
+    linear_col = Linear1D_Col.from_native_module(
+        linear_copy, process_group=None, gather_output=True, seq_parallel=seq_parallel, overlap=overlap
+    )
 
     # ensure that the parameters are distributed
     assert is_distributed_tensor(linear_col.weight)
@@ -60,8 +58,11 @@ def check_linear_1d_col(lazy_init: bool, seq_parallel: bool, overlap: bool):
     # check the input gradients
    assert x_for_shard.grad is not None
     assert x_for_unshard.grad is not None
-    target_unshard_gard = x_for_unshard.grad if seq_parallel is False else torch.chunk(
-        x_for_unshard.grad.clone(), 2, dim=1)[dist.get_rank()]
+    target_unshard_gard = (
+        x_for_unshard.grad
+        if seq_parallel is False
+        else torch.chunk(x_for_unshard.grad.clone(), 2, dim=1)[dist.get_rank()]
+    )
     assert_close(target_unshard_gard, x_for_shard.grad)
 
 
@@ -71,10 +72,9 @@ def check_linear_1d_row(lazy_init: bool, seq_parallel: bool):
     linear = nn.Linear(32, 128).cuda()
     with ctx:
         linear_copy = nn.Linear(32, 128).cuda()
-    linear_row = Linear1D_Row.from_native_module(linear_copy,
-                                                 process_group=None,
-                                                 parallel_input=False,
-                                                 seq_parallel=seq_parallel)
+    linear_row = Linear1D_Row.from_native_module(
+        linear_copy, process_group=None, parallel_input=False, seq_parallel=seq_parallel
+    )
 
     assert linear_row.weight.shape == torch.Size([128, 16])
     assert linear_row.bias.shape == torch.Size([128])
@@ -121,15 +121,12 @@ def check_linear_col_plus_row(lazy_init: bool, seq_parallel: bool, overlap: bool
     with ctx:
         linear_1_copy = nn.Linear(32, 128).cuda()
         linear_2_copy = nn.Linear(128, 32).cuda()
-    linear_col = Linear1D_Col.from_native_module(linear_1_copy,
-                                                 process_group=None,
-                                                 gather_output=False,
-                                                 seq_parallel=seq_parallel,
-                                                 overlap=overlap)
-    linear_row = Linear1D_Row.from_native_module(linear_2_copy,
-                                                 process_group=None,
-                                                 parallel_input=True,
-                                                 seq_parallel=seq_parallel)
+    linear_col = Linear1D_Col.from_native_module(
+        linear_1_copy, process_group=None, gather_output=False, seq_parallel=seq_parallel, overlap=overlap
+    )
+    linear_row = Linear1D_Row.from_native_module(
+        linear_2_copy, process_group=None, parallel_input=True, seq_parallel=seq_parallel
+    )
 
     linear_1.load_state_dict(linear_col.state_dict())
     linear_col.load_state_dict(linear_1.state_dict())
@@ -161,14 +158,17 @@ def check_linear_col_plus_row(lazy_init: bool, seq_parallel: bool, overlap: bool
     # check the input gradients
     assert x_for_shard.grad is not None
     assert x_for_unshard.grad is not None
-    target_unshard_gard = x_for_unshard.grad if seq_parallel is False else torch.chunk(
-        x_for_unshard.grad.clone(), 2, dim=1)[dist.get_rank()]
+    target_unshard_gard = (
+        x_for_unshard.grad
+        if seq_parallel is False
+        else torch.chunk(x_for_unshard.grad.clone(), 2, dim=1)[dist.get_rank()]
+    )
     assert_close(target_unshard_gard, x_for_shard.grad)
 
 
-@parameterize('lazy_init', [False, True])
-@parameterize('seq_parallel', [False, True])
-@parameterize('overlap', [True])
+@parameterize("lazy_init", [False, True])
+@parameterize("seq_parallel", [False, True])
+@parameterize("overlap", [True])
 def run_dist_linear_test(lazy_init, seq_parallel, overlap):
     check_linear_1d_col(lazy_init, seq_parallel, overlap)
     check_linear_1d_row(lazy_init, seq_parallel)
@@ -176,7 +176,7 @@ def run_dist_linear_test(lazy_init, seq_parallel, overlap):
 
 
 def check_dist_linear(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_dist_linear_test()
 
 
@@ -185,5 +185,5 @@ def test_linear():
     spawn(check_dist_linear, nprocs=2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_linear()
@@ -53,16 +53,15 @@ def rearrange(tensor: torch.Tensor, dim: int):
     return rearanged_tensor
 
 
-@parameterize('lazy_init', [False, True])
+@parameterize("lazy_init", [False, True])
 def check_linear_conv_1d_col(lazy_init: bool):
     ctx = LazyInitContext() if lazy_init else nullcontext()
     linear = Conv1D(192, 48).cuda()
     with ctx:
         linear_copy = Conv1D(192, 48).cuda()
-    linear_conv_col = GPT2FusedLinearConv1D_Col.from_native_module(linear_copy,
-                                                                   process_group=None,
-                                                                   gather_output=True,
-                                                                   n_fused=3)
+    linear_conv_col = GPT2FusedLinearConv1D_Col.from_native_module(
+        linear_copy, process_group=None, gather_output=True, n_fused=3
+    )
 
     assert linear.weight.shape == torch.Size([48, 192])
     assert linear.bias.shape == torch.Size([192])
@@ -89,7 +88,7 @@ def check_linear_conv_1d_col(lazy_init: bool):
     assert_close(target_grad, linear_conv_col.weight.grad)
 
 
-@parameterize('lazy_init', [False, True])
+@parameterize("lazy_init", [False, True])
 def check_linear_conv_1d_row(lazy_init: bool):
     ctx = LazyInitContext() if lazy_init else nullcontext()
 
@@ -124,7 +123,7 @@ def check_linear_conv_1d_row(lazy_init: bool):
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 
     # test for linear conv
     check_linear_conv_1d_col()
@@ -136,5 +135,5 @@ def test_linearconv():
     spawn(run_dist, nprocs=2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_linearconv()
@@ -11,13 +11,13 @@ from colossalai.shardformer.layer import VocabParallelEmbedding1D
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 
 
-@parameterize('lazy_init', [False, True])
+@parameterize("lazy_init", [False, True])
 def check_vocab_embedding_1d(lazy_init: bool):
     ctx = LazyInitContext() if lazy_init else nullcontext()
 
-    embedding = nn.Embedding(128, 32).to('cuda')
+    embedding = nn.Embedding(128, 32).to("cuda")
     with ctx:
-        embedding_copy = nn.Embedding(128, 32).to('cuda')
+        embedding_copy = nn.Embedding(128, 32).to("cuda")
     dist_embedding_1d = VocabParallelEmbedding1D.from_native_module(embedding_copy, process_group=None)
 
     assert dist_embedding_1d.weight.shape == torch.Size([64, 32])
@@ -30,7 +30,7 @@ def check_vocab_embedding_1d(lazy_init: bool):
     dist_embedding_1d.load_state_dict(embedding.state_dict())
 
     # check embedding correctness
-    x = torch.randint(0, 128, (4, 32)).to('cuda')
+    x = torch.randint(0, 128, (4, 32)).to("cuda")
     org_out = embedding(x)
     dist_out = dist_embedding_1d(x)
     assert_close(org_out, dist_out)
@@ -45,7 +45,7 @@ def check_vocab_embedding_1d(lazy_init: bool):
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     check_vocab_embedding_1d()
 
 
@@ -54,5 +54,5 @@ def test_vocab_embedding():
     spawn(run_dist, nprocs=2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_vocab_embedding()