Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-05 11:02:05 +00:00)
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
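The hook configuration itself is not shown in this commit view, but the quote normalization and trailing-comma line splits in the diff below are consistent with a Black-style formatter run through `pre-commit run --all-files`. A minimal, illustrative sketch of that rewrite (assuming the `black` package is available; the snippet is invented for demonstration and is not taken from the repo):

```python
# Illustrative only: reproduce the kind of rewrite seen in the diff below,
# assuming Black is among the configured pre-commit hooks (not confirmed here).
import black

src = "obj = [torch.randn(3,)]\nprint('Rank {} / {}'.format(0, 1))\n"

# Black normalizes string quotes to double quotes and, because of the
# "magic trailing comma" inside torch.randn(3,), explodes the call
# (and the enclosing list) across multiple lines.
formatted = black.format_str(src, mode=black.FileMode())
print(formatted)
```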
@@ -16,11 +16,15 @@ torch.manual_seed(123)
 
 def check_layer(rank, world_size, port):
     disable_existing_loggers()
-    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl', verbose=False)
+    launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl", verbose=False)
     rank = gpc.get_local_rank(ParallelMode.PIPELINE)
 
     if rank == 0:
-        obj = [torch.randn(3,)]
+        obj = [
+            torch.randn(
+                3,
+            )
+        ]
         _send_object(obj, 1)
 
     if rank == 1:
@@ -30,7 +34,11 @@ def check_layer(rank, world_size, port):
         _recv_object(3)
 
     if rank == 3:
-        obj = [torch.randn(3,)]
+        obj = [
+            torch.randn(
+                3,
+            )
+        ]
         _send_object(obj, 2)
 
     gpc.destroy()
@@ -43,5 +51,5 @@ def test_object_list_p2p():
     spawn(check_layer, world_size)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_object_list_p2p()
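The file above exercises point-to-point transfer of a small object list between pipeline ranks through ColossalAI's internal `_send_object`/`_recv_object` helpers. As background only, here is a self-contained sketch of plain `torch.distributed` blocking send/recv between two ranks; it is not the helpers' actual implementation, whose internals are not part of this diff.

```python
# Background sketch only: plain torch.distributed point-to-point send/recv
# between two ranks (not ColossalAI's _send_object/_recv_object helpers).
import torch
import torch.distributed as dist

def p2p_demo():
    # Assumes init_process_group was already called with world_size == 2 and a
    # Gloo backend (CPU tensors); with NCCL the tensors would live on each
    # rank's GPU instead.
    rank = dist.get_rank()
    if rank == 0:
        payload = torch.randn(3)
        dist.send(payload, dst=1)      # blocking send to rank 1
    elif rank == 1:
        buffer = torch.empty(3)        # receiver pre-allocates the buffer
        dist.recv(buffer, src=0)       # blocking receive from rank 0
        print(f"rank 1 received: {buffer}")
```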
@@ -17,41 +17,41 @@ SIZE = 8
 def check_all_gather():
     tensor = torch.tensor([dist.get_rank() * SIZE + j for j in range(SIZE)])
     tensor = tensor.to(get_current_device())
-    print('Before: Rank {0} - {1}'.format(dist.get_rank(), tensor))
+    print("Before: Rank {0} - {1}".format(dist.get_rank(), tensor))
     tensor, op = all_gather(tensor, 0, ParallelMode.GLOBAL, async_op=True)
-    print('After: Rank {0} - {1}'.format(dist.get_rank(), tensor))
+    print("After: Rank {0} - {1}".format(dist.get_rank(), tensor))
     op.wait()
-    print('Complete: Rank {0} - {1}'.format(dist.get_rank(), tensor))
+    print("Complete: Rank {0} - {1}".format(dist.get_rank(), tensor))
     torch.cuda.synchronize()
 
 
 def check_reduce_scatter():
     tensor = torch.tensor([dist.get_rank() * SIZE + j for j in range(SIZE)])
     tensor = tensor.to(get_current_device())
-    print('Before: Rank {0} - {1}'.format(dist.get_rank(), tensor))
+    print("Before: Rank {0} - {1}".format(dist.get_rank(), tensor))
     tensor, op = reduce_scatter(tensor, 0, ParallelMode.GLOBAL, async_op=True)
-    print('After: Rank {0} - {1}'.format(dist.get_rank(), tensor))
+    print("After: Rank {0} - {1}".format(dist.get_rank(), tensor))
     op.wait()
-    print('Complete: Rank {0} - {1}'.format(dist.get_rank(), tensor))
+    print("Complete: Rank {0} - {1}".format(dist.get_rank(), tensor))
     torch.cuda.synchronize()
 
 
 def check_all_reduce():
     tensor = torch.tensor([dist.get_rank() * SIZE + j for j in range(SIZE)])
     tensor = tensor.to(get_current_device())
-    print('Before: Rank {0} - {1}'.format(dist.get_rank(), tensor))
+    print("Before: Rank {0} - {1}".format(dist.get_rank(), tensor))
     tensor, op = all_reduce(tensor, ParallelMode.GLOBAL, async_op=True)
-    print('After: Rank {0} - {1}'.format(dist.get_rank(), tensor))
+    print("After: Rank {0} - {1}".format(dist.get_rank(), tensor))
     op.wait()
-    print('Complete: Rank {0} - {1}'.format(dist.get_rank(), tensor))
+    print("Complete: Rank {0} - {1}".format(dist.get_rank(), tensor))
     torch.cuda.synchronize()
 
 
 def check_layer(rank, world_size, port):
-    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 
     assert dist.get_rank() == gpc.get_global_rank()
-    print('Rank {} / {}'.format(dist.get_rank(), dist.get_world_size()))
+    print("Rank {} / {}".format(dist.get_rank(), dist.get_world_size()))
 
     check_all_gather()
     check_reduce_scatter()
@@ -67,5 +67,5 @@ def test_comm():
     spawn(check_layer, 4)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_comm()
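The tests above follow a common pattern: launch a collective with `async_op=True`, keep the returned handle, and call `wait()` before reading the result. A minimal background sketch of the same pattern with plain `torch.distributed` (not the ColossalAI `all_gather`/`reduce_scatter`/`all_reduce` wrappers used in the test):

```python
# Background sketch only: the async-collective pattern exercised above,
# written against plain torch.distributed rather than the ColossalAI wrappers.
import torch
import torch.distributed as dist

def async_all_reduce_demo():
    # Assumes init_process_group was called with a Gloo group (CPU tensors);
    # with NCCL, the tensor would be placed on this rank's GPU first.
    tensor = torch.arange(8, dtype=torch.float32) + dist.get_rank()
    work = dist.all_reduce(tensor, async_op=True)  # returns a Work handle immediately
    # ... other computation could overlap with the communication here ...
    work.wait()                                    # block until the all-reduce completes
    print(f"rank {dist.get_rank()} reduced tensor: {tensor}")
```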
@@ -27,7 +27,7 @@ grad_list = [torch.rand(3, 3) for i in range(LIST_LENGTH)]
 
 def check_send_recv_forward():
     if gpc.get_local_rank(ParallelMode.PIPELINE) == 0:
-        device = torch.device('cuda:0')
+        device = torch.device("cuda:0")
         data_to_send = data.to(device)
         data_list_to_send = []
         for data_in_list in data_list:
@@ -35,7 +35,7 @@ def check_send_recv_forward():
         send_forward(data_to_send)
         send_forward(data_list_to_send)
     else:
-        device = torch.device('cuda:1')
+        device = torch.device("cuda:1")
         data_recv = recv_forward(TENSOR_SIZE)
         data_list_recv = recv_forward(TENSOR_SIZE_LIST)
         data_to_check = data.to(device)
@@ -47,7 +47,7 @@ def check_send_recv_forward():
 
 def check_send_recv_backward():
     if gpc.get_local_rank(ParallelMode.PIPELINE) == 0:
-        device = torch.device('cuda:0')
+        device = torch.device("cuda:0")
         grad_recv = recv_backward(TENSOR_SIZE)
         grad_list_recv = recv_backward(TENSOR_SIZE_LIST)
         grad_to_check = grad.to(device)
@@ -56,7 +56,7 @@ def check_send_recv_backward():
             grad_to_check = grad_send.to(device)
             assert grad_recv.equal(grad_to_check)
     else:
-        device = torch.device('cuda:1')
+        device = torch.device("cuda:1")
         grad_to_send = grad.to(device)
         grad_list_to_send = []
         for grad_in_list in grad_list:
@@ -67,7 +67,7 @@ def check_send_recv_backward():
 
 def check_send_recv_forward_backward():
     if gpc.get_local_rank(ParallelMode.PIPELINE) == 0:
-        device = torch.device('cuda:0')
+        device = torch.device("cuda:0")
         data_list_to_send = []
         for data_in_list in data_list:
             data_list_to_send.append(data_in_list.to(device))
@@ -77,7 +77,7 @@ def check_send_recv_forward_backward():
             grad_to_check = grad_send.to(device)
             assert grad_recv.equal(grad_to_check)
     else:
-        device = torch.device('cuda:1')
+        device = torch.device("cuda:1")
         grad_list_to_send = []
         for grad_in_list in grad_list:
             grad_list_to_send.append(grad_in_list.to(device))
@@ -88,7 +88,7 @@ def check_send_recv_forward_backward():
 
 
 def check_layer(rank, world_size, port):
-    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     check_send_recv_forward()
     check_send_recv_backward()
     check_send_recv_forward_backward()
@@ -102,5 +102,5 @@ def test_object_list_p2p():
     spawn(check_layer, 2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_object_list_p2p()
@@ -32,7 +32,7 @@ def check_send_recv_forward():
     local_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
 
     if local_rank == 0:
-        device = torch.device('cuda:0')
+        device = torch.device("cuda:0")
         data_to_send = data.to(device)
         data_list_to_send = []
         for data_in_list in data_list:
@@ -42,7 +42,7 @@ def check_send_recv_forward():
         send_forward(data_list_to_send, scatter_gather_tensors=use_scatter_gather_tensors)
 
     elif local_rank == 1:
-        device = torch.device('cuda:1')
+        device = torch.device("cuda:1")
 
         data_recv = recv_forward(TENSOR_SIZE, scatter_gather_tensors=use_scatter_gather_tensors)
         data_list_recv = recv_forward(TENSOR_SIZE_LIST, scatter_gather_tensors=use_scatter_gather_tensors)
@@ -60,7 +60,7 @@ def check_send_recv_forward():
 def check_send_recv_backward():
     disable_existing_loggers()
     if gpc.get_local_rank(ParallelMode.PIPELINE) == 0:
-        device = torch.device('cuda:0')
+        device = torch.device("cuda:0")
         grad_recv = recv_backward(TENSOR_SIZE)
         grad_list_recv = recv_backward(TENSOR_SIZE_LIST)
 
@@ -73,7 +73,7 @@ def check_send_recv_backward():
             grad_to_check = grad_send.to(device)
             assert grad_recv.equal(grad_to_check)
     else:
-        device = torch.device('cuda:1')
+        device = torch.device("cuda:1")
         grad_to_send = grad.to(device)
         grad_list_to_send = []
         for grad_in_list in grad_list:
@@ -104,7 +104,7 @@ def check_small_pipeline():
 
 def check_layer(rank, world_size, port):
     disable_existing_loggers()
-    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 
     disable_existing_loggers()
     # check_send_recv_forward()
@@ -120,6 +120,6 @@ def test_object_list_p2p():
     spawn(check_layer, world_size)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     disable_existing_loggers()
     test_object_list_p2p()