Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-01 09:07:51 +00:00)
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
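For context, a minimal sketch of the kind of .pre-commit-config.yaml the message describes; the repository's actual config is not part of this diff, and the hook repos, revisions, and the cuda exclude pattern below are illustrative assumptions:

repos:
  # Python formatter that produces the double-quote / trailing-comma style seen in the hunks below (assumed hook)
  - repo: https://github.com/psf/black
    rev: 23.9.1          # assumed revision
    hooks:
      - id: black
  # C/C++ formatter; CUDA sources excluded, matching "[misc] ignore cuda for clang-format" (assumed pattern)
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v16.0.6         # assumed revision
    hooks:
      - id: clang-format
        exclude: \.cu$

Applying the hooks to the whole tree, as the "[misc] run pre-commit" item describes, would be done with: pre-commit run --all-files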
@@ -22,30 +22,30 @@ def check_p2p_communication():
     if rank == 0:
         p2p.send_forward(tensor)
         p2p.send_forward([tensor])
-        p2p.send_forward({'tensor': tensor})
+        p2p.send_forward({"tensor": tensor})
     else:
         obj = p2p.recv_forward()
         assert torch.equal(obj, tensor)
         obj = p2p.recv_forward()
         assert type(obj) == list and len(obj) == 1 and torch.equal(obj[0], tensor)
         obj = p2p.recv_forward()
-        assert type(obj) == dict and 'tensor' in obj and torch.equal(obj['tensor'], tensor)
+        assert type(obj) == dict and "tensor" in obj and torch.equal(obj["tensor"], tensor)
 
     if rank == 1:
         p2p.send_backward(tensor)
         p2p.send_backward([tensor])
-        p2p.send_backward({'tensor': tensor})
+        p2p.send_backward({"tensor": tensor})
     else:
         obj = p2p.recv_backward()
         assert torch.equal(obj, tensor)
         obj = p2p.recv_backward()
         assert type(obj) == list and len(obj) == 1 and torch.equal(obj[0], tensor)
         obj = p2p.recv_backward()
-        assert type(obj) == dict and 'tensor' in obj and torch.equal(obj['tensor'], tensor)
+        assert type(obj) == dict and "tensor" in obj and torch.equal(obj["tensor"], tensor)
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
+    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
     check_p2p_communication()
 
 
@@ -55,5 +55,5 @@ def test_pipeline_p2p():
     spawn(run_dist, 2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_pipeline_p2p()
@@ -4,36 +4,42 @@ from colossalai.shardformer.policies.t5 import T5BasePolicy
 def test_t5_pipeline_distribution():
     num_test_cases = 8
     test_dict = {
-        'num_encoder_layers': [2, 1, 3, 2, 3, 2, 10, 5],
-        'num_decoder_layers': [2, 8, 0, 2, 1, 5, 6, 22],
-        'num_stages': [2, 2, 2, 4, 4, 4, 8, 8],
-        'decoder_starting_stage': [1, 1, 2, 2, 3, 1, 5, 2]
+        "num_encoder_layers": [2, 1, 3, 2, 3, 2, 10, 5],
+        "num_decoder_layers": [2, 8, 0, 2, 1, 5, 6, 22],
+        "num_stages": [2, 2, 2, 4, 4, 4, 8, 8],
+        "decoder_starting_stage": [1, 1, 2, 2, 3, 1, 5, 2],
     }
 
     for i in range(num_test_cases):
-        _, decoder_starting_stage = T5BasePolicy.distribute_t5_layers(test_dict['num_encoder_layers'][i],
-                                                                      test_dict['num_decoder_layers'][i],
-                                                                      test_dict['num_stages'][i])
-        assert test_dict['decoder_starting_stage'][i] == decoder_starting_stage
+        _, decoder_starting_stage = T5BasePolicy.distribute_t5_layers(
+            test_dict["num_encoder_layers"][i], test_dict["num_decoder_layers"][i], test_dict["num_stages"][i]
+        )
+        assert test_dict["decoder_starting_stage"][i] == decoder_starting_stage
 
 
 def test_t5_pipeline_layers():
     num_test_cases = 4
     test_dict = {
-        'num_encoder_layers': [2, 3, 2, 4],
-        'num_decoder_layers': [2, 0, 2, 8],
-        'num_stages': [2, 2, 4, 4],
-        'layers_per_stage': [[[0, 2], [0, 2]], [[0, 1], [1, 3]], [[0, 1], [1, 2], [0, 1], [1, 2]],
-                             [[0, 4], [0, 3], [3, 6], [6, 8]]]
+        "num_encoder_layers": [2, 3, 2, 4],
+        "num_decoder_layers": [2, 0, 2, 8],
+        "num_stages": [2, 2, 4, 4],
+        "layers_per_stage": [
+            [[0, 2], [0, 2]],
+            [[0, 1], [1, 3]],
+            [[0, 1], [1, 2], [0, 1], [1, 2]],
+            [[0, 4], [0, 3], [3, 6], [6, 8]],
+        ],
     }
 
     for i in range(num_test_cases):
         layers_per_stage, decoder_starting_stage = T5BasePolicy.distribute_t5_layers(
-            test_dict['num_encoder_layers'][i], test_dict['num_decoder_layers'][i], test_dict['num_stages'][i])
+            test_dict["num_encoder_layers"][i], test_dict["num_decoder_layers"][i], test_dict["num_stages"][i]
+        )
 
-        for stage in range(test_dict['num_stages'][i]):
-            start_idx, end_idx = test_dict['layers_per_stage'][i][stage]
-            predicted_start, predicted_end = T5BasePolicy.get_t5_stage_index(layers_per_stage, stage,
-                                                                             decoder_starting_stage)
+        for stage in range(test_dict["num_stages"][i]):
+            start_idx, end_idx = test_dict["layers_per_stage"][i][stage]
+            predicted_start, predicted_end = T5BasePolicy.get_t5_stage_index(
+                layers_per_stage, stage, decoder_starting_stage
+            )
             assert start_idx == predicted_start
             assert end_idx == predicted_end
@@ -4,41 +4,47 @@ from colossalai.shardformer.policies.whisper import WhisperPolicy
 def test_whisper_pipeline_distribution():
     num_test_cases = 8
     test_dict = {
-        'num_encoder_layers': [2, 1, 3, 2, 3, 2, 10, 5],
-        'num_decoder_layers': [2, 8, 0, 2, 1, 5, 6, 22],
-        'num_stages': [2, 2, 2, 4, 4, 4, 8, 8],
-        'decoder_starting_stage': [1, 1, 2, 2, 3, 1, 5, 2]
+        "num_encoder_layers": [2, 1, 3, 2, 3, 2, 10, 5],
+        "num_decoder_layers": [2, 8, 0, 2, 1, 5, 6, 22],
+        "num_stages": [2, 2, 2, 4, 4, 4, 8, 8],
+        "decoder_starting_stage": [1, 1, 2, 2, 3, 1, 5, 2],
     }
 
     for i in range(num_test_cases):
-        _, decoder_starting_stage = WhisperPolicy.distribute_whisper_layers(test_dict['num_encoder_layers'][i],
-                                                                            test_dict['num_decoder_layers'][i],
-                                                                            test_dict['num_stages'][i])
-        assert test_dict['decoder_starting_stage'][i] == decoder_starting_stage
+        _, decoder_starting_stage = WhisperPolicy.distribute_whisper_layers(
+            test_dict["num_encoder_layers"][i], test_dict["num_decoder_layers"][i], test_dict["num_stages"][i]
+        )
+        assert test_dict["decoder_starting_stage"][i] == decoder_starting_stage
 
 
 def test_whisper_pipeline_layers():
     num_test_cases = 4
     test_dict = {
-        'num_encoder_layers': [2, 3, 2, 4],
-        'num_decoder_layers': [2, 0, 2, 8],
-        'num_stages': [2, 2, 4, 4],
-        'layers_per_stage': [[[0, 2], [0, 2]], [[0, 1], [1, 3]], [[0, 1], [1, 2], [0, 1], [1, 2]],
-                             [[0, 4], [0, 3], [3, 6], [6, 8]]]
+        "num_encoder_layers": [2, 3, 2, 4],
+        "num_decoder_layers": [2, 0, 2, 8],
+        "num_stages": [2, 2, 4, 4],
+        "layers_per_stage": [
+            [[0, 2], [0, 2]],
+            [[0, 1], [1, 3]],
+            [[0, 1], [1, 2], [0, 1], [1, 2]],
+            [[0, 4], [0, 3], [3, 6], [6, 8]],
+        ],
     }
 
     for i in range(num_test_cases):
         layers_per_stage, decoder_starting_stage = WhisperPolicy.distribute_whisper_layers(
-            test_dict['num_encoder_layers'][i], test_dict['num_decoder_layers'][i], test_dict['num_stages'][i])
+            test_dict["num_encoder_layers"][i], test_dict["num_decoder_layers"][i], test_dict["num_stages"][i]
+        )
 
-        for stage in range(test_dict['num_stages'][i]):
-            start_idx, end_idx = test_dict['layers_per_stage'][i][stage]
-            predicted_start, predicted_end = WhisperPolicy.get_whisper_stage_index(layers_per_stage, stage,
-                                                                                   decoder_starting_stage)
+        for stage in range(test_dict["num_stages"][i]):
+            start_idx, end_idx = test_dict["layers_per_stage"][i][stage]
+            predicted_start, predicted_end = WhisperPolicy.get_whisper_stage_index(
+                layers_per_stage, stage, decoder_starting_stage
+            )
             assert start_idx == predicted_start
             assert end_idx == predicted_end
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_whisper_pipeline_distribution()
     test_whisper_pipeline_layers()
@@ -16,7 +16,6 @@ from colossalai.testing.random import seed_all
 
 
 class MlpModel(nn.Module):
-
     def __init__(self):
         super(MlpModel, self).__init__()
         self.linear1 = nn.Linear(4, 8)
@@ -40,19 +39,20 @@ class MlpModel(nn.Module):
         return x
 
 
-def pp_linear_fwd(forward,
-                  data: torch.Tensor = None,
-                  input_obj: torch.Tensor = None,
-                  stage_mgr: PipelineStageManager = None,
-                  num_chunks: int = None,
-                  model_chunk_id: int = None):
-
+def pp_linear_fwd(
+    forward,
+    data: torch.Tensor = None,
+    input_obj: torch.Tensor = None,
+    stage_mgr: PipelineStageManager = None,
+    num_chunks: int = None,
+    model_chunk_id: int = None,
+):
     if stage_mgr.is_first_stage() and model_chunk_id == 0:
-        return {'input_obj': forward(data)}
+        return {"input_obj": forward(data)}
     elif stage_mgr.is_last_stage() and model_chunk_id == num_chunks - 1:
         return forward(input_obj)
     else:
-        return {'input_obj': forward(input_obj)}
+        return {"input_obj": forward(input_obj)}
 
 
 @parameterize("num_micro_batches", [4, 8, 12])
@@ -84,10 +84,11 @@ def examine_pp(num_micro_batches):
         if idx % (world_size) == local_rank:
             sub_model._forward = sub_model.forward
             sub_model.forward = MethodType(
-                partial(pp_linear_fwd,
-                        stage_mgr=stage_manager,
-                        num_chunks=NUM_CHUNKS,
-                        model_chunk_id=len(sharded_model)), sub_model._forward)
+                partial(
+                    pp_linear_fwd, stage_mgr=stage_manager, num_chunks=NUM_CHUNKS, model_chunk_id=len(sharded_model)
+                ),
+                sub_model._forward,
+            )
             sharded_model.append(sub_model.cuda())
 
     # create optimizer
@@ -109,16 +110,13 @@ def examine_pp(num_micro_batches):
     torch_loss = criterion(torch_output, _)
     torch_loss.backward()
 
-    pp_ret = schedule.forward_backward_step(sharded_model,
-                                            iter(input_list),
-                                            criterion,
-                                            pp_optimizer,
-                                            return_loss=True,
-                                            return_outputs=True)
+    pp_ret = schedule.forward_backward_step(
+        sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True, return_outputs=True
+    )
 
     # check loss
     if stage_manager.is_last_stage():
-        assert torch.allclose(torch_loss, pp_ret['loss'])
+        assert torch.allclose(torch_loss, pp_ret["loss"])
 
     # check gradients
     torch_grad = []
@@ -147,7 +145,7 @@ def examine_pp(num_micro_batches):
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
+    colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
     examine_pp()
 
 
@@ -157,5 +155,5 @@ def test_pp():
     spawn(run_dist, 4)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_pp()
@@ -16,7 +16,6 @@ from colossalai.testing.random import seed_all
 
 
 class MlpModel(nn.Module):
-
     def __init__(self):
         super(MlpModel, self).__init__()
         self.linear1 = nn.Linear(4, 8)
@@ -28,17 +27,15 @@ class MlpModel(nn.Module):
         return x
 
 
-def pp_linear_fwd(forward,
-                  data: torch.Tensor = None,
-                  input_obj: torch.Tensor = None,
-                  stage_mgr: PipelineStageManager = None):
-
+def pp_linear_fwd(
+    forward, data: torch.Tensor = None, input_obj: torch.Tensor = None, stage_mgr: PipelineStageManager = None
+):
     if stage_mgr.is_first_stage():
-        return {'input_obj': forward(data)}
+        return {"input_obj": forward(data)}
     elif stage_mgr.is_last_stage():
         return forward(input_obj)
     else:
-        return {'input_obj': forward(input_obj)}
+        return {"input_obj": forward(input_obj)}
 
 
 def examine_pp():
@@ -89,16 +86,13 @@ def examine_pp():
     torch_loss = criterion(torch_output, _)
     torch_loss.backward()
 
-    pp_ret = schedule.forward_backward_step(sharded_model,
-                                            iter(input_list),
-                                            criterion,
-                                            pp_optimizer,
-                                            return_loss=True,
-                                            return_outputs=True)
+    pp_ret = schedule.forward_backward_step(
+        sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True, return_outputs=True
+    )
 
     # check loss
     if stage_manager.is_last_stage():
-        assert torch.allclose(torch_loss, pp_ret['loss'])
+        assert torch.allclose(torch_loss, pp_ret["loss"])
 
     # check gradients
     torch_grad = []
@@ -120,7 +114,7 @@ def examine_pp():
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
+    colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
     examine_pp()
 
 
@@ -130,5 +124,5 @@ def test_pp():
     spawn(run_dist, 2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_pp()
@@ -8,9 +8,9 @@ def test_get_batch_size():
     assert get_batch_size(tensor) == 2
     assert get_batch_size([tensor]) == 2
     assert get_batch_size((1, tensor)) == 2
-    assert get_batch_size({'tensor': tensor}) == 2
-    assert get_batch_size({'dummy': [1], 'tensor': tensor}) == 2
-    assert get_batch_size({'tensor': [tensor]}) == 2
+    assert get_batch_size({"tensor": tensor}) == 2
+    assert get_batch_size({"dummy": [1], "tensor": tensor}) == 2
+    assert get_batch_size({"tensor": [tensor]}) == 2
 
 
 def test_get_micro_batch():
@@ -26,12 +26,12 @@ def test_get_micro_batch():
     micro_batch = get_micro_batch([x, y], 1, 1)
     assert torch.equal(micro_batch[0], x[1:2])
     assert torch.equal(micro_batch[1], y[1:2])
-    micro_batch = get_micro_batch({'x': x, 'y': y}, 0, 1)
-    assert torch.equal(micro_batch['x'], x[0:1])
-    assert torch.equal(micro_batch['y'], y[0:1])
-    micro_batch = get_micro_batch({'x': x, 'y': y}, 1, 1)
-    assert torch.equal(micro_batch['x'], x[1:2])
-    assert torch.equal(micro_batch['y'], y[1:2])
+    micro_batch = get_micro_batch({"x": x, "y": y}, 0, 1)
+    assert torch.equal(micro_batch["x"], x[0:1])
+    assert torch.equal(micro_batch["y"], y[0:1])
+    micro_batch = get_micro_batch({"x": x, "y": y}, 1, 1)
+    assert torch.equal(micro_batch["x"], x[1:2])
+    assert torch.equal(micro_batch["y"], y[1:2])
 
 
 def test_merge_batch():
@@ -42,6 +42,6 @@ def test_merge_batch():
     merged = merge_batch([[x[0:1], y[0:1]], [x[1:2], y[1:2]]])
     assert torch.equal(merged[0], x)
     assert torch.equal(merged[1], y)
-    merged = merge_batch([{'x': x[0:1], 'y': y[0:1]}, {'x': x[1:2], 'y': y[1:2]}])
-    assert torch.equal(merged['x'], x)
-    assert torch.equal(merged['y'], y)
+    merged = merge_batch([{"x": x[0:1], "y": y[0:1]}, {"x": x[1:2], "y": y[1:2]}])
+    assert torch.equal(merged["x"], x)
+    assert torch.equal(merged["y"], y)
@@ -64,7 +64,7 @@ def check_stage_manager():
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
+    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
     check_stage_manager()
 
 
@@ -74,5 +74,5 @@ def test_pipeline_stage_manager():
     spawn(run_dist, 4)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_pipeline_stage_manager()