mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-16 14:41:53 +00:00
[misc] refactor launch API and tensor constructor (#5666)
* [misc] remove config arg from initialize * [misc] remove old tensor constructor * [plugin] add npu support for ddp * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [devops] fix doc test ci * [test] fix test launch * [doc] update launch doc --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
@@ -77,7 +77,7 @@ def run_naive_amp():
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
colossalai.legacy.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
|
||||
colossalai.legacy.launch(rank=rank, world_size=world_size, port=port, host="localhost")
|
||||
run_naive_amp()
|
||||
|
||||
|
||||
|
@@ -76,7 +76,7 @@ def run_torch_amp():
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
colossalai.legacy.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
|
||||
colossalai.legacy.launch(rank=rank, world_size=world_size, port=port, host="localhost")
|
||||
run_torch_amp()
|
||||
|
||||
|
||||
|
@@ -16,7 +16,7 @@ torch.manual_seed(123)
|
||||
|
||||
def check_layer(rank, world_size, port):
|
||||
disable_existing_loggers()
|
||||
launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl", verbose=False)
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl", verbose=False)
|
||||
rank = gpc.get_local_rank(ParallelMode.PIPELINE)
|
||||
|
||||
if rank == 0:
|
||||
|
@@ -48,7 +48,7 @@ def check_all_reduce():
|
||||
|
||||
|
||||
def check_layer(rank, world_size, port):
|
||||
launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
assert dist.get_rank() == gpc.get_global_rank()
|
||||
print("Rank {} / {}".format(dist.get_rank(), dist.get_world_size()))
|
||||
|
@@ -88,7 +88,7 @@ def check_send_recv_forward_backward():
|
||||
|
||||
|
||||
def check_layer(rank, world_size, port):
|
||||
launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
check_send_recv_forward()
|
||||
check_send_recv_backward()
|
||||
check_send_recv_forward_backward()
|
||||
|
@@ -104,7 +104,7 @@ def check_small_pipeline():
|
||||
|
||||
def check_layer(rank, world_size, port):
|
||||
disable_existing_loggers()
|
||||
launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
disable_existing_loggers()
|
||||
# check_send_recv_forward()
|
||||
|
@@ -17,7 +17,7 @@ CONFIG = dict(
|
||||
|
||||
def check_layer(rank, world_size, port):
|
||||
disable_existing_loggers()
|
||||
launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
check_linear_col()
|
||||
check_linear_row()
|
||||
|
@@ -50,7 +50,7 @@ def check_layer():
|
||||
|
||||
def check_layer_and_operation(rank, world_size, port):
|
||||
disable_existing_loggers()
|
||||
launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
|
@@ -38,7 +38,7 @@ def check_layer():
|
||||
|
||||
def check_layer_and_operation(rank, world_size, port):
|
||||
disable_existing_loggers()
|
||||
launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
|
@@ -44,7 +44,7 @@ def check_layer():
|
||||
|
||||
def check_layer_and_operation(rank, world_size, port):
|
||||
disable_existing_loggers()
|
||||
launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
torch.backends.cudnn.deterministic = True
|
||||
|
@@ -378,7 +378,7 @@ def run_parallel_freq_aware_embed_columnwise(rank, world_size):
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
colossalai.legacy.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
colossalai.legacy.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
# run_parallel_freq_aware_embed_columnwise(rank, world_size)
|
||||
run_parallel_freq_aware_embed_tablewise(rank, world_size)
|
||||
|
||||
|
@@ -48,7 +48,7 @@ def check_mem():
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
colossalai.legacy.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
colossalai.legacy.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
check_mem()
|
||||
run()
|
||||
|
||||
|
@@ -9,7 +9,7 @@ from colossalai.testing import free_port
|
||||
|
||||
@pytest.mark.skip
|
||||
def test_multiinheritance():
|
||||
colossalai.legacy.launch(config={}, rank=0, world_size=1, host="localhost", port=free_port(), backend="nccl")
|
||||
colossalai.legacy.launch(rank=0, world_size=1, host="localhost", port=free_port(), backend="nccl")
|
||||
colo_param = ColoParameter(None, requires_grad=True)
|
||||
assert colo_param.dist_spec.placement.value == "r"
|
||||
assert isinstance(colo_param, ColoTensor)
|
||||
|
@@ -86,7 +86,7 @@ def check_comm(size, rank, prev_rank, next_rank, logger):
|
||||
|
||||
|
||||
def run_check(rank, world_size, port):
|
||||
launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
logger = get_dist_logger()
|
||||
rank = gpc.get_global_rank()
|
||||
prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
|
||||
|
@@ -23,7 +23,7 @@ CONFIG = dict(NUM_MICRO_BATCHES=2, parallel=dict(pipeline=dict(size=2), tensor=d
|
||||
|
||||
|
||||
def run_schedule(rank, world_size, port):
|
||||
launch(config=CONFIG, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
# build model
|
||||
model = resnet18(num_classes=10)
|
||||
|
@@ -43,7 +43,7 @@ def check_checkpoint_1d(rank, world_size, port):
|
||||
)
|
||||
|
||||
disable_existing_loggers()
|
||||
launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
m1 = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 4))
|
||||
sd1 = m1.state_dict()
|
||||
|
@@ -43,7 +43,7 @@ def check_checkpoint_2d(rank, world_size, port):
|
||||
)
|
||||
|
||||
disable_existing_loggers()
|
||||
launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
m1 = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 4))
|
||||
sd1 = m1.state_dict()
|
||||
|
@@ -43,7 +43,7 @@ def check_checkpoint_2p5d(rank, world_size, port):
|
||||
)
|
||||
|
||||
disable_existing_loggers()
|
||||
launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
m1 = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 4))
|
||||
sd1 = m1.state_dict()
|
||||
|
@@ -43,7 +43,7 @@ def check_checkpoint_3d(rank, world_size, port):
|
||||
)
|
||||
|
||||
disable_existing_loggers()
|
||||
launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
m1 = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 4))
|
||||
sd1 = m1.state_dict()
|
||||
|
@@ -14,7 +14,7 @@ def _run_colo_set_process_memory_fraction_and_colo_device_memory_capacity():
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
colossalai.legacy.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
colossalai.legacy.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
_run_colo_set_process_memory_fraction_and_colo_device_memory_capacity()
|
||||
|
||||
|
||||
|
@@ -62,7 +62,7 @@ def run_grad_clip_norm(world_size: int, dtype: torch.dtype, device: str, norm_ty
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
disable_existing_loggers()
|
||||
colossalai.legacy.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
colossalai.legacy.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
run_grad_clip_norm(world_size=world_size)
|
||||
|
||||
|
||||
|
@@ -7,7 +7,7 @@ from colossalai.testing import rerun_if_address_is_in_use, spawn
|
||||
|
||||
|
||||
def run_tensor_move(rank, world_size, port):
|
||||
colossalai.legacy.launch(config={}, rank=0, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
colossalai.legacy.launch(rank=0, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
|
||||
src_t = torch.ones(2, 3).cuda()
|
||||
tgt_t = torch.zeros(2, 3)
|
||||
|
Reference in New Issue
Block a user