[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692)

* [legacy] remove cli of benchmark and update optim (#4690)

* [legacy] remove cli of benchmark and update optim

* [doc] fix cli doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
Hongxin Liu
2023-09-18 16:31:06 +08:00
committed by GitHub
parent 32e7f99416
commit b5f9e37c70
342 changed files with 2919 additions and 4182 deletions


@@ -1,150 +0,0 @@
import argparse
import os
import warnings

import torch
import torch.distributed as dist
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from colorama import Back, Style
from torch import nn
from torch._C._distributed_rpc import _is_current_rpc_agent_set
from torch.optim import SGD, Adam, Optimizer, RMSprop

from colossalai import launch
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.pipeline_process_group import ppg

# torch exposes no public API for this check, so reuse the private helper
rpc_is_initialized = _is_current_rpc_agent_set
def color_debug(text, prefix=' ', color='blue'):
color = color.upper()
print(getattr(Back, color), prefix, Style.RESET_ALL, text)
class MLP(nn.Module):
def __init__(self, dim: int, layers: int):
super().__init__()
self.layers = torch.nn.ModuleList()
for _ in range(layers):
self.layers.append(nn.Linear(dim, dim, bias=False))
def forward(self, x):
for layer in self.layers:
x = layer(x)
return x.sum()
class DAG_MLP(nn.Module):
def __init__(self, dim: int, layers: int):
super().__init__()
self.layers = torch.nn.ModuleList()
self.dag_layer = nn.Linear(dim, dim, bias=False)
for _ in range(layers):
self.layers.append(nn.Linear(dim, dim, bias=False))
def forward(self, x, y):
for layer in self.layers:
x = layer(x)
y = self.dag_layer(y)
return x.sum(), y.sum()
class RpcTestModel(nn.Module):
def __init__(self, stage_id, actual_stage_num, feat_num, h) -> None:
super().__init__()
self.rank = stage_id
self.is_last_rank = stage_id == actual_stage_num - 1
self.linear_name = f'linear_{stage_id}'
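        # stage 0 maps features to the hidden size, the last stage maps hidden
        # to a scalar output, and every intermediate stage is a square projection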
if stage_id == 0:
linear = nn.Linear(feat_num, h)
elif stage_id == actual_stage_num - 1:
linear = nn.Linear(h, 1)
else:
linear = nn.Linear(h, h)
setattr(self, self.linear_name, linear)
def forward(self, x) -> torch.Tensor:
linear: nn.Module = getattr(self, self.linear_name)
out: torch.Tensor = linear(x)
if self.is_last_rank:
out = out.sum()
return out
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--epoch', type=int, default=1)
parser.add_argument('--world_size', type=int, default=2)
parser.add_argument('--batch_size', type=int, default=16)
parser.add_argument('--dp_degree', type=int, default=1)
parser.add_argument('--tp_degree', type=int, default=1)
parser.add_argument('--num_microbatches', type=int, default=2)
parser.add_argument('--chunk', type=int, default=1)
parser.add_argument('--use_checkpoint', action='store_true')
parser.add_argument('--optimizer', type=str, choices=['SGD', 'Adam', 'RMSprop'], default='SGD')
parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda')
parser.add_argument('--master_addr', type=str, default='localhost')
parser.add_argument('--master_port', type=str, default='29020')
    parser.add_argument('--num_worker_threads', type=int, default=128)
return parser.parse_args()
def pg_parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--world_size', type=int, default=4)
parser.add_argument('--dp_degree', type=int, default=2)
parser.add_argument('--tp_degree', type=int, default=1)
parser.add_argument('--chunk', type=int, default=1)
    parser.add_argument('--num_worker_threads', type=int, default=128)
parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda')
parser.add_argument('--master_addr', type=str, default='localhost')
parser.add_argument('--master_port', type=str, default='29020')
return parser.parse_args()
def run_worker(rank, args, master_func):
os.environ['MASTER_ADDR'] = args.master_addr
os.environ['MASTER_PORT'] = args.master_port
device = args.device
world_size = args.world_size
dp_degree = args.dp_degree
tp_degree = args.tp_degree
num_worker_threads = args.num_worker_threads
host = args.master_addr
port = args.master_port
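    # NCCL only supports CUDA tensors, so fall back to Gloo on CPU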
backend = 'nccl' if device == 'cuda' else 'gloo'
disable_existing_loggers()
launch(dict(), rank, world_size, host, int(port), backend, verbose=False)
ppg.set_global_info(rank=rank,
world_size=world_size,
dp_degree=dp_degree,
tp_degree=tp_degree,
num_worker_threads=num_worker_threads,
device=device)
    # in RPC mode, only rank 0 runs the master function; other ranks just serve RPC requests
if rank == 0:
master_func(args)
    # rpc.shutdown() blocks until every process has finished, acting as a barrier
if rpc_is_initialized():
rpc.shutdown()
else:
warnings.warn("RPC has not been initialized")
def rpc_run(args, master_func):
world_size = args.world_size
assert args.num_microbatches >= args.world_size, "num_microbatches cannot be fewer than world_size!"
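    # one process per rank; rank 0 additionally drives master_func (see run_worker)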
mp.spawn(run_worker, args=(args, master_func), nprocs=world_size)


@@ -1,80 +0,0 @@
import torch
from torch import nn
import torch.autograd as autograd
from colossalai.pipeline.rpc import ChimeraPipelineEngine
from colossalai.testing import assert_close
from rpc_test_utils import rpc_run, parse_args, RpcTestModel
# global dimensions shared by every model partition
feat_num = 100
h = 100
def partition(pp_rank: int, chunk: int, stage_num: int):
torch.manual_seed(1024)
partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
return partition
def run_master(args):
torch.manual_seed(100)
epoch = args.epoch
device = args.device
stage_num = args.world_size
chunk = 1
num_microbatches = args.num_microbatches
use_checkpoint = False
sample_num = 1024
batch_size = 1024
assert sample_num % batch_size == 0
engine = ChimeraPipelineEngine(partition_fn=partition,
stage_num=stage_num,
num_microbatches=num_microbatches,
device=device,
checkpoint=use_checkpoint)
engine.initialize_optimizer(torch.optim.Adam, lr=1e-3)
input_sample = torch.randn((sample_num, feat_num), device=device)
forward_result = engine.forward_backward(input_sample)
cuda_rpc_result = []
single_result = []
actual_stage_num = engine._get_actual_stage_num()
# compute forward result and backward grad of parameters in cuda rpc
cuda_rpc_result.append(sum(forward_result[0]))
grad = engine.remote_grad()
for stage_id in range(actual_stage_num):
for p in grad[stage_id]:
cuda_rpc_result.append(p)
# compute forward result and backward grad of parameters just in rank_0
test_model = nn.Sequential(
*[partition(pp_rank, chunk, actual_stage_num) for pp_rank in range(actual_stage_num)]).to(device)
# input_sample = input_sample[len(input_sample) // 2:]
input_sample = input_sample.requires_grad_()
out_val = test_model(input_sample).sum()
autograd.backward(out_val)
single_result.append(out_val)
for p in test_model.parameters():
single_result.append(p.grad)
# print("my")
# print(cuda_rpc_result[1])
# print("answer:")
# print(single_result[1])
# assert len(cuda_rpc_result) == len(single_result)
# for r_c, r_s in zip(cuda_rpc_result, single_result):
# assert_close(r_c, r_s, 0.001, 0.001)
if __name__ == "__main__":
args = parse_args()
rpc_run(args, run_master)


@@ -1,81 +0,0 @@
import torch
from torch import nn
from torch import autograd
from torch.optim import SGD, Adam, RMSprop, Optimizer
from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
from colossalai.testing import assert_close
from rpc_test_utils import rpc_run, parse_args, RpcTestModel
# global dimensions shared by every model partition
feat_num = 100
h = 100
def partition(pp_rank: int, chunk: int, stage_num: int):
torch.manual_seed(1024)
partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
return partition
def run_master(args):
torch.manual_seed(100)
device = args.device
stage_num = args.world_size
chunk = args.chunk
actual_stage_num = stage_num * chunk
use_checkpoint = args.use_checkpoint
num_microbatches = args.num_microbatches
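    # the optimizer classes imported at module level are looked up by name here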
optimizer_class = globals()[args.optimizer]
lr = 1e-3
sample_num = 1024
batch_size = 1024
assert sample_num % batch_size == 0
input_sample = torch.randn((sample_num, feat_num), device=device)
engine = OneFOneBPipelineEngine(partition_fn=partition,
stage_num=stage_num,
num_microbatches=num_microbatches,
device=device,
chunk=chunk,
checkpoint=use_checkpoint)
engine.initialize_optimizer(optimizer_class, lr=lr)
_ = engine.forward_backward(input_sample)
cuda_rpc_result = []
single_result = []
actual_stage_num = engine._get_actual_stage_num()
# compute parameters after updating in cuda rpc
parameters = engine.remote_parameters()
for stage_id in range(actual_stage_num):
for p in parameters[stage_id]:
cuda_rpc_result.append(p)
# compute forward result and backward grad of parameters just in rank_0
test_model = nn.Sequential(
*[partition(pp_rank, chunk, actual_stage_num) for pp_rank in range(actual_stage_num)]).to(device)
optimizer: Optimizer = optimizer_class(test_model.parameters(), lr=lr)
input_sample = input_sample.requires_grad_()
out_val = test_model(input_sample).sum()
autograd.backward(out_val)
optimizer.step()
optimizer.zero_grad()
for p in test_model.parameters():
single_result.append(p)
assert len(cuda_rpc_result) == len(single_result)
for r_c, r_s in zip(cuda_rpc_result, single_result):
assert_close(r_c, r_s, 0.001, 0.001)
if __name__ == "__main__":
args = parse_args()
rpc_run(args, run_master)


@@ -1,48 +0,0 @@
import torch
from torch import nn
from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
from rpc_test_utils import rpc_run, parse_args, RpcTestModel
# global dimensions shared by every model partition
feat_num = 100
h = 100
def partition(pp_rank: int, chunk: int, stage_num: int):
torch.manual_seed(1024)
partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
return partition
def run_master(args):
torch.manual_seed(100)
epoch = args.epoch
device = args.device
stage_num = args.world_size
chunk = args.chunk
num_microbatches = args.num_microbatches
use_checkpoint = args.use_checkpoint
sample_num = 1024
batch_size = 1024
assert sample_num % batch_size == 0
input_sample = torch.randn((sample_num, feat_num), device=device)
engine = OneFOneBPipelineEngine(partition_fn=partition,
stage_num=stage_num,
num_microbatches=num_microbatches,
device=device,
chunk=chunk,
checkpoint=use_checkpoint)
for _ in range(epoch):
_ = engine.forward_backward(input_sample, forward_only=False)
if __name__ == "__main__":
args = parse_args()
rpc_run(args, run_master)


@@ -1,73 +0,0 @@
import torch
from torch import nn
from torch import autograd
from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
from colossalai.testing import assert_close
from rpc_test_utils import rpc_run, parse_args, RpcTestModel
feat_num = 100
h = 100
def partition(pp_rank: int, chunk: int, stage_num: int):
torch.manual_seed(1024)
partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
return partition
def run_master(args):
torch.manual_seed(100)
device = args.device
stage_num = args.world_size
chunk = args.chunk
actual_stage_num = stage_num * chunk
use_checkpoint = args.use_checkpoint
num_microbatches = args.num_microbatches
sample_num = 1024
batch_size = 1024
assert sample_num % batch_size == 0
input_sample = torch.randn((sample_num, feat_num), device=device)
engine = OneFOneBPipelineEngine(partition_fn=partition,
stage_num=stage_num,
num_microbatches=num_microbatches,
device=device,
chunk=chunk,
checkpoint=use_checkpoint)
forward_result = engine.forward_backward(input_sample)
cuda_rpc_result = []
single_result = []
actual_stage_num = engine._get_actual_stage_num()
# compute forward result and backward grad of parameters in cuda rpc
cuda_rpc_result.append(sum(forward_result[0]))
grad = engine.remote_grad()
for stage_id in range(actual_stage_num):
for p in grad[stage_id]:
cuda_rpc_result.append(p)
# compute forward result and backward grad of parameters just in rank_0
test_model = nn.Sequential(
*[partition(pp_rank, chunk, actual_stage_num) for pp_rank in range(actual_stage_num)]).to(device)
input_sample = input_sample.requires_grad_()
out_val = test_model(input_sample).sum()
autograd.backward(out_val)
single_result.append(out_val)
for p in test_model.parameters():
single_result.append(p.grad)
assert len(cuda_rpc_result) == len(single_result)
for r_c, r_s in zip(cuda_rpc_result, single_result):
assert_close(r_c, r_s, 0.001, 0.001)
if __name__ == "__main__":
args = parse_args()
rpc_run(args, run_master)


@@ -1,145 +0,0 @@
import os
from functools import partial
import pytest
import torch
import torch.distributed.rpc as rpc
from rpc_test_utils import DAG_MLP, MLP
from torch._C._distributed_rpc import _is_current_rpc_agent_set
from colossalai import launch
from colossalai.fx import ColoTracer
from colossalai.fx.passes.adding_split_node_pass import balanced_split_pass, split_with_split_nodes_pass
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.middleware.adaptor import get_fx_topology
from colossalai.pipeline.pipeline_process_group import ppg
from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
# global shapes shared by the test models
batch_size = 16
dim = 10
rpc_is_initialized = _is_current_rpc_agent_set
def create_partition_module(pp_rank: int, stage_num: int, model, data_kwargs):
model.eval()
tracer = ColoTracer()
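    # trace with meta tensors so that tracing allocates no real device memory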
meta_args = {k: v.to('meta') for k, v in data_kwargs.items()}
graph = tracer.trace(root=model, meta_args=meta_args)
gm = torch.fx.GraphModule(model, graph, model.__class__.__name__)
annotated_model = balanced_split_pass(gm, stage_num)
top_module, split_submodules = split_with_split_nodes_pass(annotated_model, merge_output=True)
topo = get_fx_topology(top_module)
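    # attach the pipeline topology to every stage submodule so each stage can
    # locate its upstream and downstream peers at runtime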
for submodule in split_submodules:
if isinstance(submodule, torch.fx.GraphModule):
setattr(submodule, '_topo', topo)
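    # index 0 appears to hold the root graph module, so stage pp_rank lives at pp_rank + 1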
return split_submodules[pp_rank + 1]
def partition(model, data_kwargs: dict, pp_rank: int, chunk: int, stage_num: int):
torch.manual_seed(1024)
partition = create_partition_module(pp_rank, stage_num, model, data_kwargs)
return partition
def run_master(model_cls, world_size, forward_only):
torch.manual_seed(100)
epoch = 3
device = 'cuda'
stage_num = world_size
chunk = 1
num_microbatches = 8
    use_checkpoint = True
if model_cls == MLP:
def data_gen():
x = torch.zeros((batch_size, dim))
kwargs = dict(x=x)
return kwargs
model = model_cls(dim, stage_num * 3)
if forward_only:
labels = None
else:
labels = 1
elif model_cls == DAG_MLP:
def data_gen():
x = torch.zeros((batch_size, dim))
y = torch.zeros((batch_size, dim))
kwargs = dict(x=x, y=y)
return kwargs
model = model_cls(dim, stage_num * 3)
if forward_only:
labels = None
else:
labels = 1
    else:
        raise ValueError(f"unsupported model class: {model_cls}")
data_kwargs = data_gen()
engine = OneFOneBPipelineEngine(
partition_fn=partial(partition, model, data_kwargs),
stage_num=stage_num,
num_microbatches=num_microbatches,
device=device,
chunk=chunk,
checkpoint=use_checkpoint,
)
if not forward_only:
        engine.initialize_optimizer(torch.optim.SGD, lr=1e-3)
for _ in range(epoch):
input_x = torch.randn((batch_size, dim), device=device)
input_y = torch.randn((batch_size, dim), device=device)
logits = engine.forward_backward({'x': input_x, 'y': input_y}, labels=labels, forward_only=forward_only)
def run_worker(rank, world_size, port, model_cls, forward_only, master_func):
master_addr = 'localhost'
master_port = 29020
os.environ['MASTER_ADDR'] = master_addr
os.environ['MASTER_PORT'] = str(master_port)
disable_existing_loggers()
launch(dict(), rank, world_size, master_addr, master_port, 'nccl', verbose=False)
ppg.set_global_info(rank=rank,
world_size=world_size,
dp_degree=1,
tp_degree=1,
num_worker_threads=128,
device='cuda')
    # in RPC mode, only rank 0 runs the master function; other ranks just serve RPC requests
if rank == 0:
master_func(model_cls, world_size, forward_only)
    # rpc.shutdown() blocks until every process has finished, acting as a barrier
if rpc_is_initialized():
rpc.shutdown()
@pytest.mark.skip("skip due to CI torch version 1.11")
@parameterize('model_cls', [MLP, DAG_MLP])
@parameterize('forward_only', [True, False])
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_pp_middleware_fwd(model_cls, forward_only):
world_size = 4
master_func = run_master
spawn(
run_worker,
world_size,
model_cls=model_cls,
forward_only=forward_only,
master_func=master_func,
)
if __name__ == "__main__":
test_pp_middleware_fwd()


@@ -1,59 +0,0 @@
import pytest
import torch
from colossalai.pipeline.pipelinable import PipelinableContext
from colossalai.testing import rerun_if_address_is_in_use, rerun_on_exception, spawn
NUM_CHUNKS = 1
PIPELINE_SIZE = 2
class MLP(torch.nn.Module):
def __init__(self, dim: int = 256):
super().__init__()
intermediate_dim = dim * 4
self.dense_1 = torch.nn.Linear(dim, intermediate_dim)
self.activation = torch.nn.GELU()
self.dense_2 = torch.nn.Linear(intermediate_dim, dim)
self.dropout = torch.nn.Dropout(0.1)
def forward(self, x):
x = self.dense_1(x)
x = self.activation(x)
x = self.dense_2(x)
x = self.dropout(x)
return x
def run_pipelinable(rank, world_size, port):
pipelinable = PipelinableContext()
with pipelinable:
model = MLP()
assert pipelinable.policy == "balanced"
pipelinable.policy = "uniform"
assert pipelinable.policy == "uniform"
pipelinable.to_layer_list()
assert pipelinable.layers_count == len(list(model.children()))
pipeline_model_part_0 = pipelinable.partition(NUM_CHUNKS, PIPELINE_SIZE, 0)
assert isinstance(pipeline_model_part_0, torch.nn.Module)
pipeline_model_part_1 = pipelinable.partition(NUM_CHUNKS, PIPELINE_SIZE, 1)
assert isinstance(pipeline_model_part_1, torch.nn.Module)
layers_count_in_part_0 = len(list(pipeline_model_part_0._module_list))
layers_count_in_part_1 = len(list(pipeline_model_part_1._module_list))
assert layers_count_in_part_0 + layers_count_in_part_1 == pipelinable.layers_count
@pytest.mark.skip(reason="this is useless")
@rerun_if_address_is_in_use()
def test_pipelinable():
spawn(run_pipelinable, 1)
if __name__ == '__main__':
test_pipelinable()


@@ -1,42 +0,0 @@
import os
import torch.distributed.rpc as rpc
from rpc_test_utils import pg_parse_args, rpc_is_initialized
from colossalai.initialize import launch
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.pipeline_process_group import ppg
from colossalai.testing import spawn
def run_worker(rank, args):
os.environ['MASTER_ADDR'] = args.master_addr
os.environ['MASTER_PORT'] = args.master_port
device = args.device
world_size = args.world_size
dp_degree = args.dp_degree
tp_degree = args.tp_degree
num_worker_threads = args.num_worker_threads
host = args.master_addr
port = args.master_port
backend = 'nccl' if device == 'cuda' else 'gloo'
disable_existing_loggers()
launch(dict(), rank, world_size, host, int(port), backend, verbose=False)
ppg.set_global_info(rank=rank,
world_size=world_size,
dp_degree=dp_degree,
tp_degree=tp_degree,
num_worker_threads=num_worker_threads,
device=device)
if rpc_is_initialized():
rpc.shutdown()
if __name__ == "__main__":
args = pg_parse_args()
world_size = args.world_size
spawn(run_worker, world_size, args=args)