ColossalAI/tests/test_pipeline/test_pipeline_process_group.py

import os

import pytest
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp

from colossalai.initialize import launch
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.pipeline_process_group import ppg

from rpc_test_utils import pg_parse_args, rpc_is_initialized
def run_worker(rank, args):
    # Rendezvous info must be set in each spawned process before launch().
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = args.master_port

    device = args.device
    world_size = args.world_size
    dp_degree = args.dp_degree
    tp_degree = args.tp_degree
    num_worker_threads = args.num_worker_threads
    host = args.master_addr
    port = args.master_port
    backend = 'nccl' if device == 'cuda' else 'gloo'

    disable_existing_loggers()
    launch(dict(), rank, world_size, host, int(port), backend, verbose=False)
    # Register this rank and the parallel layout with the global pipeline
    # process group singleton.
    ppg.set_global_info(rank=rank,
                        world_size=world_size,
                        dp_degree=dp_degree,
                        tp_degree=tp_degree,
                        num_worker_threads=num_worker_threads,
                        device=device)

    # Tear down RPC cleanly if initialization brought it up.
    if rpc_is_initialized():
        rpc.shutdown()
if __name__ == "__main__":
    args = pg_parse_args()
    world_size = args.world_size
    # Spawn one process per rank; mp.spawn passes the rank as the first
    # argument to run_worker.
    mp.spawn(run_worker, args=(args,), nprocs=world_size)
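
For context, here is a minimal, self-contained sketch of the RPC init/shutdown lifecycle the test above drives through launch() and ppg.set_global_info(), written against only the public torch.distributed.rpc API. The worker-name format, port, and world size below are illustrative assumptions, not values taken from the test.

import os

import torch.distributed.rpc as rpc
import torch.multiprocessing as mp


def _rpc_worker(rank: int, world_size: int):
    # Every rank must agree on the rendezvous address before init_rpc().
    # Address and port here are placeholders for a single-node run.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    rpc.init_rpc(f'worker{rank}', rank=rank, world_size=world_size)
    # ... issue rpc.rpc_sync() / rpc.rpc_async() calls here ...
    rpc.shutdown()    # blocks until all ranks have finished


if __name__ == '__main__':
    world_size = 2    # assumed value for the sketch
    mp.spawn(_rpc_worker, args=(world_size,), nprocs=world_size)

The test delegates this boilerplate to colossalai.initialize.launch and the ppg singleton, which additionally carve the world into data-parallel and tensor-parallel groups according to dp_degree and tp_degree.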