mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-02 17:46:42 +00:00
[pipeline/pipeline_process_group] finish PipelineProcessGroup to manage local and global rank in TP, DP and PP (#1508)
* support p2p communication with any type of object | pass test * reconstruct pipeline schedule with p2p_v2.py (support communication with List[Any]) | pass test * [engine/schedule] use p2p_v2 to reconstruct pipeline_schedule * [pipeline/rpc] implement a demo for PP with cuda rpc framework * [pipeline/rpc] support interleaving | fix checkpoint bug | change logic when dispatching data in work_list to ensure steady 1F1B * [pipeline/rpc] implement distributed optimizer | test with assert_close * [pipeline/rpc] implement distributed optimizer | test with assert_close * [pipeline/rpc] update outstanding mechanism | optimize dispatching strategy * [pipeline/rpc] update outstanding mechanism | optimize dispatching strategy * [pipeline/rpc] update outstanding mechanism | optimize dispatching strategy * [pipeline/pipeline_process_group] finish PipelineProcessGroup to manage local and global rank in TP, DP and PP * [pipeline/pipeline_process_group] remove comment * [pipeline/pipeline_process_group] remove comment * [pipeline/pipeline_process_group] skip process group test * [pipeline/pipeline_process_group] remove test named function
This commit is contained in:
@@ -1,13 +1,17 @@
|
||||
import os
|
||||
import argparse
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
import torch.multiprocessing as mp
|
||||
import torch.distributed.rpc as rpc
|
||||
from torch.optim import SGD, Adam, RMSprop, Optimizer
|
||||
from torch._C._distributed_rpc import _is_current_rpc_agent_set
|
||||
from colorama import Back, Style
|
||||
|
||||
# Public alias for the private torch helper `_is_current_rpc_agent_set`
# imported above. The visible callers use it as a guard before calling
# rpc.shutdown(). Presumably it returns True once an RPC agent has been set
# for this process -- confirm against the installed torch version, since the
# helper lives in a private module.
rpc_is_initialized = _is_current_rpc_agent_set
|
||||
|
||||
|
||||
def color_debug(text, prefix=' ', color='blue'):
|
||||
color = color.upper()
|
||||
@@ -52,6 +56,19 @@ def parse_args():
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def pg_parse_args():
    """Parse the command-line options used by the process-group test.

    Returns:
        argparse.Namespace: parsed options with these attributes:
            world_size (int, default 4), dp_degree (int, default 2),
            tp_degree (int, default 1), chunk (int, default 1),
            num_worker_threads (int, default 128),
            device ('cpu' or 'cuda', default 'cuda'),
            master_addr (str, default 'localhost'),
            master_port (str, default '29020').
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--world_size', type=int, default=4)
    parser.add_argument('--dp_degree', type=int, default=2)
    parser.add_argument('--tp_degree', type=int, default=1)
    parser.add_argument('--chunk', type=int, default=1)
    # A thread count is numeric. Parsing it as int keeps a value supplied on
    # the command line the same type as the integer default (it used to be
    # parsed as str, so `--num_worker_threads 16` produced the string '16').
    parser.add_argument('--num_worker_threads', type=int, default=128)
    parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda')
    parser.add_argument('--master_addr', type=str, default='localhost')
    # The port stays a string: the worker assigns it to
    # os.environ['MASTER_PORT'] and converts with int() where a number is needed.
    parser.add_argument('--master_port', type=str, default='29020')
    return parser.parse_args()
|
||||
|
||||
|
||||
def run_worker(rank, args, master_func):
|
||||
os.environ['MASTER_ADDR'] = args.master_addr
|
||||
os.environ['MASTER_PORT'] = args.master_port
|
||||
@@ -71,7 +88,10 @@ def run_worker(rank, args, master_func):
|
||||
if rank == 0:
|
||||
master_func(args)
|
||||
# barrier here
|
||||
rpc.shutdown()
|
||||
if rpc_is_initialized():
|
||||
rpc.shutdown()
|
||||
else:
|
||||
warnings.warn("RPC has not been initialized")
|
||||
|
||||
|
||||
def rpc_run(args, master_func):
|
||||
|
43
tests/test_pipeline/test_pipeline_process_group.py
Normal file
43
tests/test_pipeline/test_pipeline_process_group.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import os
|
||||
|
||||
import torch.distributed.rpc as rpc
|
||||
import torch.multiprocessing as mp
|
||||
import pytest
|
||||
|
||||
from colossalai.pipeline.pipeline_process_group import PipelineProcessGroup
|
||||
from colossalai.initialize import launch
|
||||
from colossalai.logging import disable_existing_loggers
|
||||
from rpc_test_utils import pg_parse_args, rpc_is_initialized
|
||||
|
||||
|
||||
def run_worker(rank, args):
    """Entry point executed in each spawned process of the test.

    Exports the rendezvous address/port to the environment, launches the
    distributed context, builds a PipelineProcessGroup for this rank and
    finally shuts RPC down if it was initialized.

    Args:
        rank: index of this process, passed as the rank to `launch` and to
            PipelineProcessGroup.
        args: parsed options providing master_addr, master_port, device,
            world_size, dp_degree, tp_degree and num_worker_threads.
    """
    master_host = args.master_addr
    os.environ['MASTER_ADDR'] = master_host
    master_port = args.master_port
    os.environ['MASTER_PORT'] = master_port

    # Collect every option up front so the rest of the function only deals
    # with plain local values.
    target_device = args.device
    n_procs = args.world_size
    data_parallel = args.dp_degree
    tensor_parallel = args.tp_degree
    rpc_threads = args.num_worker_threads

    # 'nccl' is selected for CUDA runs, 'gloo' for everything else.
    comm_backend = 'gloo' if target_device != 'cuda' else 'nccl'

    disable_existing_loggers()
    launch({}, rank, n_procs, master_host, int(master_port), comm_backend, verbose=False)

    # Keep a reference so the group stays alive until the function returns.
    process_group = PipelineProcessGroup(
        rank=rank,
        world_size=n_procs,
        dp_degree=data_parallel,
        tp_degree=tensor_parallel,
        num_worker_threads=rpc_threads,
        device=target_device,
    )

    # Nothing to tear down when RPC was never brought up.
    if not rpc_is_initialized():
        return
    rpc.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry: parse the options once, then start one worker process per
    # rank; mp.spawn supplies the process index as the first argument of
    # run_worker.
    cli_args = pg_parse_args()
    mp.spawn(run_worker, args=(cli_args,), nprocs=cli_args.world_size)
|
Reference in New Issue
Block a user