mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-12-02 19:56:09 +00:00
[pipeline/rpc] update outstanding mechanism | optimize dispatching strategy (#1497)
* support p2p communication with any type of object | pass test * reconstruct pipeline schedule with p2p_v2.py(support communication with List[Any]) | pass test * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [pipeline/rpc] implement a demo for PP with cuda rpc framework * [pipeline/rpc] support interleaving | fix checkpoint bug | change logic when dispatch data in work_list to ensure steady 1F1B * [pipeline/rpc] implement distributed optimizer | test with assert_close * [pipeline/rpc] implement distributed optimizer | test with assert_close * [pipeline/rpc] update outstanding mechanism | optimize dispatching strategy * [pipeline/rpc] update outstanding mechanism | optimize dispatching strategy * [pipeline/rpc] update outstanding mechanism | optimize dispatching strategy
This commit is contained in:
@@ -5,13 +5,9 @@ import torch
|
||||
from torch import nn
|
||||
import torch.multiprocessing as mp
|
||||
import torch.distributed.rpc as rpc
|
||||
from torch import autograd
|
||||
from torch.optim import SGD, Adam, RMSprop, Optimizer
|
||||
from colorama import Back, Style
|
||||
|
||||
from colossalai.pipeline.rpc.PipelineBase import FillDrainPipelineEngine, OneFOneBPipelineEngine
|
||||
from colossalai.testing import assert_close
|
||||
|
||||
|
||||
def color_debug(text, prefix=' ', color='blue'):
|
||||
color = color.upper()
|
||||
@@ -43,13 +39,13 @@ class RpcTestModel(nn.Module):
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--epoch', type=int, default=1)
|
||||
parser.add_argument('--world_size', type=int, default=2)
|
||||
parser.add_argument('--num_microbatches', type=int, default=2)
|
||||
parser.add_argument('--chunk', type=int, default=1)
|
||||
parser.add_argument('--use_checkpoint', action='store_true')
|
||||
parser.add_argument('--use_interleave', action='store_true')
|
||||
parser.add_argument('--optimizer', type=str, choices=['SGD', 'Adam', 'RMSprop'], default='SGD')
|
||||
parser.add_argument('--device', type=str, default='cuda')
|
||||
parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda')
|
||||
parser.add_argument('--master_addr', type=str, default='localhost')
|
||||
parser.add_argument('--master_port', type=str, default='29020')
|
||||
parser.add_argument('--num_worker_threads', type=str, default=128)
|
||||
|
||||
Reference in New Issue
Block a user