Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-01 09:07:51 +00:00)
[legacy] clean up legacy code (#4743)
* [legacy] remove outdated codes of pipeline (#4692)
* [legacy] remove cli of benchmark and update optim (#4690)
* [legacy] remove cli of benchmark and update optim
* [doc] fix cli doc test
* [legacy] fix engine clip grad norm
* [legacy] remove outdated colo tensor (#4694)
* [legacy] remove outdated colo tensor
* [test] fix test import
* [legacy] move outdated zero to legacy (#4696)
* [legacy] clean up utils (#4700)
* [legacy] clean up utils
* [example] update examples
* [legacy] clean up amp
* [legacy] fix amp module
* [legacy] clean up gpc (#4742)
* [legacy] clean up context
* [legacy] clean core, constants and global vars
* [legacy] refactor initialize
* [example] fix examples ci
* [example] fix examples ci
* [legacy] fix tests
* [example] fix gpt example
* [example] fix examples ci
* [devops] fix ci installation
* [example] fix examples ci
@@ -1,150 +0,0 @@
import argparse
import os
import warnings

import torch
import torch.distributed as dist
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from colorama import Back, Style    # Back and Style are used by color_debug below
from torch import nn
from torch._C._distributed_rpc import _is_current_rpc_agent_set
from torch.optim import SGD, Adam, Optimizer, RMSprop

from colossalai import launch
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.pipeline_process_group import ppg

rpc_is_initialized = _is_current_rpc_agent_set


def color_debug(text, prefix=' ', color='blue'):
    color = color.upper()
    print(getattr(Back, color), prefix, Style.RESET_ALL, text)


class MLP(nn.Module):

    def __init__(self, dim: int, layers: int):
        super().__init__()
        self.layers = torch.nn.ModuleList()

        for _ in range(layers):
            self.layers.append(nn.Linear(dim, dim, bias=False))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x.sum()


class DAG_MLP(nn.Module):

    def __init__(self, dim: int, layers: int):
        super().__init__()
        self.layers = torch.nn.ModuleList()
        self.dag_layer = nn.Linear(dim, dim, bias=False)

        for _ in range(layers):
            self.layers.append(nn.Linear(dim, dim, bias=False))

    def forward(self, x, y):
        for layer in self.layers:
            x = layer(x)
        y = self.dag_layer(y)
        return x.sum(), y.sum()


class RpcTestModel(nn.Module):

    def __init__(self, stage_id, actual_stage_num, feat_num, h) -> None:
        super().__init__()
        self.rank = stage_id
        self.is_last_rank = stage_id == actual_stage_num - 1
        self.linear_name = f'linear_{stage_id}'

        if stage_id == 0:
            linear = nn.Linear(feat_num, h)
        elif stage_id == actual_stage_num - 1:
            linear = nn.Linear(h, 1)
        else:
            linear = nn.Linear(h, h)

        setattr(self, self.linear_name, linear)

    def forward(self, x) -> torch.Tensor:
        linear: nn.Module = getattr(self, self.linear_name)
        out: torch.Tensor = linear(x)

        if self.is_last_rank:
            out = out.sum()
        return out


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epoch', type=int, default=1)
    parser.add_argument('--world_size', type=int, default=2)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--dp_degree', type=int, default=1)
    parser.add_argument('--tp_degree', type=int, default=1)
    parser.add_argument('--num_microbatches', type=int, default=2)
    parser.add_argument('--chunk', type=int, default=1)
    parser.add_argument('--use_checkpoint', action='store_true')
    parser.add_argument('--optimizer', type=str, choices=['SGD', 'Adam', 'RMSprop'], default='SGD')
    parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda')
    parser.add_argument('--master_addr', type=str, default='localhost')
    parser.add_argument('--master_port', type=str, default='29020')
    parser.add_argument('--num_worker_threads', type=int, default=128)
    return parser.parse_args()


def pg_parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--world_size', type=int, default=4)
    parser.add_argument('--dp_degree', type=int, default=2)
    parser.add_argument('--tp_degree', type=int, default=1)
    parser.add_argument('--chunk', type=int, default=1)
    parser.add_argument('--num_worker_threads', type=int, default=128)
    parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda')
    parser.add_argument('--master_addr', type=str, default='localhost')
    parser.add_argument('--master_port', type=str, default='29020')
    return parser.parse_args()


def run_worker(rank, args, master_func):
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = args.master_port

    device = args.device
    world_size = args.world_size
    dp_degree = args.dp_degree
    tp_degree = args.tp_degree
    num_worker_threads = args.num_worker_threads
    host = args.master_addr
    port = args.master_port
    backend = 'nccl' if device == 'cuda' else 'gloo'

    disable_existing_loggers()

    launch(dict(), rank, world_size, host, int(port), backend, verbose=False)
    ppg.set_global_info(rank=rank,
                        world_size=world_size,
                        dp_degree=dp_degree,
                        tp_degree=tp_degree,
                        num_worker_threads=num_worker_threads,
                        device=device)

    # in rpc mode, only rank 0 needs to run the master function
    if rank == 0:
        master_func(args)
    # barrier here
    if rpc_is_initialized():
        rpc.shutdown()
    else:
        warnings.warn("RPC has not been initialized")


def rpc_run(args, master_func):
    world_size = args.world_size
    assert args.num_microbatches >= args.world_size, "num_microbatches cannot be fewer than world_size!"
    mp.spawn(run_worker, args=(args, master_func), nprocs=world_size)
@@ -1,80 +0,0 @@
import torch
from torch import nn
import torch.autograd as autograd

from colossalai.pipeline.rpc import ChimeraPipelineEngine
from colossalai.testing import assert_close
from rpc_test_utils import rpc_run, parse_args, RpcTestModel

# global variable for model created
feat_num = 100
h = 100


def partition(pp_rank: int, chunk: int, stage_num: int):
    torch.manual_seed(1024)
    partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
    return partition


def run_master(args):
    torch.manual_seed(100)

    epoch = args.epoch
    device = args.device
    stage_num = args.world_size
    chunk = 1
    num_microbatches = args.num_microbatches
    use_checkpoint = False

    sample_num = 1024
    batch_size = 1024

    assert sample_num % batch_size == 0

    engine = ChimeraPipelineEngine(partition_fn=partition,
                                   stage_num=stage_num,
                                   num_microbatches=num_microbatches,
                                   device=device,
                                   checkpoint=use_checkpoint)
    engine.initialize_optimizer(torch.optim.Adam, lr=1e-3)

    input_sample = torch.randn((sample_num, feat_num), device=device)

    forward_result = engine.forward_backward(input_sample)

    cuda_rpc_result = []
    single_result = []
    actual_stage_num = engine._get_actual_stage_num()

    # compute forward result and backward grad of parameters in cuda rpc
    cuda_rpc_result.append(sum(forward_result[0]))
    grad = engine.remote_grad()
    for stage_id in range(actual_stage_num):
        for p in grad[stage_id]:
            cuda_rpc_result.append(p)

    # compute forward result and backward grad of parameters just in rank_0
    test_model = nn.Sequential(
        *[partition(pp_rank, chunk, actual_stage_num) for pp_rank in range(actual_stage_num)]).to(device)
    input_sample = input_sample.requires_grad_()
    out_val = test_model(input_sample).sum()
    autograd.backward(out_val)
    single_result.append(out_val)
    for p in test_model.parameters():
        single_result.append(p.grad)

    # comparison against the single-process reference is disabled for this engine
    # assert len(cuda_rpc_result) == len(single_result)
    # for r_c, r_s in zip(cuda_rpc_result, single_result):
    #     assert_close(r_c, r_s, 0.001, 0.001)


if __name__ == "__main__":
    args = parse_args()
    rpc_run(args, run_master)
@@ -1,81 +0,0 @@
import torch
from torch import nn
from torch import autograd
from torch.optim import SGD, Adam, RMSprop, Optimizer

from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
from colossalai.testing import assert_close
from rpc_test_utils import rpc_run, parse_args, RpcTestModel

# global variable for model created
feat_num = 100
h = 100


def partition(pp_rank: int, chunk: int, stage_num: int):
    torch.manual_seed(1024)
    partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
    return partition


def run_master(args):
    torch.manual_seed(100)

    device = args.device
    stage_num = args.world_size
    chunk = args.chunk
    actual_stage_num = stage_num * chunk
    use_checkpoint = args.use_checkpoint
    num_microbatches = args.num_microbatches
    optimizer_class = globals()[args.optimizer]

    lr = 1e-3
    sample_num = 1024
    batch_size = 1024

    assert sample_num % batch_size == 0

    input_sample = torch.randn((sample_num, feat_num), device=device)

    engine = OneFOneBPipelineEngine(partition_fn=partition,
                                    stage_num=stage_num,
                                    num_microbatches=num_microbatches,
                                    device=device,
                                    chunk=chunk,
                                    checkpoint=use_checkpoint)

    engine.initialize_optimizer(optimizer_class, lr=lr)

    _ = engine.forward_backward(input_sample)

    cuda_rpc_result = []
    single_result = []
    actual_stage_num = engine._get_actual_stage_num()

    # compute parameters after updating in cuda rpc
    parameters = engine.remote_parameters()
    for stage_id in range(actual_stage_num):
        for p in parameters[stage_id]:
            cuda_rpc_result.append(p)

    # compute forward result and backward grad of parameters just in rank_0
    test_model = nn.Sequential(
        *[partition(pp_rank, chunk, actual_stage_num) for pp_rank in range(actual_stage_num)]).to(device)
    optimizer: Optimizer = optimizer_class(test_model.parameters(), lr=lr)
    input_sample = input_sample.requires_grad_()
    out_val = test_model(input_sample).sum()
    autograd.backward(out_val)
    optimizer.step()
    optimizer.zero_grad()

    for p in test_model.parameters():
        single_result.append(p)

    assert len(cuda_rpc_result) == len(single_result)
    for r_c, r_s in zip(cuda_rpc_result, single_result):
        assert_close(r_c, r_s, 0.001, 0.001)


if __name__ == "__main__":
    args = parse_args()
    rpc_run(args, run_master)
@@ -1,48 +0,0 @@
import torch
from torch import nn

from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
from rpc_test_utils import rpc_run, parse_args, RpcTestModel

# global variable for model created
feat_num = 100
h = 100


def partition(pp_rank: int, chunk: int, stage_num: int):
    torch.manual_seed(1024)
    partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
    return partition


def run_master(args):
    torch.manual_seed(100)

    epoch = args.epoch
    device = args.device
    stage_num = args.world_size
    chunk = args.chunk
    num_microbatches = args.num_microbatches
    use_checkpoint = args.use_checkpoint

    sample_num = 1024
    batch_size = 1024

    assert sample_num % batch_size == 0

    input_sample = torch.randn((sample_num, feat_num), device=device)

    engine = OneFOneBPipelineEngine(partition_fn=partition,
                                    stage_num=stage_num,
                                    num_microbatches=num_microbatches,
                                    device=device,
                                    chunk=chunk,
                                    checkpoint=use_checkpoint)

    for _ in range(epoch):
        _ = engine.forward_backward(input_sample, forward_only=False)


if __name__ == "__main__":
    args = parse_args()
    rpc_run(args, run_master)
@@ -1,73 +0,0 @@
import torch
from torch import nn
from torch import autograd

from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
from colossalai.testing import assert_close
from rpc_test_utils import rpc_run, parse_args, RpcTestModel

feat_num = 100
h = 100


def partition(pp_rank: int, chunk: int, stage_num: int):
    torch.manual_seed(1024)
    partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
    return partition


def run_master(args):
    torch.manual_seed(100)

    device = args.device
    stage_num = args.world_size
    chunk = args.chunk
    actual_stage_num = stage_num * chunk
    use_checkpoint = args.use_checkpoint
    num_microbatches = args.num_microbatches

    sample_num = 1024
    batch_size = 1024

    assert sample_num % batch_size == 0

    input_sample = torch.randn((sample_num, feat_num), device=device)

    engine = OneFOneBPipelineEngine(partition_fn=partition,
                                    stage_num=stage_num,
                                    num_microbatches=num_microbatches,
                                    device=device,
                                    chunk=chunk,
                                    checkpoint=use_checkpoint)

    forward_result = engine.forward_backward(input_sample)

    cuda_rpc_result = []
    single_result = []
    actual_stage_num = engine._get_actual_stage_num()

    # compute forward result and backward grad of parameters in cuda rpc
    cuda_rpc_result.append(sum(forward_result[0]))
    grad = engine.remote_grad()
    for stage_id in range(actual_stage_num):
        for p in grad[stage_id]:
            cuda_rpc_result.append(p)

    # compute forward result and backward grad of parameters just in rank_0
    test_model = nn.Sequential(
        *[partition(pp_rank, chunk, actual_stage_num) for pp_rank in range(actual_stage_num)]).to(device)
    input_sample = input_sample.requires_grad_()
    out_val = test_model(input_sample).sum()
    autograd.backward(out_val)
    single_result.append(out_val)
    for p in test_model.parameters():
        single_result.append(p.grad)

    assert len(cuda_rpc_result) == len(single_result)
    for r_c, r_s in zip(cuda_rpc_result, single_result):
        assert_close(r_c, r_s, 0.001, 0.001)


if __name__ == "__main__":
    args = parse_args()
    rpc_run(args, run_master)
@@ -1,145 +0,0 @@
import os
from functools import partial

import pytest
import torch
import torch.distributed.rpc as rpc
from rpc_test_utils import DAG_MLP, MLP
from torch._C._distributed_rpc import _is_current_rpc_agent_set

from colossalai import launch
from colossalai.fx import ColoTracer
from colossalai.fx.passes.adding_split_node_pass import balanced_split_pass, split_with_split_nodes_pass
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.middleware.adaptor import get_fx_topology
from colossalai.pipeline.pipeline_process_group import ppg
from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn

# global variable for model created
batch_size = 16
dim = 10
rpc_is_initialized = _is_current_rpc_agent_set


def create_partition_module(pp_rank: int, stage_num: int, model, data_kwargs):
    model.eval()
    tracer = ColoTracer()
    meta_args = {k: v.to('meta') for k, v in data_kwargs.items()}
    graph = tracer.trace(root=model, meta_args=meta_args)
    gm = torch.fx.GraphModule(model, graph, model.__class__.__name__)
    annotated_model = balanced_split_pass(gm, stage_num)
    top_module, split_submodules = split_with_split_nodes_pass(annotated_model, merge_output=True)
    topo = get_fx_topology(top_module)
    for submodule in split_submodules:
        if isinstance(submodule, torch.fx.GraphModule):
            setattr(submodule, '_topo', topo)
    return split_submodules[pp_rank + 1]


def partition(model, data_kwargs: dict, pp_rank: int, chunk: int, stage_num: int):
    torch.manual_seed(1024)
    partition = create_partition_module(pp_rank, stage_num, model, data_kwargs)
    return partition


def run_master(model_cls, world_size, forward_only):
    torch.manual_seed(100)

    epoch = 3
    device = 'cuda'
    stage_num = world_size
    chunk = 1
    num_microbatches = 8
    use_checkpoint = True

    if model_cls == MLP:

        def data_gen():
            x = torch.zeros((batch_size, dim))
            kwargs = dict(x=x)
            return kwargs

        model = model_cls(dim, stage_num * 3)
        if forward_only:
            labels = None
        else:
            labels = 1
    elif model_cls == DAG_MLP:

        def data_gen():
            x = torch.zeros((batch_size, dim))
            y = torch.zeros((batch_size, dim))
            kwargs = dict(x=x, y=y)
            return kwargs

        model = model_cls(dim, stage_num * 3)
        if forward_only:
            labels = None
        else:
            labels = 1
    else:
        pass

    data_kwargs = data_gen()

    engine = OneFOneBPipelineEngine(
        partition_fn=partial(partition, model, data_kwargs),
        stage_num=stage_num,
        num_microbatches=num_microbatches,
        device=device,
        chunk=chunk,
        checkpoint=use_checkpoint,
    )
    if not forward_only:
        engine.initialize_optimizer(getattr(torch.optim, 'SGD'), lr=1e-3)

    for _ in range(epoch):
        input_x = torch.randn((batch_size, dim), device=device)
        input_y = torch.randn((batch_size, dim), device=device)
        logits = engine.forward_backward({'x': input_x, 'y': input_y}, labels=labels, forward_only=forward_only)


def run_worker(rank, world_size, port, model_cls, forward_only, master_func):
    master_addr = 'localhost'
    master_port = 29020
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = str(master_port)

    disable_existing_loggers()

    launch(dict(), rank, world_size, master_addr, master_port, 'nccl', verbose=False)
    ppg.set_global_info(rank=rank,
                        world_size=world_size,
                        dp_degree=1,
                        tp_degree=1,
                        num_worker_threads=128,
                        device='cuda')

    # in rpc mode, only rank 0 needs to run the master function
    if rank == 0:
        master_func(model_cls, world_size, forward_only)
    # barrier here
    if rpc_is_initialized():
        rpc.shutdown()


@pytest.mark.skip("skip due to CI torch version 1.11")
@parameterize('model_cls', [MLP, DAG_MLP])
@parameterize('forward_only', [True, False])
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_pp_middleware_fwd(model_cls, forward_only):
    world_size = 4
    master_func = run_master
    spawn(
        run_worker,
        world_size,
        model_cls=model_cls,
        forward_only=forward_only,
        master_func=master_func,
    )


if __name__ == "__main__":
    test_pp_middleware_fwd()
@@ -1,59 +0,0 @@
import pytest
import torch

from colossalai.pipeline.pipelinable import PipelinableContext
from colossalai.testing import rerun_if_address_is_in_use, rerun_on_exception, spawn

NUM_CHUNKS = 1
PIPELINE_SIZE = 2


class MLP(torch.nn.Module):

    def __init__(self, dim: int = 256):
        super().__init__()
        intermediate_dim = dim * 4
        self.dense_1 = torch.nn.Linear(dim, intermediate_dim)
        self.activation = torch.nn.GELU()
        self.dense_2 = torch.nn.Linear(intermediate_dim, dim)
        self.dropout = torch.nn.Dropout(0.1)

    def forward(self, x):
        x = self.dense_1(x)
        x = self.activation(x)
        x = self.dense_2(x)
        x = self.dropout(x)
        return x


def run_pipelinable(rank, world_size, port):
    pipelinable = PipelinableContext()
    with pipelinable:
        model = MLP()

    assert pipelinable.policy == "balanced"
    pipelinable.policy = "uniform"
    assert pipelinable.policy == "uniform"
    pipelinable.to_layer_list()

    assert pipelinable.layers_count == len(list(model.children()))

    pipeline_model_part_0 = pipelinable.partition(NUM_CHUNKS, PIPELINE_SIZE, 0)
    assert isinstance(pipeline_model_part_0, torch.nn.Module)
    pipeline_model_part_1 = pipelinable.partition(NUM_CHUNKS, PIPELINE_SIZE, 1)
    assert isinstance(pipeline_model_part_1, torch.nn.Module)

    layers_count_in_part_0 = len(list(pipeline_model_part_0._module_list))
    layers_count_in_part_1 = len(list(pipeline_model_part_1._module_list))

    assert layers_count_in_part_0 + layers_count_in_part_1 == pipelinable.layers_count


@pytest.mark.skip(reason="this is useless")
@rerun_if_address_is_in_use()
def test_pipelinable():
    spawn(run_pipelinable, 1)


if __name__ == '__main__':
    test_pipelinable()
@@ -1,42 +0,0 @@
import os

import torch.distributed.rpc as rpc
from rpc_test_utils import pg_parse_args, rpc_is_initialized

from colossalai.initialize import launch
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.pipeline_process_group import ppg
from colossalai.testing import spawn


def run_worker(rank, args):
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = args.master_port

    device = args.device
    world_size = args.world_size
    dp_degree = args.dp_degree
    tp_degree = args.tp_degree
    num_worker_threads = args.num_worker_threads
    host = args.master_addr
    port = args.master_port
    backend = 'nccl' if device == 'cuda' else 'gloo'

    disable_existing_loggers()
    launch(dict(), rank, world_size, host, int(port), backend, verbose=False)

    ppg.set_global_info(rank=rank,
                        world_size=world_size,
                        dp_degree=dp_degree,
                        tp_degree=tp_degree,
                        num_worker_threads=num_worker_threads,
                        device=device)

    if rpc_is_initialized():
        rpc.shutdown()


if __name__ == "__main__":
    args = pg_parse_args()
    world_size = args.world_size
    spawn(run_worker, world_size, args=args)