[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692)

* [legacy] remove cli of benchmark and update optim (#4690)

* [legacy] remove cli of benchmark and update optim

* [doc] fix cli doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
Hongxin Liu
2023-09-18 16:31:06 +08:00
committed by GitHub
parent 32e7f99416
commit b5f9e37c70
342 changed files with 2919 additions and 4182 deletions


@@ -1,150 +0,0 @@
import argparse
import os
import warnings

import torch
import torch.distributed as dist
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from colorama import Back, Style
from torch import nn
from torch._C._distributed_rpc import _is_current_rpc_agent_set
from torch.optim import SGD, Adam, Optimizer, RMSprop

from colossalai import launch
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.pipeline_process_group import ppg

# torch exposes no public API for this check, so reuse the private helper
rpc_is_initialized = _is_current_rpc_agent_set
def color_debug(text, prefix=' ', color='blue'):
color = color.upper()
print(getattr(Back, color), prefix, Style.RESET_ALL, text)
class MLP(nn.Module):
def __init__(self, dim: int, layers: int):
super().__init__()
self.layers = torch.nn.ModuleList()
for _ in range(layers):
self.layers.append(nn.Linear(dim, dim, bias=False))
def forward(self, x):
for layer in self.layers:
x = layer(x)
return x.sum()
class DAG_MLP(nn.Module):
def __init__(self, dim: int, layers: int):
super().__init__()
self.layers = torch.nn.ModuleList()
self.dag_layer = nn.Linear(dim, dim, bias=False)
for _ in range(layers):
self.layers.append(nn.Linear(dim, dim, bias=False))
def forward(self, x, y):
for layer in self.layers:
x = layer(x)
y = self.dag_layer(y)
return x.sum(), y.sum()
class RpcTestModel(nn.Module):
def __init__(self, stage_id, actual_stage_num, feat_num, h) -> None:
super().__init__()
self.rank = stage_id
self.is_last_rank = stage_id == actual_stage_num - 1
self.linear_name = f'linear_{stage_id}'
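        # stage 0 maps features to the hidden size, the last stage maps hidden
        # to a scalar output, and every intermediate stage is a square projection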
if stage_id == 0:
linear = nn.Linear(feat_num, h)
elif stage_id == actual_stage_num - 1:
linear = nn.Linear(h, 1)
else:
linear = nn.Linear(h, h)
setattr(self, self.linear_name, linear)
def forward(self, x) -> torch.Tensor:
linear: nn.Module = getattr(self, self.linear_name)
out: torch.Tensor = linear(x)
if self.is_last_rank:
out = out.sum()
return out
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--epoch', type=int, default=1)
parser.add_argument('--world_size', type=int, default=2)
parser.add_argument('--batch_size', type=int, default=16)
parser.add_argument('--dp_degree', type=int, default=1)
parser.add_argument('--tp_degree', type=int, default=1)
parser.add_argument('--num_microbatches', type=int, default=2)
parser.add_argument('--chunk', type=int, default=1)
parser.add_argument('--use_checkpoint', action='store_true')
parser.add_argument('--optimizer', type=str, choices=['SGD', 'Adam', 'RMSprop'], default='SGD')
parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda')
parser.add_argument('--master_addr', type=str, default='localhost')
parser.add_argument('--master_port', type=str, default='29020')
    parser.add_argument('--num_worker_threads', type=int, default=128)
return parser.parse_args()
def pg_parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--world_size', type=int, default=4)
parser.add_argument('--dp_degree', type=int, default=2)
parser.add_argument('--tp_degree', type=int, default=1)
parser.add_argument('--chunk', type=int, default=1)
    parser.add_argument('--num_worker_threads', type=int, default=128)
parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda')
parser.add_argument('--master_addr', type=str, default='localhost')
parser.add_argument('--master_port', type=str, default='29020')
return parser.parse_args()
def run_worker(rank, args, master_func):
os.environ['MASTER_ADDR'] = args.master_addr
os.environ['MASTER_PORT'] = args.master_port
device = args.device
world_size = args.world_size
dp_degree = args.dp_degree
tp_degree = args.tp_degree
num_worker_threads = args.num_worker_threads
host = args.master_addr
port = args.master_port
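    # NCCL only supports CUDA tensors, so fall back to Gloo on CPU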
backend = 'nccl' if device == 'cuda' else 'gloo'
disable_existing_loggers()
launch(dict(), rank, world_size, host, int(port), backend, verbose=False)
ppg.set_global_info(rank=rank,
world_size=world_size,
dp_degree=dp_degree,
tp_degree=tp_degree,
num_worker_threads=num_worker_threads,
device=device)
    # in RPC mode, only rank 0 runs the master function; other ranks just serve RPC requests
if rank == 0:
master_func(args)
    # rpc.shutdown() blocks until every process has finished, acting as a barrier
if rpc_is_initialized():
rpc.shutdown()
else:
warnings.warn("RPC has not been initialized")
def rpc_run(args, master_func):
world_size = args.world_size
assert args.num_microbatches >= args.world_size, "num_microbatches cannot be fewer than world_size!"
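    # one process per rank; rank 0 additionally drives master_func (see run_worker)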
mp.spawn(run_worker, args=(args, master_func), nprocs=world_size)


@@ -1,80 +0,0 @@
import torch
from torch import nn
import torch.autograd as autograd
from colossalai.pipeline.rpc import ChimeraPipelineEngine
from colossalai.testing import assert_close
from rpc_test_utils import rpc_run, parse_args, RpcTestModel
# global dimensions shared by every model partition
feat_num = 100
h = 100
def partition(pp_rank: int, chunk: int, stage_num: int):
torch.manual_seed(1024)
partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
return partition
def run_master(args):
torch.manual_seed(100)
epoch = args.epoch
device = args.device
stage_num = args.world_size
chunk = 1
num_microbatches = args.num_microbatches
use_checkpoint = False
sample_num = 1024
batch_size = 1024
assert sample_num % batch_size == 0
engine = ChimeraPipelineEngine(partition_fn=partition,
stage_num=stage_num,
num_microbatches=num_microbatches,
device=device,
checkpoint=use_checkpoint)
engine.initialize_optimizer(torch.optim.Adam, lr=1e-3)
input_sample = torch.randn((sample_num, feat_num), device=device)
forward_result = engine.forward_backward(input_sample)
cuda_rpc_result = []
single_result = []
actual_stage_num = engine._get_actual_stage_num()
# compute forward result and backward grad of parameters in cuda rpc
cuda_rpc_result.append(sum(forward_result[0]))
grad = engine.remote_grad()
for stage_id in range(actual_stage_num):
for p in grad[stage_id]:
cuda_rpc_result.append(p)
# compute forward result and backward grad of parameters just in rank_0
test_model = nn.Sequential(
*[partition(pp_rank, chunk, actual_stage_num) for pp_rank in range(actual_stage_num)]).to(device)
# input_sample = input_sample[len(input_sample) // 2:]
input_sample = input_sample.requires_grad_()
out_val = test_model(input_sample).sum()
autograd.backward(out_val)
single_result.append(out_val)
for p in test_model.parameters():
single_result.append(p.grad)
# print("my")
# print(cuda_rpc_result[1])
# print("answer:")
# print(single_result[1])
# assert len(cuda_rpc_result) == len(single_result)
# for r_c, r_s in zip(cuda_rpc_result, single_result):
# assert_close(r_c, r_s, 0.001, 0.001)
if __name__ == "__main__":
args = parse_args()
rpc_run(args, run_master)


@@ -1,81 +0,0 @@
import torch
from torch import nn
from torch import autograd
from torch.optim import SGD, Adam, RMSprop, Optimizer
from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
from colossalai.testing import assert_close
from rpc_test_utils import rpc_run, parse_args, RpcTestModel
# global dimensions shared by every model partition
feat_num = 100
h = 100
def partition(pp_rank: int, chunk: int, stage_num: int):
torch.manual_seed(1024)
partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
return partition
def run_master(args):
torch.manual_seed(100)
device = args.device
stage_num = args.world_size
chunk = args.chunk
actual_stage_num = stage_num * chunk
use_checkpoint = args.use_checkpoint
num_microbatches = args.num_microbatches
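    # the optimizer classes imported at module level are looked up by name here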
optimizer_class = globals()[args.optimizer]
lr = 1e-3
sample_num = 1024
batch_size = 1024
assert sample_num % batch_size == 0
input_sample = torch.randn((sample_num, feat_num), device=device)
engine = OneFOneBPipelineEngine(partition_fn=partition,
stage_num=stage_num,
num_microbatches=num_microbatches,
device=device,
chunk=chunk,
checkpoint=use_checkpoint)
engine.initialize_optimizer(optimizer_class, lr=lr)
_ = engine.forward_backward(input_sample)
cuda_rpc_result = []
single_result = []
actual_stage_num = engine._get_actual_stage_num()
# compute parameters after updating in cuda rpc
parameters = engine.remote_parameters()
for stage_id in range(actual_stage_num):
for p in parameters[stage_id]:
cuda_rpc_result.append(p)
# compute forward result and backward grad of parameters just in rank_0
test_model = nn.Sequential(
*[partition(pp_rank, chunk, actual_stage_num) for pp_rank in range(actual_stage_num)]).to(device)
optimizer: Optimizer = optimizer_class(test_model.parameters(), lr=lr)
input_sample = input_sample.requires_grad_()
out_val = test_model(input_sample).sum()
autograd.backward(out_val)
optimizer.step()
optimizer.zero_grad()
for p in test_model.parameters():
single_result.append(p)
assert len(cuda_rpc_result) == len(single_result)
for r_c, r_s in zip(cuda_rpc_result, single_result):
assert_close(r_c, r_s, 0.001, 0.001)
if __name__ == "__main__":
args = parse_args()
rpc_run(args, run_master)


@@ -1,48 +0,0 @@
import torch
from torch import nn
from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
from rpc_test_utils import rpc_run, parse_args, RpcTestModel
# global dimensions shared by every model partition
feat_num = 100
h = 100
def partition(pp_rank: int, chunk: int, stage_num: int):
torch.manual_seed(1024)
partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
return partition
def run_master(args):
torch.manual_seed(100)
epoch = args.epoch
device = args.device
stage_num = args.world_size
chunk = args.chunk
num_microbatches = args.num_microbatches
use_checkpoint = args.use_checkpoint
sample_num = 1024
batch_size = 1024
assert sample_num % batch_size == 0
input_sample = torch.randn((sample_num, feat_num), device=device)
engine = OneFOneBPipelineEngine(partition_fn=partition,
stage_num=stage_num,
num_microbatches=num_microbatches,
device=device,
chunk=chunk,
checkpoint=use_checkpoint)
for _ in range(epoch):
_ = engine.forward_backward(input_sample, forward_only=False)
if __name__ == "__main__":
args = parse_args()
rpc_run(args, run_master)


@@ -1,73 +0,0 @@
import torch
from torch import nn
from torch import autograd
from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
from colossalai.testing import assert_close
from rpc_test_utils import rpc_run, parse_args, RpcTestModel
feat_num = 100
h = 100
def partition(pp_rank: int, chunk: int, stage_num: int):
torch.manual_seed(1024)
partition = RpcTestModel(pp_rank, stage_num, feat_num, h)
return partition
def run_master(args):
torch.manual_seed(100)
device = args.device
stage_num = args.world_size
chunk = args.chunk
actual_stage_num = stage_num * chunk
use_checkpoint = args.use_checkpoint
num_microbatches = args.num_microbatches
sample_num = 1024
batch_size = 1024
assert sample_num % batch_size == 0
input_sample = torch.randn((sample_num, feat_num), device=device)
engine = OneFOneBPipelineEngine(partition_fn=partition,
stage_num=stage_num,
num_microbatches=num_microbatches,
device=device,
chunk=chunk,
checkpoint=use_checkpoint)
forward_result = engine.forward_backward(input_sample)
cuda_rpc_result = []
single_result = []
actual_stage_num = engine._get_actual_stage_num()
# compute forward result and backward grad of parameters in cuda rpc
cuda_rpc_result.append(sum(forward_result[0]))
grad = engine.remote_grad()
for stage_id in range(actual_stage_num):
for p in grad[stage_id]:
cuda_rpc_result.append(p)
# compute forward result and backward grad of parameters just in rank_0
test_model = nn.Sequential(
*[partition(pp_rank, chunk, actual_stage_num) for pp_rank in range(actual_stage_num)]).to(device)
input_sample = input_sample.requires_grad_()
out_val = test_model(input_sample).sum()
autograd.backward(out_val)
single_result.append(out_val)
for p in test_model.parameters():
single_result.append(p.grad)
assert len(cuda_rpc_result) == len(single_result)
for r_c, r_s in zip(cuda_rpc_result, single_result):
assert_close(r_c, r_s, 0.001, 0.001)
if __name__ == "__main__":
args = parse_args()
rpc_run(args, run_master)


@@ -1,145 +0,0 @@
import os
from functools import partial
import pytest
import torch
import torch.distributed.rpc as rpc
from rpc_test_utils import DAG_MLP, MLP
from torch._C._distributed_rpc import _is_current_rpc_agent_set
from colossalai import launch
from colossalai.fx import ColoTracer
from colossalai.fx.passes.adding_split_node_pass import balanced_split_pass, split_with_split_nodes_pass
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.middleware.adaptor import get_fx_topology
from colossalai.pipeline.pipeline_process_group import ppg
from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
# global shapes shared by the test models
batch_size = 16
dim = 10
rpc_is_initialized = _is_current_rpc_agent_set
def create_partition_module(pp_rank: int, stage_num: int, model, data_kwargs):
model.eval()
tracer = ColoTracer()
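    # trace with meta tensors so that tracing allocates no real device memory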
meta_args = {k: v.to('meta') for k, v in data_kwargs.items()}
graph = tracer.trace(root=model, meta_args=meta_args)
gm = torch.fx.GraphModule(model, graph, model.__class__.__name__)
annotated_model = balanced_split_pass(gm, stage_num)
top_module, split_submodules = split_with_split_nodes_pass(annotated_model, merge_output=True)
topo = get_fx_topology(top_module)
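    # attach the pipeline topology to every stage submodule so each stage can
    # locate its upstream and downstream peers at runtime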
for submodule in split_submodules:
if isinstance(submodule, torch.fx.GraphModule):
setattr(submodule, '_topo', topo)
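    # index 0 appears to hold the root graph module, so stage pp_rank lives at pp_rank + 1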
return split_submodules[pp_rank + 1]
def partition(model, data_kwargs: dict, pp_rank: int, chunk: int, stage_num: int):
torch.manual_seed(1024)
partition = create_partition_module(pp_rank, stage_num, model, data_kwargs)
return partition
def run_master(model_cls, world_size, forward_only):
torch.manual_seed(100)
epoch = 3
device = 'cuda'
stage_num = world_size
chunk = 1
num_microbatches = 8
    use_checkpoint = True
if model_cls == MLP:
def data_gen():
x = torch.zeros((batch_size, dim))
kwargs = dict(x=x)
return kwargs
model = model_cls(dim, stage_num * 3)
if forward_only:
labels = None
else:
labels = 1
elif model_cls == DAG_MLP:
def data_gen():
x = torch.zeros((batch_size, dim))
y = torch.zeros((batch_size, dim))
kwargs = dict(x=x, y=y)
return kwargs
model = model_cls(dim, stage_num * 3)
if forward_only:
labels = None
else:
labels = 1
    else:
        raise ValueError(f"unsupported model class: {model_cls}")
data_kwargs = data_gen()
engine = OneFOneBPipelineEngine(
partition_fn=partial(partition, model, data_kwargs),
stage_num=stage_num,
num_microbatches=num_microbatches,
device=device,
chunk=chunk,
checkpoint=use_checkpoint,
)
if not forward_only:
        engine.initialize_optimizer(torch.optim.SGD, lr=1e-3)
for _ in range(epoch):
input_x = torch.randn((batch_size, dim), device=device)
input_y = torch.randn((batch_size, dim), device=device)
logits = engine.forward_backward({'x': input_x, 'y': input_y}, labels=labels, forward_only=forward_only)
def run_worker(rank, world_size, port, model_cls, forward_only, master_func):
master_addr = 'localhost'
master_port = 29020
os.environ['MASTER_ADDR'] = master_addr
os.environ['MASTER_PORT'] = str(master_port)
disable_existing_loggers()
launch(dict(), rank, world_size, master_addr, master_port, 'nccl', verbose=False)
ppg.set_global_info(rank=rank,
world_size=world_size,
dp_degree=1,
tp_degree=1,
num_worker_threads=128,
device='cuda')
    # in RPC mode, only rank 0 runs the master function; other ranks just serve RPC requests
if rank == 0:
master_func(model_cls, world_size, forward_only)
    # rpc.shutdown() blocks until every process has finished, acting as a barrier
if rpc_is_initialized():
rpc.shutdown()
@pytest.mark.skip("skip due to CI torch version 1.11")
@parameterize('model_cls', [MLP, DAG_MLP])
@parameterize('forward_only', [True, False])
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_pp_middleware_fwd(model_cls, forward_only):
world_size = 4
master_func = run_master
spawn(
run_worker,
world_size,
model_cls=model_cls,
forward_only=forward_only,
master_func=master_func,
)
if __name__ == "__main__":
test_pp_middleware_fwd()


@@ -1,59 +0,0 @@
import pytest
import torch
from colossalai.pipeline.pipelinable import PipelinableContext
from colossalai.testing import rerun_if_address_is_in_use, rerun_on_exception, spawn
NUM_CHUNKS = 1
PIPELINE_SIZE = 2
class MLP(torch.nn.Module):
def __init__(self, dim: int = 256):
super().__init__()
intermediate_dim = dim * 4
self.dense_1 = torch.nn.Linear(dim, intermediate_dim)
self.activation = torch.nn.GELU()
self.dense_2 = torch.nn.Linear(intermediate_dim, dim)
self.dropout = torch.nn.Dropout(0.1)
def forward(self, x):
x = self.dense_1(x)
x = self.activation(x)
x = self.dense_2(x)
x = self.dropout(x)
return x
def run_pipelinable(rank, world_size, port):
pipelinable = PipelinableContext()
with pipelinable:
model = MLP()
assert pipelinable.policy == "balanced"
pipelinable.policy = "uniform"
assert pipelinable.policy == "uniform"
pipelinable.to_layer_list()
assert pipelinable.layers_count == len(list(model.children()))
pipeline_model_part_0 = pipelinable.partition(NUM_CHUNKS, PIPELINE_SIZE, 0)
assert isinstance(pipeline_model_part_0, torch.nn.Module)
pipeline_model_part_1 = pipelinable.partition(NUM_CHUNKS, PIPELINE_SIZE, 1)
assert isinstance(pipeline_model_part_1, torch.nn.Module)
layers_count_in_part_0 = len(list(pipeline_model_part_0._module_list))
layers_count_in_part_1 = len(list(pipeline_model_part_1._module_list))
assert layers_count_in_part_0 + layers_count_in_part_1 == pipelinable.layers_count
@pytest.mark.skip(reason="this is useless")
@rerun_if_address_is_in_use()
def test_pipelinable():
spawn(run_pipelinable, 1)
if __name__ == '__main__':
test_pipelinable()


@@ -1,42 +0,0 @@
import os
import torch.distributed.rpc as rpc
from rpc_test_utils import pg_parse_args, rpc_is_initialized
from colossalai.initialize import launch
from colossalai.logging import disable_existing_loggers
from colossalai.pipeline.pipeline_process_group import ppg
from colossalai.testing import spawn
def run_worker(rank, args):
os.environ['MASTER_ADDR'] = args.master_addr
os.environ['MASTER_PORT'] = args.master_port
device = args.device
world_size = args.world_size
dp_degree = args.dp_degree
tp_degree = args.tp_degree
num_worker_threads = args.num_worker_threads
host = args.master_addr
port = args.master_port
backend = 'nccl' if device == 'cuda' else 'gloo'
disable_existing_loggers()
launch(dict(), rank, world_size, host, int(port), backend, verbose=False)
ppg.set_global_info(rank=rank,
world_size=world_size,
dp_degree=dp_degree,
tp_degree=tp_degree,
num_worker_threads=num_worker_threads,
device=device)
if rpc_is_initialized():
rpc.shutdown()
if __name__ == "__main__":
args = pg_parse_args()
world_size = args.world_size
spawn(run_worker, world_size, args=args)