added CI for unit testing (#69)

Frank Lee
2021-12-16 10:32:08 +08:00
committed by GitHub
parent 45355a62f7
commit cd9c28e055
68 changed files with 1089 additions and 766 deletions


@@ -1,232 +0,0 @@
# referenced from Megatron and used to test communication
import os.path as osp
import pytest
import torch
from torch.utils.data import DataLoader
from colossalai.builder import ModelInitializer, build_dataset, build_optimizer, build_loss
from colossalai.communication import p2p as p2p_communication
from colossalai.communication.utils import send_tensor_meta, recv_tensor_meta
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import initialize
from colossalai.utils import print_rank_0, get_current_device
NUM_BATCH = 128
NUM_MICRO = 6
def get_num_microbatches():
return NUM_MICRO
def to_cuda(data):
if isinstance(data, (tuple, list)):
data = data[0].to(get_current_device())
else:
data = data.to(get_current_device())
return data
def step_func(loss):
def _step_func(input_tensor, model):
output = model(input_tensor)
if isinstance(output, (tuple, list)):
if len(output) > 1:
raise NotImplementedError("Multiple output!!!")
else:
output = output[0]
return output, loss
return _step_func
def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_reduced):
"""Forward step for passed-in model.
If first stage, input tensor is obtained from data_iterator, otherwise
passed-in input_tensor is used.
Returns output tensor."""
if input_tensor is None:
        data, label = next(data_iterator)
input_tensor = to_cuda(data)
output_tensor, loss_func = forward_step_func(input_tensor, model)
if gpc.is_last_rank(ParallelMode.PIPELINE):
        data, label = next(data_iterator)
label = to_cuda(label)
output_tensor = loss_func(output_tensor, label) / get_num_microbatches()
losses_reduced.append(output_tensor)
return output_tensor
def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
"""Backward step through passed-in output tensor.
If last stage, output_tensor_grad is None, otherwise gradient of loss
with respect to stage's output tensor.
Returns gradient of loss with respect to input tensor (None if first
stage)."""
# Retain the grad on the input_tensor.
if input_tensor is not None:
input_tensor.retain_grad()
# Backward pass.
torch.autograd.backward(output_tensor, grad_tensors=output_tensor_grad)
# Collect the grad of the input_tensor.
input_tensor_grad = None
if input_tensor is not None:
input_tensor_grad = input_tensor.grad
return input_tensor_grad
def forward_backward_pipelining_without_interleaving(forward_step_func, data_iterator,
model, optimizer, forward_only):
"""Run non-interleaved 1F1B schedule, with communication between pipeline
stages.
Returns dictionary with losses if the last stage, empty dict otherwise."""
# Compute number of warmup microbatches.
num_microbatches = get_num_microbatches()
num_warmup_microbatches = \
(gpc.get_world_size(ParallelMode.PIPELINE) -
gpc.get_local_rank(ParallelMode.PIPELINE) - 1)
num_warmup_microbatches = min(
num_warmup_microbatches,
num_microbatches)
num_microbatches_remaining = \
num_microbatches - num_warmup_microbatches
# Input, output tensors only need to be saved when doing backward passes
input_tensors = None
output_tensors = None
if not forward_only:
input_tensors = []
output_tensors = []
losses_reduced = []
# Used for tensor meta information communication
ft_shape = None
bt_shape = None
fs_checker = True
# Run warmup forward passes.
for i in range(num_warmup_microbatches):
if not gpc.is_first_rank(ParallelMode.PIPELINE):
ft_shape = recv_tensor_meta(ft_shape)
input_tensor = p2p_communication.recv_forward(ft_shape)
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, losses_reduced)
if not gpc.is_last_rank(ParallelMode.PIPELINE):
bt_shape = output_tensor.shape
fs_checker = send_tensor_meta(output_tensor, fs_checker)
p2p_communication.send_forward(output_tensor)
if not forward_only:
input_tensors.append(input_tensor)
output_tensors.append(output_tensor)
# Before running 1F1B, need to receive first forward tensor.
# If all microbatches are run in warmup / cooldown phase, then no need to
# receive this tensor here.
if num_microbatches_remaining > 0:
if not gpc.is_first_rank(ParallelMode.PIPELINE):
ft_shape = recv_tensor_meta(ft_shape)
input_tensor = p2p_communication.recv_forward(ft_shape)
# Run 1F1B in steady state.
for i in range(num_microbatches_remaining):
last_iteration = (i == (num_microbatches_remaining - 1))
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, losses_reduced)
if forward_only:
p2p_communication.send_forward(output_tensor)
if not last_iteration:
input_tensor = p2p_communication.recv_forward(ft_shape)
else:
output_tensor_grad = \
p2p_communication.send_forward_recv_backward(output_tensor, bt_shape)
# Add input_tensor and output_tensor to end of list.
input_tensors.append(input_tensor)
output_tensors.append(output_tensor)
# Pop input_tensor and output_tensor from the start of the list for
# the backward pass.
input_tensor = input_tensors.pop(0)
output_tensor = output_tensors.pop(0)
input_tensor_grad = \
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad)
if last_iteration:
input_tensor = None
p2p_communication.send_backward(input_tensor_grad)
else:
input_tensor = \
p2p_communication.send_backward_recv_forward(input_tensor_grad, ft_shape)
# Run cooldown backward passes.
if not forward_only:
for i in range(num_warmup_microbatches):
input_tensor = input_tensors.pop(0)
output_tensor = output_tensors.pop(0)
output_tensor_grad = p2p_communication.recv_backward(bt_shape)
input_tensor_grad = \
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad)
p2p_communication.send_backward(input_tensor_grad)
return losses_reduced
DIR_PATH = osp.dirname(osp.realpath(__file__))
CONFIG_PATH = osp.join(DIR_PATH, '../configs/pipeline_vanilla_vit.py')
@pytest.mark.skip(reason="This is only for debugging purposes, please ignore this test")
@pytest.mark.dist
def test_schedule():
initialize(CONFIG_PATH)
# build model
model = ModelInitializer(gpc.config.model, 1).model_initialize()
print_rank_0('model is created')
# keep the same sampler for all process
torch.manual_seed(1331)
dataset = build_dataset(gpc.config.data.dataset)
dataloader = DataLoader(dataset=dataset, **gpc.config.data.dataloader)
print_rank_0('train data is created')
# build optimizer and loss
optim = build_optimizer(gpc.config.optimizer, model)
loss = build_loss(gpc.config.loss)
    print_rank_0('optim and loss are created')
forward_backward_pipelining_without_interleaving(
step_func(loss),
iter(dataloader),
model,
optim,
False
)
gpc.destroy()
print_rank_0('training finished')
if __name__ == '__main__':
test_schedule()
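
The schedule above splits the micro-batches of each pipeline stage into a warmup phase and a 1F1B steady state. A minimal standalone sketch of that split (plain Python, no distributed setup; the helper name is just for illustration):

NUM_MICRO = 6  # same value as get_num_microbatches() above

def warmup_split(pipeline_size, stage_rank, num_microbatches=NUM_MICRO):
    # number of warmup forward passes before the 1F1B steady state, as computed above
    num_warmup = min(pipeline_size - stage_rank - 1, num_microbatches)
    return num_warmup, num_microbatches - num_warmup

for rank in range(4):
    print(rank, warmup_split(4, rank))
# 0 (3, 3)
# 1 (2, 4)
# 2 (1, 5)
# 3 (0, 6)   <- the last stage runs every micro-batch in the steady state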


@@ -0,0 +1,2 @@
from .layers import *
from .resnet import VanillaResNet


@@ -0,0 +1,3 @@
from .basic_block import ResNetBasicBlock
from .bottleneck import ResNetBottleneck
from .reslayer import ResLayer


@@ -0,0 +1,64 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import Optional, Callable
import torch.nn as nn
from torch import Tensor
from colossalai.registry import LAYERS
from .conv import conv3x3
@LAYERS.register_module
class ResNetBasicBlock(nn.Module):
"""Basic ResNet block
"""
expansion: int = 1
def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None
) -> None:
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError(
'BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError(
"Dilation > 1 not supported in BasicBlock")
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
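
A usage sketch for the block above; the import path is hypothetical and stands in for wherever this commit places the layer package. A stride-2 block with a matching downsample halves the spatial size while doubling the channels:

import torch
import torch.nn as nn
from model.layers import ResNetBasicBlock  # hypothetical import path

downsample = nn.Sequential(
    nn.Conv2d(64, 128, kernel_size=1, stride=2, bias=False),  # same role as conv1x1 below
    nn.BatchNorm2d(128),
)
block = ResNetBasicBlock(inplanes=64, planes=128, stride=2, downsample=downsample)
x = torch.randn(1, 64, 56, 56)
print(block(x).shape)  # expected: torch.Size([1, 128, 28, 28])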


@@ -0,0 +1,69 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import Optional, Callable
import torch.nn as nn
from torch import Tensor
from colossalai.registry import LAYERS
from .conv import conv3x3, conv1x1
@LAYERS.register_module
class ResNetBottleneck(nn.Module):
    # Bottleneck in torchvision places the stride for downsampling at the 3x3 convolution (self.conv2),
    # while the original implementation places the stride at the first 1x1 convolution (self.conv1),
    # according to "Deep Residual Learning for Image Recognition" (https://arxiv.org/abs/1512.03385).
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
expansion: int = 4
def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None
) -> None:
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
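
The width computation above is what lets the same bottleneck cover both plain ResNet (groups=1, base_width=64, so width equals planes) and ResNeXt-style variants. A quick check of the arithmetic:

def bottleneck_width(planes, base_width=64, groups=1):
    # mirrors the width computation in ResNetBottleneck.__init__ above
    return int(planes * (base_width / 64.)) * groups

print(bottleneck_width(64))                           # 64  -> plain ResNet bottleneck, first stage
print(bottleneck_width(64, base_width=4, groups=32))  # 128 -> ResNeXt-50 32x4d style, first stage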


@@ -0,0 +1,15 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch.nn as nn
def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
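
With kernel_size=3 and stride=1, the output size is H + 2*padding - dilation*(3-1), so setting padding equal to dilation (as conv3x3 does above) keeps the spatial size unchanged for any dilation rate. A quick runnable check:

import torch
import torch.nn as nn

x = torch.randn(1, 16, 32, 32)
for dilation in (1, 2, 4):
    conv = nn.Conv2d(16, 16, kernel_size=3, stride=1,
                     padding=dilation, dilation=dilation, bias=False)
    print(dilation, conv(x).shape)  # torch.Size([1, 16, 32, 32]) every time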


@@ -0,0 +1,63 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch.nn as nn
from colossalai.registry import LAYERS
from .conv import conv1x1
@LAYERS.register_module
class ResLayer(nn.Module):
def __init__(self,
block_type: str,
norm_layer_type: str,
inplanes: int,
planes: int,
blocks: int,
groups: int,
base_width: int,
stride: int = 1,
dilation: int = 1,
dilate: bool = False,
):
super().__init__()
self.block = LAYERS.get_module(block_type)
self.norm_layer = LAYERS.get_module(norm_layer_type)
self.inplanes = inplanes
self.planes = planes
self.blocks = blocks
self.groups = groups
self.dilation = dilation
self.base_width = base_width
self.dilate = dilate
self.stride = stride
self.layer = self._make_layer()
def _make_layer(self):
norm_layer = self.norm_layer
downsample = None
previous_dilation = self.dilation
if self.dilate:
self.dilation *= self.stride
self.stride = 1
if self.stride != 1 or self.inplanes != self.planes * self.block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, self.planes * self.block.expansion, self.stride),
norm_layer(self.planes * self.block.expansion),
)
layers = []
layers.append(self.block(self.inplanes, self.planes, self.stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
self.inplanes = self.planes * self.block.expansion
for _ in range(1, self.blocks):
layers.append(self.block(self.inplanes, self.planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
def forward(self, x):
return self.layer(x)
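
A hedged usage sketch for ResLayer: the import path is hypothetical, and it assumes 'BatchNorm2d' resolves through the LAYERS registry, which the VanillaResNet config later in this commit relies on. Building a ResNet-18-style second stage (two basic blocks, stride 2):

import torch
from model.layers import ResLayer  # hypothetical import path

layer = ResLayer(block_type='ResNetBasicBlock',
                 norm_layer_type='BatchNorm2d',  # assumed to be resolvable via LAYERS
                 inplanes=64,
                 planes=128,
                 blocks=2,
                 groups=1,
                 base_width=64,
                 stride=2)
x = torch.randn(1, 64, 56, 56)
print(layer(x).shape)  # expected: torch.Size([1, 128, 28, 28])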


@@ -0,0 +1,163 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import List, Optional
import torch
import torch.nn as nn
from torch import Tensor
from colossalai.registry import LAYERS
from colossalai.registry import MODELS
from colossalai.nn.model import ModelFromConfig
@MODELS.register_module
class VanillaResNet(ModelFromConfig):
"""ResNet from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
"""
def __init__(
self,
num_cls: int,
block_type: str,
layers: List[int],
norm_layer_type: str = 'BatchNorm2d',
in_channels: int = 3,
groups: int = 1,
width_per_group: int = 64,
zero_init_residual: bool = False,
replace_stride_with_dilation: Optional[List[bool]] = None,
dilations=(1, 1, 1, 1)
) -> None:
super().__init__()
self.inplanes = 64
self.zero_init_residual = zero_init_residual
self.blocks = layers
self.block_expansion = LAYERS.get_module(block_type).expansion
self.dilations = dilations
self.reslayer_common_cfg = dict(
type='ResLayer',
block_type=block_type,
norm_layer_type=norm_layer_type,
groups=groups,
base_width=width_per_group
)
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError("replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
self.layers_cfg = [
# conv1
dict(type='Conv2d',
in_channels=in_channels,
out_channels=self.inplanes,
kernel_size=7,
stride=2,
padding=3,
bias=False),
# bn1
dict(
type=norm_layer_type,
num_features=self.inplanes
),
# relu
dict(
type='ReLU',
inplace=True
),
# maxpool
dict(
type='MaxPool2d',
kernel_size=3,
stride=2,
padding=1
),
# layer 1
dict(
inplanes=self.inplanes,
planes=64,
blocks=self.blocks[0],
dilation=self.dilations[0],
**self.reslayer_common_cfg
),
# layer 2
dict(
inplanes=64 * self.block_expansion,
planes=128,
blocks=self.blocks[1],
stride=2,
dilate=replace_stride_with_dilation[0],
dilation=self.dilations[1],
**self.reslayer_common_cfg
),
# layer 3
dict(
inplanes=128 * self.block_expansion,
planes=256,
blocks=layers[2],
stride=2,
dilate=replace_stride_with_dilation[1],
dilation=self.dilations[2],
**self.reslayer_common_cfg
),
# layer 4
dict(
inplanes=256 * self.block_expansion,
planes=512,
blocks=layers[3], stride=2,
dilate=replace_stride_with_dilation[2],
dilation=self.dilations[3],
**self.reslayer_common_cfg
),
# avg pool
dict(
type='AdaptiveAvgPool2d',
output_size=(1, 1)
),
# flatten
dict(
type='LambdaWrapper',
func=lambda mod, x: torch.flatten(x, 1)
),
# linear
dict(
type='Linear',
in_features=512 * self.block_expansion,
out_features=num_cls
)
]
def forward(self, x: Tensor):
for layer in self.layers:
x = layer(x)
return x,
def init_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if self.zero_init_residual:
for m in self.modules():
if isinstance(m, LAYERS.get_module('ResNetBottleneck')):
# type: ignore[arg-type]
nn.init.constant_(m.bn3.weight, 0)
elif isinstance(m, LAYERS.get_module('ResNetBasicBlock')):
# type: ignore[arg-type]
nn.init.constant_(m.bn2.weight, 0)
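
The zero_init_residual comment above can be checked in isolation: once the last BN weight of a block is zeroed, the residual branch contributes nothing and the block reduces to relu(identity). A minimal sketch, again with a hypothetical import path:

import torch
import torch.nn as nn
from model.layers import ResNetBasicBlock  # hypothetical import path

block = ResNetBasicBlock(inplanes=64, planes=64)
nn.init.constant_(block.bn2.weight, 0)  # what init_weights() does for basic blocks
block.eval()
x = torch.randn(2, 64, 8, 8)
assert torch.allclose(block(x), torch.relu(x))  # residual branch is all zeros at init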


@@ -0,0 +1,19 @@
import os
from pathlib import Path
BATCH_SIZE = 128
IMG_SIZE = 224
DIM = 768
NUM_CLASSES = 10
NUM_ATTN_HEADS = 12
# resnet 18
model = dict(type='VanillaResNet',
block_type='ResNetBasicBlock',
layers=[2, 2, 2, 2],
num_cls=10)
parallel = dict(
pipeline=dict(size=4),
tensor=dict(size=1, mode=None)
)


@@ -4,6 +4,7 @@
import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from colossalai.communication import (recv_backward, recv_forward,
recv_tensor_meta, send_backward,
@@ -12,13 +13,14 @@ from colossalai.communication import (recv_backward, recv_forward,
send_tensor_meta)
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import init_dist, parse_args
from colossalai.initialize import launch
from colossalai.logging import get_dist_logger
from colossalai.utils import get_current_device
from functools import partial
BATCH_SIZE = 32
SEQ_LENGTH = 128
HIDDEN_SIZE = 512
BATCH_SIZE = 16
SEQ_LENGTH = 64
HIDDEN_SIZE = 128
CONFIG = dict(
parallel=dict(
@@ -106,7 +108,7 @@ def check_op(size, rank, prev_rank, next_rank, up_group, down_group, logger):
rank, check_equal(tensor, out)))
def test_comm(size, rank, prev_rank, next_rank, up_group, down_group, logger):
def check_comm(size, rank, prev_rank, next_rank, up_group, down_group, logger):
dtype = torch.float32
device = get_current_device()
tensor_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
@@ -121,13 +123,15 @@ def test_comm(size, rank, prev_rank, next_rank, up_group, down_group, logger):
check_forward_backward(tensor, grad, rank, logger)
@pytest.mark.skip("This test should be invoked using the test.sh provided")
@pytest.mark.dist
def test_main():
args = parse_args()
world_size = args.world_size
init_dist(CONFIG)
def run_check(rank, world_size):
launch(
config=CONFIG,
rank=rank,
world_size=world_size,
host='localhost',
port=29932,
backend='nccl'
)
logger = get_dist_logger()
rank = gpc.get_global_rank()
prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
@@ -141,9 +145,18 @@ def test_main():
rank, prev_rank, up_ranks, next_rank, down_ranks))
    logger.info('Distributed environment is initialized.')
test_comm(world_size, rank, prev_rank, next_rank, up_group, down_group,
logger)
check_comm(world_size, rank, prev_rank, next_rank, up_group, down_group,
logger)
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_p2p():
world_size = 4
run_func = partial(run_check, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
test_main()
test_p2p()
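
The pattern introduced here (functools.partial plus mp.spawn, with each worker initializing its own process group) is what lets the test run under plain pytest in CI instead of requiring an external launcher. Below is a stripped-down sketch of the same pattern using only torch.distributed with the gloo backend so it runs on CPU; the function names and port are illustrative, not colossalai APIs:

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from functools import partial


def run_worker(rank, world_size):
    # each spawned process sets up its own process group, mirroring launch(...) above
    dist.init_process_group(backend='gloo',
                            init_method='tcp://localhost:29500',
                            rank=rank,
                            world_size=world_size)
    t = torch.ones(1) * rank
    dist.all_reduce(t)                        # default op is SUM
    assert t.item() == sum(range(world_size))
    dist.destroy_process_group()


def test_spawned_workers():
    world_size = 4
    mp.spawn(partial(run_worker, world_size=world_size), nprocs=world_size)


if __name__ == '__main__':
    test_spawned_workers()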


@@ -2,35 +2,46 @@ import os.path as osp
import pytest
import torch
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
from colossalai.builder import build_dataset, ModelInitializer
from colossalai.builder.pipeline import PipelineModelInitializer
from colossalai.core import global_context
from colossalai.initialize import init_dist
from colossalai.initialize import launch
from colossalai.logging import get_dist_logger
from functools import partial
import model
DIR_PATH = osp.dirname(osp.realpath(__file__))
CONFIG_PATH = osp.join(DIR_PATH, '../configs/pipeline_vanilla_resnet.py')
CONFIG_PATH = osp.join(DIR_PATH, 'resnet_config.py')
@pytest.mark.skip("This test should be invoked using the test.sh provided")
@pytest.mark.dist
def test_partition():
init_dist(CONFIG_PATH)
def run_partition(rank, world_size):
launch(config=CONFIG_PATH,
rank=rank,
world_size=world_size,
host='localhost',
port=29933,
backend='nccl'
)
logger = get_dist_logger()
logger.info('finished initialization')
# build model
model = ModelInitializer(global_context.config.model, 1, verbose=True).model_initialize()
model = PipelineModelInitializer(global_context.config.model, 1, verbose=True).initialize()
assert isinstance(model, torch.nn.Module)
logger.info('model is created')
dataset = build_dataset(global_context.config.train_data.dataset)
dataloader = DataLoader(dataset=dataset, **global_context.config.train_data.dataloader)
logger.info('train data is created')
global_context.destroy()
torch.cuda.synchronize()
logger.info('training finished')
torch.cuda.empty_cache()
@pytest.mark.dist
def test_partition():
world_size = 4
run_func = partial(run_partition, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':


@@ -0,0 +1,94 @@
# referenced from Megatron and used to test communication
import colossalai
import os
import os.path as osp
import pytest
import torch
import torch.multiprocessing as mp
import model
from colossalai.builder import PipelineModelInitializer
from colossalai.communication import p2p as p2p_communication
from colossalai.communication.utils import send_tensor_meta, recv_tensor_meta
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
from colossalai.utils import print_rank_0, get_current_device, get_dataloader
from colossalai.engine.schedule import PipelineSchedule
from torchvision.datasets import CIFAR10
from torchvision import transforms
from pathlib import Path
from functools import partial
BATCH_SIZE = 32
NUM_MICRO = 8
DIR_PATH = osp.dirname(osp.realpath(__file__))
CONFIG_PATH = osp.join(DIR_PATH, './resnet_config.py')
def run_schedule(rank, world_size):
launch(config=CONFIG_PATH,
rank=rank,
world_size=world_size,
host='localhost',
port=29934,
backend='nccl')
# build model
model = PipelineModelInitializer(gpc.config.model, 1).initialize()
print_rank_0('model is created')
train_dataset = CIFAR10(
root=Path(os.environ['DATA']),
download=True,
transform=transforms.Compose(
[
transforms.RandomCrop(size=32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
0.2023, 0.1994, 0.2010]),
]
)
)
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
add_sampler=True,
batch_size=BATCH_SIZE,
pin_memory=True,
)
# build criterion
criterion = torch.nn.CrossEntropyLoss()
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
# initialize
engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion, train_dataloader)
# build pipeline schedule
schedule = PipelineSchedule(num_microbatches=NUM_MICRO)
# run schedule
data_iter = iter(train_dataloader)
schedule.forward_backward_step(engine, data_iter)
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_pipeline_schedule():
world_size = 4
run_func = partial(run_schedule, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
test_pipeline_schedule()


@@ -1,51 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os.path as osp
import pytest
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import initialize
from colossalai.logging import get_dist_logger
NUM_BATCH = 128
BATCH_SIZE = 32
SEQ_LENGTH = 128
HIDDEN_SIZE = 512
DIR_PATH = osp.dirname(osp.realpath(__file__))
CONFIG_PATH = osp.join(DIR_PATH, '../configs/pipeline_vanilla_resnet.py')
@pytest.mark.skip("This test should be invoked using the test.sh provided")
@pytest.mark.dist
def test_schedule():
engine, train_dataloader, test_dataloader = initialize(CONFIG_PATH)
logger = get_dist_logger()
model = engine.model
optimizer = engine.optimizer
criterion = engine.criterion
schedule = engine._schedule
output, label, loss = schedule.forward_backward_step(
data_iter=iter(train_dataloader),
model=model,
optimizer=optimizer,
criterion=criterion,
forward_only=False
)
schedule.optimizer_step(model, optimizer)
if gpc.is_last_rank(ParallelMode.PIPELINE):
logger.info('losses: {}'.format(loss))
gpc.destroy()
logger.info('training finished')
if __name__ == '__main__':
test_schedule()