Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-01 17:17:05 +00:00

Hotfix/Colossalai layers (#92)

* optimized 1d layer apis; reorganized nn.layer modules; fixed tests
* fixed 2.5d runtime issue
* reworked split batch, now called in trainer.schedule.load_batch

Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com>
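Every hunk below makes the same two mechanical changes: the spawned worker gains a `port` parameter, and each `test_*` entry point binds `world_size` and a freshly picked port with `functools.partial` before handing the worker to `mp.spawn`. The sketch below illustrates that pattern in isolation; `_pick_free_port` is a stand-in assumption (it simply asks the OS for an unused TCP port) rather than the actual `colossalai.utils.free_port`, and the worker only prints instead of calling `colossalai.launch`.

import socket
from functools import partial

import torch.multiprocessing as mp


def _pick_free_port() -> int:
    # Stand-in for colossalai.utils.free_port (assumption): bind to port 0 so
    # the OS hands back an unused TCP port, then release it for the test run.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('localhost', 0))
        return s.getsockname()[1]


def check_layer(rank, world_size, port):
    # The real tests call colossalai.launch(..., port=port, backend='nccl') here;
    # this worker only shows how the bound arguments arrive in each process.
    print(f'rank {rank}/{world_size} would rendezvous on localhost:{port}')


def test_comm_style():
    world_size = 4
    # free_port() is evaluated once in the parent process, so every spawned
    # rank receives the same rendezvous port; mp.spawn supplies the rank as
    # the first positional argument.
    run_func = partial(check_layer, world_size=world_size, port=_pick_free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_comm_style()

Replacing the hard-coded ports (30010, 29900, 29910, ...) with a per-run `free_port()` presumably avoids collisions when several test files run back-to-back or concurrently on the same machine.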
@@ -1,4 +1,3 @@
-import time
 from functools import partial
 
 import pytest
@@ -9,7 +8,7 @@ from colossalai.communication import all_gather, all_reduce, reduce_scatter
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
-from colossalai.utils import get_current_device
+from colossalai.utils import free_port, get_current_device
 
 CONFIG = dict(parallel=dict(data=8, pipeline=1, tensor=dict(mode=None, size=1)))
 
@@ -49,8 +48,8 @@ def check_all_reduce():
     torch.cuda.synchronize()
 
 
-def check_layer(rank, world_size):
-    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=30010, backend='nccl')
+def check_layer(rank, world_size, port):
+    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     assert dist.get_rank() == gpc.get_global_rank()
     print('Rank {} / {}'.format(dist.get_rank(), dist.get_world_size()))
@@ -66,7 +65,7 @@ def check_layer(rank, world_size):
 @pytest.mark.dist
 def test_comm():
     world_size = 4
-    run_func = partial(check_layer, world_size=world_size)
+    run_func = partial(check_layer, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
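For context, the `launch(..., host='localhost', port=..., backend='nccl')` calls patched above boil down to a TCP rendezvous that every rank must agree on. The function below is a rough, hedged approximation at the plain `torch.distributed` level (using the `gloo` backend so it runs without GPUs); it is not the `colossalai.launch` implementation.

import torch.distributed as dist


def minimal_launch(rank: int, world_size: int, host: str, port: int, backend: str = 'gloo'):
    # Every rank dials the same host:port; a hard-coded port still occupied by
    # a previous test run, or shared by two test files, breaks this step --
    # which is what passing a fresh free_port() per test run avoids.
    dist.init_process_group(backend=backend,
                            init_method=f'tcp://{host}:{port}',
                            rank=rank,
                            world_size=world_size)
    assert dist.get_rank() == rank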
@@ -1,15 +1,16 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
+from functools import partial
+from pathlib import Path
+
 import pytest
 import torch
 import torch.multiprocessing as mp
 
 from colossalai import launch
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from functools import partial
-from pathlib import Path
+from colossalai.utils import free_port
 
 CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_2d_init.py').absolute()
 
@@ -87,7 +88,7 @@ def test_2d_init():
     test_fn = partial(init_2d,
                       world_size=world_size,
                       backend='gloo',
-                      port='29900',
+                      port=free_port(),
                       host='localhost'
                       )
     mp.spawn(test_fn, nprocs=world_size)
@@ -7,10 +7,10 @@ from pathlib import Path
 import pytest
 import torch
 import torch.multiprocessing as mp
 
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
+from colossalai.utils import free_port
 
 CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_2p5d_init.py').absolute()
 
@@ -111,7 +111,7 @@ def test_2halfd_init():
     test_fn = partial(init_2halfd,
                       world_size=world_size,
                       backend='gloo',
-                      port='29901',
+                      port=free_port(),
                       host='localhost'
                       )
     mp.spawn(test_fn, nprocs=world_size)
@@ -7,11 +7,10 @@ from pathlib import Path
 import pytest
 import torch
 import torch.multiprocessing as mp
 
-
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
+from colossalai.utils import free_port
 
 CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_3d_init.py').absolute()
 
@@ -104,7 +103,7 @@ def test_3d_init():
     test_fn = partial(init_3d,
                       world_size=world_size,
                       backend='gloo',
-                      port='29902',
+                      port=free_port(),
                       host='localhost'
                       )
     mp.spawn(test_fn, nprocs=world_size)
@@ -13,7 +13,7 @@ from colossalai.logging import get_dist_logger
 from colossalai.nn import Accuracy, LinearWarmupLR
 from colossalai.nn.loss import CrossEntropyLoss
 from colossalai.trainer import Trainer, hooks
-from colossalai.utils import MultiTimer, get_dataloader
+from colossalai.utils import MultiTimer, free_port, get_dataloader
 from colossalai.utils.gradient_accumulation import GradAccumLrSchedulerByStep
 from model_zoo.vit import vit_tiny_patch4_32
 from torchvision import transforms
@@ -27,12 +27,12 @@ CONFIG = dict(parallel=dict(pipeline=2, tensor=dict(size=2, mode='1d')),
              gradient_accumulation=2)
 
 
-def run_trainer(rank, world_size):
-    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=30000, backend='nccl')
+def run_trainer(rank, world_size, port):
+    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     logger = get_dist_logger()
 
-    model = vit_tiny_patch4_32(tensor_parallel='1d')
+    model = vit_tiny_patch4_32()
     pipe_model = build_pipeline_model(model.layers, num_chunks=1)
 
     # build dataloaders
@@ -54,7 +54,7 @@ def run_trainer(rank, world_size):
     test_dataloader = get_dataloader(dataset=test_dataset, batch_size=BATCH_SIZE, pin_memory=True)
 
     # build criterion
-    criterion = CrossEntropyLoss(tensor_parallel='1d')
+    criterion = CrossEntropyLoss()
 
     # optimizer
     optimizer = torch.optim.Adam(pipe_model.parameters(), lr=0.001, weight_decay=0)
@@ -78,7 +78,6 @@ def run_trainer(rank, world_size):
     hook_list = [
         hooks.LossHook(),
         hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False),
-        hooks.AccuracyHook(accuracy_func=Accuracy(tensor_parallel='1d')),
         hooks.LogMetricByEpochHook(logger),
     ]
 
@@ -95,7 +94,7 @@ def run_trainer(rank, world_size):
 # @pytest.mark.skip("This test requires more than 8 GPUs, you should invoke this test script using test.sh provided manually")
 def test_hybrid_parallel():
     world_size = 8
-    run_func = partial(run_trainer, world_size=world_size)
+    run_func = partial(run_trainer, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
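The other change in this file is the simplified layer API from the commit message: the model, loss, and accuracy helpers no longer take a `tensor_parallel=...` keyword, while the unchanged `CONFIG` above still declares `tensor=dict(size=2, mode='1d')`. The sketch below restates that before/after; the assumption (not spelled out in the diff itself) is that the tensor-parallel mode is now resolved from the initialized global context rather than passed per call, so these lines only make sense after `colossalai.launch(config=CONFIG, ...)` has run.

from colossalai.nn.loss import CrossEntropyLoss
from model_zoo.vit import vit_tiny_patch4_32

# Parallel layout stays in the launch config (unchanged in this hunk).
CONFIG = dict(parallel=dict(pipeline=2, tensor=dict(size=2, mode='1d')),
              gradient_accumulation=2)

# After this commit the builders drop the per-call argument; the '1d' mode
# above is presumably what the library consults once launch() has run.
model = vit_tiny_patch4_32()        # was: vit_tiny_patch4_32(tensor_parallel='1d')
criterion = CrossEntropyLoss()      # was: CrossEntropyLoss(tensor_parallel='1d')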
@@ -1,25 +1,23 @@
 # !/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
-import colossalai
 import os
+from functools import partial
+from pathlib import Path
+
+import colossalai
 import pytest
 import torch
-import os.path as osp
-from pathlib import Path
-import torch.nn as nn
 import torch.multiprocessing as mp
-
-from torchvision import transforms
-from torch.optim import Adam
-from colossalai.core import global_context as gpc
+import torch.nn as nn
 from colossalai.amp import AMP_TYPE
+from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
-from colossalai.utils import report_memory_usage, get_dataloader
-from torchvision.models import resnet18
+from colossalai.utils import free_port, get_dataloader, report_memory_usage
+from torch.optim import Adam
+from torchvision import transforms
 from torchvision.datasets import CIFAR10
-from functools import partial
-
+from torchvision.models import resnet18
 
 # Config
 BATCH_SIZE = 128
@@ -38,14 +36,14 @@ CONFIG = dict(
 )
 
 
-def run_engine(rank, world_size):
+def run_engine(rank, world_size, port):
     # init dist env
     colossalai.launch(
         config=CONFIG,
         rank=rank,
         world_size=world_size,
         host='localhost',
-        port=29910,
+        port=port,
         backend='nccl'
     )
 
@@ -104,7 +102,7 @@ def run_engine(rank, world_size):
 @pytest.mark.dist
 def test_engine():
     world_size = 4
-    run_func = partial(run_engine, world_size=world_size)
+    run_func = partial(run_engine, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -1,23 +1,20 @@
-import colossalai
 import os
+from functools import partial
+from pathlib import Path
+
+import colossalai
 import pytest
 import torch
-import os.path as osp
-from pathlib import Path
-import torch.nn as nn
 import torch.multiprocessing as mp
-
-from torchvision import transforms
-from torch.optim import Adam
-from colossalai.core import global_context as gpc
+import torch.nn as nn
 from colossalai.amp import AMP_TYPE
+from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
-from colossalai.utils import report_memory_usage, get_dataloader
-from colossalai.initialize import get_default_parser
-from torchvision.models import resnet18
+from colossalai.utils import free_port, get_dataloader, report_memory_usage
+from torch.optim import Adam
+from torchvision import transforms
 from torchvision.datasets import CIFAR10
-from functools import partial
-
+from torchvision.models import resnet18
 
 # Config
 BATCH_SIZE = 128
@@ -38,14 +35,14 @@ CONFIG = dict(
 )
 
 
-def run_engine(rank, world_size):
+def run_engine(rank, world_size, port):
     # init dist env
     colossalai.launch(
         config=CONFIG,
         rank=rank,
         world_size=world_size,
         host='localhost',
-        port=29911,
+        port=port,
         backend='nccl'
     )
 
@@ -104,7 +101,7 @@ def run_engine(rank, world_size):
 @pytest.mark.dist
 def test_engine():
     world_size = 4
-    run_func = partial(run_engine, world_size=world_size)
+    run_func = partial(run_engine, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -1,23 +1,19 @@
-import colossalai
 import os
+from functools import partial
+from pathlib import Path
+
+import colossalai
 import pytest
 import torch
-import os.path as osp
-from pathlib import Path
-import torch.nn as nn
 import torch.multiprocessing as mp
-
-from torchvision import transforms
-from torch.optim import Adam
+import torch.nn as nn
-from colossalai.core import global_context as gpc
 from colossalai.amp import AMP_TYPE
 from colossalai.logging import get_dist_logger
-from colossalai.utils import report_memory_usage, get_dataloader
-from colossalai.initialize import get_default_parser
-from torchvision.models import resnet18
+from colossalai.utils import free_port, get_dataloader, report_memory_usage
+from torch.optim import Adam
+from torchvision import transforms
 from torchvision.datasets import CIFAR10
-from functools import partial
-
+from torchvision.models import resnet18
 
 # Config
 BATCH_SIZE = 128
@@ -35,14 +31,14 @@ CONFIG = dict(
 )
 
 
-def run_engine(rank, world_size):
+def run_engine(rank, world_size, port):
     # init dist env
     colossalai.launch(
         config=CONFIG,
         rank=rank,
         world_size=world_size,
         host='localhost',
-        port=29912,
+        port=port,
         backend='nccl'
    )
 
@@ -101,7 +97,7 @@ def run_engine(rank, world_size):
 @pytest.mark.dist
 def test_engine():
     world_size = 4
-    run_func = partial(run_engine, world_size=world_size)
+    run_func = partial(run_engine, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -1,23 +1,20 @@
-import colossalai
 import os
+from functools import partial
+from pathlib import Path
+
+import colossalai
 import pytest
 import torch
-import os.path as osp
-from pathlib import Path
-import torch.nn as nn
 import torch.multiprocessing as mp
-
-from torchvision import transforms
-from torch.optim import Adam
-from colossalai.core import global_context as gpc
+import torch.nn as nn
 from colossalai.amp import AMP_TYPE
+from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
-from colossalai.utils import report_memory_usage, get_dataloader
-from colossalai.initialize import get_default_parser
-from torchvision.models import resnet18
+from colossalai.utils import free_port, get_dataloader, report_memory_usage
+from torch.optim import Adam
+from torchvision import transforms
 from torchvision.datasets import CIFAR10
-from functools import partial
-
+from torchvision.models import resnet18
 
 # Config
 BATCH_SIZE = 128
@@ -36,14 +33,14 @@ CONFIG = dict(
 )
 
 
-def run_engine(rank, world_size):
+def run_engine(rank, world_size, port):
     # init dist env
     colossalai.launch(
         config=CONFIG,
         rank=rank,
         world_size=world_size,
         host='localhost',
-        port=29913,
+        port=port,
         backend='nccl'
     )
 
@@ -102,7 +99,7 @@ def run_engine(rank, world_size):
 @pytest.mark.dist
 def test_engine():
     world_size = 4
-    run_func = partial(run_engine, world_size=world_size)
+    run_func = partial(run_engine, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -1,13 +1,15 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
+from functools import partial
+
 import pytest
 import torch
 import torch.multiprocessing as mp
 
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
-from functools import partial
+from colossalai.utils import free_port
 
 from checks_1d.check_layer_1d import *
 
 CONFIG = dict(
@@ -21,12 +23,12 @@ CONFIG = dict(
 )
 
 
-def check_layer(rank, world_size):
+def check_layer(rank, world_size, port):
     launch(config=CONFIG,
            rank=rank,
            world_size=world_size,
            host='localhost',
-           port=29920,
+           port=port,
            backend='nccl')
 
     check_linear_col()
@@ -39,7 +41,7 @@ def check_layer(rank, world_size):
 @pytest.mark.dist
 def test_1d():
     world_size = 4
-    run_func = partial(check_layer, world_size=world_size)
+    run_func = partial(check_layer, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -1,16 +1,17 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
+from functools import partial
+
 import pytest
 import torch
 import torch.multiprocessing as mp
 
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
+from colossalai.utils import free_port
 
 from checks_2d.check_layer_2d import *
 from checks_2d.check_operation_2d import *
-from functools import partial
 
 
 CONFIG = dict(
     parallel=dict(
@@ -34,12 +35,12 @@ def check_layer():
     check_layernorm()
     check_classifier()
 
-def check_layer_and_operation(rank, world_size):
+def check_layer_and_operation(rank, world_size, port):
     launch(config=CONFIG,
            rank=rank,
           world_size=world_size,
            host='localhost',
-           port=29921,
+           port=port,
            backend='nccl')
 
     # check_operations()
@@ -51,7 +52,7 @@ def check_layer_and_operation(rank, world_size):
 @pytest.mark.dist
 def test_2d():
     world_size = 4
-    run_func = partial(check_layer_and_operation, world_size=world_size)
+    run_func = partial(check_layer_and_operation, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -1,13 +1,15 @@
+from functools import partial
+
 import pytest
 import torch
 import torch.multiprocessing as mp
 
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
-from checks_2p5d.check_layer_2p5d import check_linear, check_layernorm, check_classifier
-from checks_2p5d.check_operation_2p5d import check_AB, check_ABT, check_ATB
-from functools import partial
+from colossalai.utils import free_port
 
+from checks_2p5d.check_layer_2p5d import (check_classifier, check_layernorm,
+                                          check_linear)
+from checks_2p5d.check_operation_2p5d import check_AB, check_ABT, check_ATB
 
 CONFIG = dict(
     parallel=dict(
@@ -29,12 +31,12 @@ def check_layer():
     check_classifier()
 
 
-def check_layer_and_operation(rank, world_size):
+def check_layer_and_operation(rank, world_size, port):
     launch(config=CONFIG,
            rank=rank,
            world_size=world_size,
            host='localhost',
-           port=29922,
+           port=port,
            backend='nccl')
 
     check_operations()
@@ -46,7 +48,7 @@ def check_layer_and_operation(rank, world_size):
 @pytest.mark.dist
 def test_2p5d():
     world_size = 4
-    run_func = partial(check_layer_and_operation, world_size=world_size)
+    run_func = partial(check_layer_and_operation, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -7,6 +7,7 @@ import torch
 import torch.multiprocessing as mp
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
+from colossalai.utils import free_port
 
 from checks_3d.check_layer_3d import *
 
@@ -27,8 +28,8 @@ def check_layer():
     # check_loss()
 
 
-def check_layer_and_operation(rank, world_size):
-    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=29923, backend='nccl')
+def check_layer_and_operation(rank, world_size, port):
+    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
     check_layer()
     gpc.destroy()
     torch.cuda.empty_cache()
@@ -37,7 +38,7 @@ def check_layer_and_operation(rank, world_size):
 @pytest.mark.dist
 def test_3d():
     world_size = 8
-    run_func = partial(check_layer_and_operation, world_size=world_size)
+    run_func = partial(check_layer_and_operation, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -4,10 +4,11 @@
 import pytest
 import torch
 import torch.multiprocessing as mp
-from colossalai.initialize import launch, get_default_parser
+from colossalai.initialize import launch
 from colossalai.logging import get_dist_logger
 from checks_seq.check_layer_seq import *
 from functools import partial
+from colossalai.utils import free_port
 
 
 CONFIG = dict(
@@ -22,13 +23,13 @@ def check_layer():
     check_selfattention()
 
 
-def run_check_sequence(rank, world_size):
+def run_check_sequence(rank, world_size, port):
     # init dist
     launch(config=CONFIG,
            rank=rank,
            world_size=world_size,
            host='localhost',
-           port=29924,
+           port=port,
            backend='nccl')
     logger = get_dist_logger()
     logger.info('Distributed environment is initialzied.', ranks=[0])
@@ -41,7 +42,7 @@ def run_check_sequence(rank, world_size):
 @pytest.mark.dist
 def test_sequence():
     world_size = 4
-    run_func = partial(run_check_sequence, world_size=world_size)
+    run_func = partial(run_check_sequence, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -1,4 +1,5 @@
 import os
 import model
 from pathlib import Path
 
 BATCH_SIZE = 128
@@ -1,11 +1,12 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
+from functools import partial
+
 import pytest
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 
 from colossalai.communication import (recv_backward, recv_forward,
                                       recv_tensor_meta, send_backward,
                                       send_backward_recv_forward, send_forward,
@@ -15,8 +16,7 @@ from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
 from colossalai.logging import get_dist_logger
-from colossalai.utils import get_current_device
-from functools import partial
+from colossalai.utils import free_port, get_current_device
 
 BATCH_SIZE = 16
 SEQ_LENGTH = 64
@@ -123,13 +123,13 @@ def check_comm(size, rank, prev_rank, next_rank, up_group, down_group, logger):
     check_forward_backward(tensor, grad, rank, logger)
 
 
-def run_check(rank, world_size):
+def run_check(rank, world_size, port):
     launch(
         config=CONFIG,
         rank=rank,
         world_size=world_size,
         host='localhost',
-        port=29932,
+        port=port,
         backend='nccl'
     )
     logger = get_dist_logger()
@@ -154,7 +154,7 @@ def run_check(rank, world_size):
 @pytest.mark.dist
 def test_p2p():
     world_size = 4
-    run_func = partial(run_check, world_size=world_size)
+    run_func = partial(run_check, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -3,25 +3,24 @@ import os.path as osp
 import pytest
 import torch
 import torch.multiprocessing as mp
 from torch.utils.data import DataLoader
 
 from colossalai.builder.pipeline import build_pipeline_model_from_cfg
 from colossalai.core import global_context
 from colossalai.initialize import launch
 from colossalai.logging import get_dist_logger
-from functools import partial
-import model
+from colossalai.utils import free_port
 
 DIR_PATH = osp.dirname(osp.realpath(__file__))
 CONFIG_PATH = osp.join(DIR_PATH, 'resnet_config.py')
 
 
-def run_partition(rank, world_size):
+def run_partition(rank, world_size, port):
     launch(config=CONFIG_PATH,
            rank=rank,
            world_size=world_size,
            host='localhost',
-           port=29933,
+           port=port,
            backend='nccl'
            )
     logger = get_dist_logger()
@@ -40,7 +39,7 @@ def run_partition(rank, world_size):
 @pytest.mark.dist
 def test_partition():
     world_size = 4
-    run_func = partial(run_partition, world_size=world_size)
+    run_func = partial(run_partition, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -1,26 +1,23 @@
 # referenced from Megatron and used to testify communication
 
-import colossalai
 import os
 import os.path as osp
+from functools import partial
+from pathlib import Path
 
+import colossalai
 import pytest
 import torch
 import torch.multiprocessing as mp
-import model
 
 from colossalai.builder import build_pipeline_model_from_cfg
 from colossalai.communication import p2p as p2p_communication
 from colossalai.communication.utils import send_tensor_meta, recv_tensor_meta
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.initialize import launch
-from colossalai.utils import print_rank_0, get_current_device, get_dataloader
 from colossalai.engine.schedule import PipelineSchedule
-from torchvision.datasets import CIFAR10
+from colossalai.initialize import launch
+from colossalai.utils import free_port, get_dataloader, print_rank_0
 from torchvision import transforms
-from pathlib import Path
-from functools import partial
+from torchvision.datasets import CIFAR10
 
+import model
 
 BATCH_SIZE = 32
 NUM_MICRO = 8
@@ -30,12 +27,12 @@ DIR_PATH = osp.dirname(osp.realpath(__file__))
 CONFIG_PATH = osp.join(DIR_PATH, './resnet_config.py')
 
 
-def run_schedule(rank, world_size):
+def run_schedule(rank, world_size, port):
     launch(config=CONFIG_PATH,
            rank=rank,
            world_size=world_size,
            host='localhost',
-           port=29934,
+           port=port,
            backend='nccl')
 
     # build model
@@ -86,7 +83,7 @@ def run_schedule(rank, world_size):
 @pytest.mark.dist
 def test_pipeline_schedule():
     world_size = 4
-    run_func = partial(run_schedule, world_size=world_size)
+    run_func = partial(run_schedule, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -11,7 +11,7 @@ from colossalai.amp.amp_type import AMP_TYPE
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.trainer import Trainer
-from colossalai.utils import MultiTimer, get_dataloader
+from colossalai.utils import MultiTimer, free_port, get_dataloader
 from torch.optim import Adam
 from torchvision import transforms
 from torchvision.datasets import CIFAR10
@@ -26,8 +26,8 @@ CONFIG = dict(
     fp16=dict(mode=AMP_TYPE.TORCH))
 
 
-def run_trainer_no_pipeline(rank, world_size):
-    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=29930, backend='nccl')
+def run_trainer_no_pipeline(rank, world_size, port):
+    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # build model
     model = resnet18(num_classes=10)
@@ -88,7 +88,7 @@ def run_trainer_no_pipeline(rank, world_size):
 @pytest.mark.dist
 def test_trainer_no_pipeline():
     world_size = 4
-    run_func = partial(run_trainer_no_pipeline, world_size=world_size)
+    run_func = partial(run_trainer_no_pipeline, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -12,7 +12,7 @@ from colossalai.core import global_context as gpc
 from colossalai.engine.schedule import PipelineSchedule
 from colossalai.logging import get_dist_logger
 from colossalai.trainer import Trainer
-from colossalai.utils import MultiTimer, get_dataloader
+from colossalai.utils import MultiTimer, free_port, get_dataloader
 from torch.optim import Adam
 from torchvision import transforms
 from torchvision.datasets import CIFAR10
@@ -25,8 +25,8 @@ NUM_EPOCHS = 200
 CONFIG = dict(parallel=dict(pipeline=2, ), )
 
 
-def run_trainer_with_pipeline(rank, world_size):
-    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=29931, backend='nccl')
+def run_trainer_with_pipeline(rank, world_size, port):
+    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # build model
     model = resnet18(num_classes=10)
@@ -99,7 +99,7 @@ def run_trainer_with_pipeline(rank, world_size):
 @pytest.mark.dist
 def test_trainer_with_pipeline():
     world_size = 4
-    run_func = partial(run_trainer_with_pipeline, world_size=world_size)
+    run_func = partial(run_trainer_with_pipeline, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -1,21 +1,19 @@
-import colossalai
 import os
+from functools import partial
+from pathlib import Path
+
+import colossalai
 import pytest
 import torch
 import torch.multiprocessing as mp
 import torch.nn as nn
-
-from functools import partial
-from pathlib import Path
-from torchvision import transforms
-from torch.optim import Adam
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
-from colossalai.utils import report_memory_usage, get_dataloader
-from colossalai.initialize import get_default_parser
-from torchvision.models import resnet18
+from colossalai.utils import free_port, get_dataloader
+from torch.optim import Adam
+from torchvision import transforms
 from torchvision.datasets import CIFAR10
-
+from torchvision.models import resnet18
 
 # Config
 BATCH_SIZE = 16
@@ -32,7 +30,7 @@ CONFIG = dict(
 )
 
 
-def run_no_pipeline(rank, world_size):
+def run_no_pipeline(rank, world_size, port):
 
     # init dist env
     colossalai.launch(
@@ -40,7 +38,7 @@ def run_no_pipeline(rank, world_size):
         rank=rank,
         world_size=world_size,
         host='localhost',
-        port=29500,
+        port=port,
        backend='nccl'
    )
 
@@ -110,7 +108,7 @@ def run_no_pipeline(rank, world_size):
 @pytest.mark.dist
 def test_engine():
     world_size = 4
-    func = partial(run_no_pipeline, world_size=world_size)
+    func = partial(run_no_pipeline, world_size=world_size, port=free_port())
     mp.spawn(func, nprocs=world_size)
@@ -2,18 +2,18 @@
 # -*- encoding: utf-8 -*-
 
 import os
-import pytest
-import torch
-import torch.multiprocessing as mp
+from functools import partial
 from pathlib import Path
 
 import colossalai
+import pytest
+import torch
+import torch.multiprocessing as mp
 from colossalai.core import global_context as gpc
-from colossalai.utils import get_dataloader
+from colossalai.utils import free_port, get_dataloader
 from torchvision import transforms
-from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
-from functools import partial
+from torchvision.models import resnet18
 
 BATCH_SIZE = 16
 IMG_SIZE = 224
@@ -34,12 +34,12 @@ CONFIG = dict(
 )
 
 
-def run_dist(rank, world_size):
+def run_dist(rank, world_size, port):
     colossalai.launch(config=CONFIG,
                       rank=rank,
                       world_size=world_size,
                       host='localhost',
-                      port=29940,
+                      port=port,
                       backend='nccl')
 
     # build model
@@ -94,7 +94,7 @@ def run_dist(rank, world_size):
 @pytest.mark.dist
 def test_zero_level_2():
     world_size = 4
-    run_func = partial(run_dist, world_size=world_size)
+    run_func = partial(run_dist, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -2,18 +2,18 @@
 # -*- encoding: utf-8 -*-
 
 import os
-import pytest
-import torch
-import torch.multiprocessing as mp
+from functools import partial
 from pathlib import Path
 
 import colossalai
+import pytest
+import torch
+import torch.multiprocessing as mp
 from colossalai.core import global_context as gpc
-from colossalai.utils import get_dataloader
+from colossalai.utils import free_port, get_dataloader
 from torchvision import transforms
-from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
-from functools import partial
+from torchvision.models import resnet18
 
 BATCH_SIZE = 16
 IMG_SIZE = 224
@@ -46,12 +46,12 @@ CONFIG = dict(
 )
 
 
-def run_dist(rank, world_size):
+def run_dist(rank, world_size, port):
     colossalai.launch(config=CONFIG,
                       rank=rank,
                       world_size=world_size,
                       host='localhost',
-                      port=29941,
+                      port=port,
                       backend='nccl')
 
     # build model
@@ -106,7 +106,7 @@ def run_dist(rank, world_size):
 @pytest.mark.dist
 def test_zero_level_3():
     world_size = 4
-    run_func = partial(run_dist, world_size=world_size)
+    run_func = partial(run_dist, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -13,7 +13,7 @@ import torch.multiprocessing as mp
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.nn import CrossEntropyLoss
-from colossalai.utils import get_dataloader
+from colossalai.utils import free_port, get_dataloader
 from model_zoo.vit import vit_lite_depth7_patch4_32
 from torchvision import transforms
 from torchvision.datasets import CIFAR10
@@ -40,11 +40,11 @@ def train_epoch(engine, train_dataloader):
     return avg_loss
 
 
-def run_2d_parallel_vision_transformer_level_2(rank, world_size):
-    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=29950, backend='nccl')
+def run_2d_parallel_vision_transformer_level_2(rank, world_size, port):
+    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # build model
-    model = vit_lite_depth7_patch4_32(tensor_parallel='2d')
+    model = vit_lite_depth7_patch4_32()
 
     # build dataloader# build dataloaders
     train_dataset = CIFAR10(root=Path(os.environ['DATA']),
@@ -62,7 +62,7 @@ def run_2d_parallel_vision_transformer_level_2(rank, world_size):
 
     # build optimizer and loss
     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
-    criterion = CrossEntropyLoss(tensor_parallel='2d')
+    criterion = CrossEntropyLoss()
 
     engine, train_dataloader, *args = colossalai.initialize(model=model,
                                                             optimizer=optimizer,
@@ -90,7 +90,7 @@ def run_2d_parallel_vision_transformer_level_2(rank, world_size):
 @pytest.mark.dist
 def test_2d_vit_zero_level_2():
     world_size = 8
-    run_func = partial(run_2d_parallel_vision_transformer_level_2, world_size=world_size)
+    run_func = partial(run_2d_parallel_vision_transformer_level_2, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)
@@ -13,7 +13,7 @@ import torch.multiprocessing as mp
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.nn import CrossEntropyLoss
-from colossalai.utils import get_dataloader
+from colossalai.utils import free_port, get_dataloader
 from model_zoo.vit import vit_lite_depth7_patch4_32
 from torchvision import transforms
 from torchvision.datasets import CIFAR10
@@ -40,11 +40,11 @@ def train_epoch(engine, train_dataloader):
     return avg_loss
 
 
-def run_2d_parallel_vision_transformer_level_3(rank, world_size):
-    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=29951, backend='nccl')
+def run_2d_parallel_vision_transformer_level_3(rank, world_size, port):
+    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # build model
-    model = vit_lite_depth7_patch4_32(tensor_parallel='2d')
+    model = vit_lite_depth7_patch4_32()
 
     # build dataloader# build dataloaders
     train_dataset = CIFAR10(root=Path(os.environ['DATA']),
@@ -62,7 +62,7 @@ def run_2d_parallel_vision_transformer_level_3(rank, world_size):
 
     # build optimizer and loss
     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
-    criterion = CrossEntropyLoss(tensor_parallel='2d')
+    criterion = CrossEntropyLoss()
 
     engine, train_dataloader, *args = colossalai.initialize(model=model,
                                                             optimizer=optimizer,
@@ -91,7 +91,7 @@ def run_2d_parallel_vision_transformer_level_3(rank, world_size):
 @pytest.mark.skip("Level 3 has unknown bug so skip this test for now")
 def test_3d_vit_zero_level_3():
     world_size = 8
-    run_func = partial(run_2d_parallel_vision_transformer_level_3, world_size=world_size)
+    run_func = partial(run_2d_parallel_vision_transformer_level_3, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)