[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent of gpc

* [legacy] make optim independent of registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
Author: Hongxin Liu
Date: 2023-09-11 16:24:28 +08:00 (committed by GitHub)
Parent: 536397cc95
Commit: 554aa9592e
170 changed files with 781 additions and 758 deletions
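For downstream code, the visible effect of this refactor is an import-path change: the tensor-parallel layers and the communication helpers touched in the diffs below now live under the colossalai.legacy namespace. A minimal before/after sketch (the module paths are taken from the diffs in this commit; which symbols you import is illustrative):

    # before this commit
    # from colossalai.nn import CheckpointModule, Linear1D_Col
    # from colossalai.communication import all_gather, all_reduce, reduce_scatter

    # after this commit: the same symbols, re-homed under the legacy package
    from colossalai.legacy.nn import CheckpointModule, Linear1D_Col
    from colossalai.legacy.communication import all_gather, all_reduce, reduce_scatter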

View File

@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
from .registry import non_distributed_component_funcs
from .utils.dummy_data_generator import DummyDataGenerator

View File

@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
from .registry import non_distributed_component_funcs
from .utils.dummy_data_generator import DummyDataGenerator

View File

@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
from .registry import non_distributed_component_funcs
from .utils import DummyDataGenerator

View File

@@ -3,7 +3,7 @@
import torch
import torch.nn as nn
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
from .registry import non_distributed_component_funcs
from .utils.dummy_data_generator import DummyDataGenerator

View File

@@ -1,7 +1,7 @@
import torch
import torch.nn as nn
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
from colossalai.utils.cuda import get_current_device
from .registry import non_distributed_component_funcs

View File

@@ -1,10 +1,10 @@
import pytest
import torch
-from colossalai.communication.p2p_v2 import _recv_object, _send_object
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
+from colossalai.legacy.communication.p2p_v2 import _recv_object, _send_object
from colossalai.logging import disable_existing_loggers
from colossalai.testing import rerun_if_address_is_in_use, spawn

View File

@@ -2,10 +2,10 @@ import pytest
import torch
import torch.distributed as dist
-from colossalai.communication import all_gather, all_reduce, reduce_scatter
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
+from colossalai.legacy.communication import all_gather, all_reduce, reduce_scatter
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device

View File

@@ -1,7 +1,10 @@
import pytest
import torch
-from colossalai.communication.p2p import (
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.initialize import launch
+from colossalai.legacy.communication.p2p import (
recv_backward,
recv_forward,
send_backward,
@@ -9,9 +12,6 @@ from colossalai.communication.p2p import (
send_forward,
send_forward_recv_backward,
)
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.initialize import launch
from colossalai.testing import rerun_if_address_is_in_use, spawn
CONFIG = dict(parallel=dict(pipeline=2))

View File

@@ -1,10 +1,10 @@
import pytest
import torch
-from colossalai.communication.p2p_v2 import recv_backward, recv_forward, send_backward, send_forward
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
+from colossalai.legacy.communication.p2p_v2 import recv_backward, recv_forward, send_backward, send_forward
from colossalai.logging import disable_existing_loggers
from colossalai.testing import rerun_if_address_is_in_use, spawn

View File

@@ -5,7 +5,7 @@ from torch.nn import Parameter
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.nn import (
+from colossalai.legacy.nn import (
Classifier1D,
Embedding1D,
Linear1D_Col,

View File

@@ -1,15 +1,16 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch
DEPTH = 4
BATCH_SIZE = 8
SEQ_LENGTH = 8
IMG_SIZE = 16
HIDDEN_SIZE = 8
NUM_CLASSES = 8
VOCAB_SIZE = 16
def check_equal(A, B):
assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True

View File

@@ -1,12 +1,23 @@
import torch
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
-from colossalai.nn import (Classifier2D, CrossEntropyLoss2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D,
-                           VanillaClassifier, VanillaPatchEmbedding, VocabParallelClassifier2D,
-                           VocabParallelCrossEntropyLoss2D, VocabParallelEmbedding2D)
+from colossalai.legacy.nn import (
+    Classifier2D,
+    CrossEntropyLoss2D,
+    Embedding2D,
+    LayerNorm2D,
+    Linear2D,
+    PatchEmbedding2D,
+    VanillaClassifier,
+    VanillaPatchEmbedding,
+    VocabParallelClassifier2D,
+    VocabParallelCrossEntropyLoss2D,
+    VocabParallelEmbedding2D,
+)
from colossalai.utils import get_current_device, print_rank_0
-from .common import (BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal)
+from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal
def check_linear():
@@ -336,7 +347,7 @@ def check_classifier_no_given_weight():
layer.weight.data.copy_(W)
# W.requires_grad = True
-B_shape = (OUTPUT_SIZE, )
+B_shape = (OUTPUT_SIZE,)
B_master = torch.randint(5, B_shape, dtype=dtype, device=device)
torch.distributed.broadcast(B_master, src=0)
# B = torch.chunk(B_master, DEPTH, dim=0)[j]
@@ -572,7 +583,7 @@ def check_loss():
out_shape = (BATCH_SIZE, NUM_CLASSES)
out_master = torch.randn(out_shape, dtype=dtype, device=device)
-target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
torch.distributed.broadcast(out_master, src=0)
torch.distributed.broadcast(target_master, src=0)
out = torch.chunk(out_master, DEPTH, dim=0)[i]
@@ -607,7 +618,7 @@ def check_vocab_parallel_loss():
out_shape = (BATCH_SIZE, NUM_CLASSES)
out_master = torch.randn(out_shape, dtype=dtype, device=device)
-target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
torch.distributed.broadcast(out_master, src=0)
torch.distributed.broadcast(target_master, src=0)
out = torch.chunk(out_master, DEPTH, dim=0)[i]

View File

@@ -5,10 +5,10 @@ import torch
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
-from colossalai.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D
-from colossalai.utils import get_current_device
-from colossalai.utils import print_rank_0
-from .common import check_equal, BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, DEPTH
+from colossalai.legacy.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D
+from colossalai.utils import get_current_device, print_rank_0
+from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, SEQ_LENGTH, check_equal
def check_AB():

View File

@@ -1,11 +1,22 @@
import torch
+from torch.nn import Parameter
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
-from colossalai.nn import (Classifier2p5D, CrossEntropyLoss2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D,
-                           PatchEmbedding2p5D, VanillaClassifier, VanillaPatchEmbedding, VocabParallelClassifier2p5D,
-                           VocabParallelCrossEntropyLoss2p5D, VocabParallelEmbedding2p5D)
+from colossalai.legacy.nn import (
+    Classifier2p5D,
+    CrossEntropyLoss2p5D,
+    Embedding2p5D,
+    LayerNorm2p5D,
+    Linear2p5D,
+    PatchEmbedding2p5D,
+    VanillaClassifier,
+    VanillaPatchEmbedding,
+    VocabParallelClassifier2p5D,
+    VocabParallelCrossEntropyLoss2p5D,
+    VocabParallelEmbedding2p5D,
+)
from colossalai.utils import get_current_device, print_rank_0
-from torch.nn import Parameter
from .common import *
@@ -342,7 +353,7 @@ def check_classifier_no_given_weight():
layer.weight.data.copy_(W)
# W.requires_grad = True
-B_shape = (OUTPUT_SIZE, )
+B_shape = (OUTPUT_SIZE,)
B_master = torch.randint(5, B_shape, dtype=dtype, device=device)
torch.distributed.broadcast(B_master, src=0)
# B = torch.chunk(B_master, TESSERACT_DIM, dim=0)[j]
@@ -577,7 +588,7 @@ def check_loss():
out_shape = (BATCH_SIZE, NUM_CLASSES)
out_master = torch.randn(out_shape, dtype=dtype, device=device)
-target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
torch.distributed.broadcast(out_master, src=0)
torch.distributed.broadcast(target_master, src=0)
out = torch.chunk(out_master, TESSERACT_DIM, dim=0)[i]
@@ -612,7 +623,7 @@ def check_vocab_parallel_loss():
out_shape = (BATCH_SIZE, NUM_CLASSES)
out_master = torch.randn(out_shape, dtype=dtype, device=device)
-target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
torch.distributed.broadcast(out_master, src=0)
torch.distributed.broadcast(target_master, src=0)
out = torch.chunk(out_master, TESSERACT_DIM, dim=0)[i]

View File

@@ -2,10 +2,9 @@ import torch
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
-from colossalai.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, \
-    Matmul_ATB_2p5D
-from colossalai.utils import get_current_device
-from colossalai.utils import print_rank_0
+from colossalai.legacy.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, Matmul_ATB_2p5D
+from colossalai.utils import get_current_device, print_rank_0
from .common import *

View File

@@ -11,4 +11,4 @@ IMG_SIZE = 16
def check_equal(A, B):
-    assert torch.allclose(A, B, rtol=1e-5, atol=1e-2)
+    assert torch.allclose(A, B, rtol=1e-5, atol=1e-2)

View File

@@ -7,8 +7,7 @@ import torch
from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.core import global_context
-from colossalai.logging import get_dist_logger
-from colossalai.nn import (
+from colossalai.legacy.nn import (
Classifier3D,
CrossEntropyLoss3D,
Embedding3D,
@@ -21,7 +20,8 @@ from colossalai.nn import (
VocabParallelCrossEntropyLoss3D,
VocabParallelEmbedding3D,
)
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+from colossalai.logging import get_dist_logger
from colossalai.utils import get_current_device, print_rank_0
from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal

View File

@@ -16,4 +16,4 @@ VOCAB_SIZE = 16
def check_equal(A, B):
eq = torch.allclose(A, B, rtol=1e-3, atol=1e-2)
assert eq, f"\nA = {A}\nB = {B}"
-    return eq
+    return eq

View File

@@ -6,7 +6,7 @@ import pytest
import torch
import colossalai
-from colossalai.nn.parallel.layers import (
+from colossalai.legacy.nn.parallel.layers import (
CachedEmbeddingBag,
CachedParamMgr,
EvictionStrategy,

View File

@@ -2,7 +2,7 @@ import torch
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
-from colossalai.nn import TransformerSelfAttentionRing
+from colossalai.legacy.nn import TransformerSelfAttentionRing
from colossalai.utils import get_current_device

View File

@@ -5,6 +5,7 @@ import torch.distributed as dist
import colossalai
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_sequence import RingAV, RingQK
from colossalai.testing import rerun_if_address_is_in_use, spawn
CONFIG = dict(parallel=dict(tensor=dict(size=4, mode='sequence')))
@@ -42,7 +43,7 @@ def check_ring_qk(rank, world_size):
a = torch.matmul(q, k.transpose(2, 1))
# compute distributed attention scores
-ring_qk = colossalai.nn.layer.parallel_sequence.RingQK.apply
+ring_qk = RingQK.apply
sub_a = ring_qk(sub_q, sub_k, batch_size, num_heads, sub_seq_length)
# check master and distributed attention scores
@@ -95,7 +96,7 @@ def check_ring_av(rank, world_size):
out = torch.matmul(a, v)
# compute distributed attention scores
-ring_av = colossalai.nn.layer.parallel_sequence.RingAV.apply
+ring_av = RingAV.apply
sub_out = ring_av(sub_a, sub_v, batch_size, num_heads, attention_head_size, sub_seq_length)
# print(f'master output shape: {out.shape}, partial output shape: {sub_out.shape}')

View File

@@ -5,7 +5,10 @@ import pytest
import torch
import torch.distributed as dist
-from colossalai.communication import (
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.initialize import launch
+from colossalai.legacy.communication import (
recv_backward,
recv_forward,
recv_obj_meta,
@@ -15,9 +18,6 @@ from colossalai.communication import (
send_forward_recv_backward,
send_obj_meta,
)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.initialize import launch
from colossalai.logging import get_dist_logger
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device

View File

@@ -1,81 +0,0 @@
import os
import time

import pytest
import torch
import torch.nn as nn
from rpc_test_utils import parse_args, rpc_run
from titans.dataloader.cifar10 import build_cifar
from torchvision.models import resnet50
from tqdm import tqdm

from colossalai.pipeline.pipelinable import PipelinableContext
from colossalai.pipeline.rpc import OneFOneBPipelineEngine


def flatten(x):
    return torch.flatten(x, 1)


def partition(pp_rank: int, chunk: int, stage_num: int):
    pipelinable = PipelinableContext()

    # build model partitions
    with pipelinable:
        # input : [B, 3, 32, 32]
        _ = resnet50()

    pipelinable.policy = "customized"
    exec_seq = [
        'conv1', 'bn1', 'relu', 'maxpool', 'layer1', 'layer2', 'layer3', 'layer4', 'avgpool', (flatten, "behind"), 'fc'
    ]
    pipelinable.to_layer_list(exec_seq)
    partition = pipelinable.partition(chunk, stage_num, pp_rank)
    return partition


def run_master(args):
    batch_size = args.batch_size
    chunk = args.chunk
    device = args.device
    world_size = args.world_size
    stage_num = world_size
    num_microbatches = args.num_microbatches

    # build dataloader
    root = os.environ.get('DATA', './data')
    train_dataloader, test_dataloader = build_cifar(batch_size, root, padding=4, crop=32, resize=32)
    criterion = nn.CrossEntropyLoss()

    pp_engine = OneFOneBPipelineEngine(partition_fn=partition,
                                       stage_num=stage_num,
                                       num_microbatches=num_microbatches,
                                       device=device,
                                       chunk=chunk,
                                       criterion=criterion,
                                       checkpoint=False)

    pp_engine.initialize_optimizer(torch.optim.Adam, lr=1e-3)
    s = time.time()
    for bx, by in tqdm(train_dataloader):
        pp_engine.forward_backward(bx, labels=by, forward_only=False)
    cost_time = time.time() - s

    print("total cost time :", cost_time)
    print("cost time per batch:", cost_time / len(train_dataloader))


@pytest.mark.skip("Test for performance, no need for CI")
def main():
    args = parse_args()
    # this is due to limitation of partition function
    args.world_size = 2
    args.chunk = 1
    rpc_run(args, run_master)


if __name__ == '__main__':
    main()

View File

@@ -7,7 +7,7 @@ import pytest
import torch
import torch.nn as nn
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch

View File

@@ -7,7 +7,7 @@ import pytest
import torch
import torch.nn as nn
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch

View File

@@ -7,7 +7,7 @@ import pytest
import torch
import torch.nn as nn
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch

View File

@@ -7,7 +7,7 @@ import pytest
import torch
import torch.nn as nn
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch