[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent of gpc

* [legacy] make optim independent of registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
Hongxin Liu authored on 2023-09-11 16:24:28 +08:00, committed by GitHub
parent 536397cc95
commit 554aa9592e
170 changed files with 781 additions and 758 deletions
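Most of the 170 changed files follow the pattern visible in the hunks below: symbols that used to live under colossalai.nn (and colossalai.communication) now live under colossalai.legacy, and every import site is updated. As a rough illustration of how code outside this repository could cope with both layouts, assuming only that the class names themselves are unchanged (the try/except fallback is a hypothetical shim, not part of this commit):

# Hypothetical compatibility shim, not part of this commit: prefer the new
# colossalai.legacy.* paths introduced here, fall back to the old
# colossalai.nn.* paths on releases that predate the move.
try:
    from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row
    from colossalai.legacy.nn.layer.utils import divide
except ImportError:
    from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
    from colossalai.nn.layer.utils import divide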

View File

@@ -8,11 +8,11 @@ from torch.nn.parameter import Parameter
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
+from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
+from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row
+from colossalai.legacy.nn.layer.utils import divide
 from colossalai.legacy.registry import LAYERS, LOSSES, MODELS
-from colossalai.nn.layer.base_layer import ParallelLayer
-from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
-from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
-from colossalai.nn.layer.utils import divide
 from colossalai.utils import get_current_device
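Only the module path of these 1D tensor-parallel helpers changes; their behaviour does not. For readers unfamiliar with the names, a rough sketch of what gather_forward_split_backward means, written against plain torch.distributed rather than the library's own implementation (assumes an already-initialized default process group):

# Illustrative semantics only, not the library code: gather shards from all
# tensor-parallel ranks in the forward pass, hand each rank back only its own
# slice of the gradient in the backward pass.
import torch
import torch.distributed as dist


class _GatherForwardSplitBackward(torch.autograd.Function):

    @staticmethod
    def forward(ctx, x, dim):
        ctx.dim = dim
        world_size = dist.get_world_size()
        shards = [torch.empty_like(x) for _ in range(world_size)]
        dist.all_gather(shards, x.contiguous())
        return torch.cat(shards, dim=dim)

    @staticmethod
    def backward(ctx, grad_output):
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        # Each rank keeps only the gradient slice matching its own shard.
        grad_shard = grad_output.chunk(world_size, dim=ctx.dim)[rank].contiguous()
        return grad_shard, None


def gather_forward_split_backward(x, dim=-1):
    return _GatherForwardSplitBackward.apply(x, dim)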

View File

@@ -11,9 +11,9 @@ from colossalai import kernel
 from colossalai import nn as col_nn
 from colossalai.core import global_context as gpc
 from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
-from colossalai.nn.layer import Linear1D_Col, Linear1D_Row
-from colossalai.nn.layer.base_layer import ParallelLayer
-from colossalai.nn.layer.utils import ACT2FN, divide
+from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
+from colossalai.legacy.nn.layer.utils import ACT2FN, divide
 from colossalai.utils import checkpoint
 from colossalai.utils.activation_checkpoint import checkpoint

View File

@@ -9,8 +9,8 @@ from colossalai import kernel
 from colossalai import nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.logging import get_dist_logger
-from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.pipeline.utils import partition_uniform
 from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D

View File

@@ -1,5 +1,7 @@
 #!/bin/bash
 set -euxo pipefail
 
-pip install -r requirements.txt
-colossalai run --nproc_per_node 4 train.py --config config.py
+echo "legacy example"
+# pip install -r requirements.txt
+# colossalai run --nproc_per_node 4 train.py --config config.py

View File

@@ -7,8 +7,8 @@ from tqdm import tqdm
 import colossalai
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn import CrossEntropyLoss
 from colossalai.logging import get_dist_logger
-from colossalai.nn import CrossEntropyLoss
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.pipeline.pipelinable import PipelinableContext
 from colossalai.utils import is_using_pp

View File

@@ -1,33 +1,37 @@
-from colossalai.context.parallel_mode import ParallelMode
+import inspect
 import torch
 import torch.nn as nn
-import inspect
-from .layers import Embedding, BertLayer, BertDualHead, PreProcessor, VocabEmbedding
-from .layers.init_method import init_normal, output_init_normal
-from colossalai.core import global_context as gpc
-from colossalai.context import ParallelMode
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
 from colossalai.kernel import LayerNorm
-from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
+from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.logging import get_dist_logger
 from colossalai.pipeline.utils import partition_uniform
+from .layers import BertDualHead, BertLayer, Embedding, PreProcessor, VocabEmbedding
+from .layers.init_method import init_normal, output_init_normal
 class BertForPretrain(nn.Module):
-    def __init__(self,
-                 vocab_size,
-                 hidden_size,
-                 max_sequence_length,
-                 num_attention_heads,
-                 num_layers,
-                 add_binary_head,
-                 is_naive_fp16,
-                 num_tokentypes=2,
-                 dropout_prob=0.1,
-                 mlp_ratio=4,
-                 init_std=0.02,
-                 convert_fp16_to_fp32_in_softmax=False,
-                 ):
+    def __init__(
+        self,
+        vocab_size,
+        hidden_size,
+        max_sequence_length,
+        num_attention_heads,
+        num_layers,
+        add_binary_head,
+        is_naive_fp16,
+        num_tokentypes=2,
+        dropout_prob=0.1,
+        mlp_ratio=4,
+        init_std=0.02,
+        convert_fp16_to_fp32_in_softmax=False,
+    ):
         super().__init__()
         self.seq_parallel_size = gpc.get_world_size(ParallelMode.SEQUENCE)
         assert max_sequence_length % self.seq_parallel_size == 0, 'sequence length is not divisible by the sequence parallel size'
@@ -47,19 +51,19 @@ class BertForPretrain(nn.Module):
         self.bert_layers = nn.ModuleList()
         for i in range(num_layers):
-            bert_layer = BertLayer(layer_number=i+1,
+            bert_layer = BertLayer(layer_number=i + 1,
                                    hidden_size=hidden_size,
                                    num_attention_heads=num_attention_heads,
                                    attention_dropout=dropout_prob,
                                    mlp_ratio=mlp_ratio,
                                    hidden_dropout=dropout_prob,
                                    convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-                                   is_naive_fp16=is_naive_fp16
-                                   )
+                                   is_naive_fp16=is_naive_fp16)
             self.bert_layers.append(bert_layer)
         self.layer_norm = LayerNorm(hidden_size)
-        self.head = BertDualHead(hidden_size, self.embedding.word_embedding_weight.size(0),
+        self.head = BertDualHead(hidden_size,
+                                 self.embedding.word_embedding_weight.size(0),
                                  add_binary_head=add_binary_head)
         self.reset_parameters()
@@ -166,22 +170,20 @@ class PipelineBertForPretrain(nn.Module):
         end_idx = num_layers
         for i in range(start_idx, end_idx):
-            bert_layer = BertLayer(layer_number=i+1,
+            bert_layer = BertLayer(layer_number=i + 1,
                                    hidden_size=hidden_size,
                                    num_attention_heads=num_attention_heads,
                                    attention_dropout=dropout_prob,
                                    mlp_ratio=mlp_ratio,
                                    hidden_dropout=dropout_prob,
                                    convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-                                   is_naive_fp16=is_naive_fp16
-                                   )
+                                   is_naive_fp16=is_naive_fp16)
             self.bert_layers.append(bert_layer)
         if self.last_stage:
             self.word_embeddings = VocabEmbedding(vocab_size, hidden_size)
             self.layer_norm = LayerNorm(hidden_size)
-            self.head = BertDualHead(hidden_size, vocab_size,
-                                     add_binary_head=add_binary_head)
+            self.head = BertDualHead(hidden_size, vocab_size, add_binary_head=add_binary_head)
         self.reset_parameters()
 
     def _init_normal(self, tensor):
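The start_idx/end_idx bookkeeping above is what gives each pipeline stage its own contiguous slice of the BertLayer stack; the repository's own helper for this is colossalai.pipeline.utils.partition_uniform, imported at the top of the file. A minimal stand-alone sketch of the same uniform split (illustrative only, not the library implementation):

# Illustrative uniform layer partitioning: each pipeline stage builds only the
# BertLayer blocks in [start_idx, end_idx).
def layer_range_for_stage(num_layers, pipeline_size, stage):
    base, remainder = divmod(num_layers, pipeline_size)
    # Early stages absorb the remainder so no two stages differ by more than one layer.
    start_idx = stage * base + min(stage, remainder)
    end_idx = start_idx + base + (1 if stage < remainder else 0)
    return start_idx, end_idx


if __name__ == "__main__":
    # 10 layers over 4 stages -> (0, 3), (3, 6), (6, 8), (8, 10)
    for stage in range(4):
        print(stage, layer_range_for_stage(10, 4, stage))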

View File

@@ -1,10 +1,12 @@
 import torch
 import torch.nn as nn
-from colossalai.nn.layer.parallel_sequence import TransformerSelfAttentionRing
-from colossalai.kernel.jit import bias_dropout_add_fused_train, bias_dropout_add_fused_inference
 from colossalai.kernel.cuda_native import LayerNorm
-from .mlp import TransformerMLP
+from colossalai.kernel.jit import bias_dropout_add_fused_inference, bias_dropout_add_fused_train
+from colossalai.legacy.nn.layer.parallel_sequence import TransformerSelfAttentionRing
 from .dropout import get_bias_dropout_add
+from .mlp import TransformerMLP
 
 
 def attention_mask_func(attention_scores, attention_mask):
@@ -48,8 +50,7 @@ class BertLayer(nn.Module):
             layer_number=layer_number,
             apply_query_key_layer_scaling=True,
             convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-            fp16=is_naive_fp16
-            )
+            fp16=is_naive_fp16)
         self.hidden_dropout = hidden_dropout
         self.bias_dropout_fusion = bias_dropout_fusion
@@ -89,11 +90,8 @@ class BertLayer(nn.Module):
         # re-enable torch grad to enable fused optimization.
         with torch.enable_grad():
-            layernorm_input = bias_dropout_add_func(
-                attention_output,
-                attention_bias.expand_as(residual),
-                residual,
-                self.hidden_dropout)
+            layernorm_input = bias_dropout_add_func(attention_output, attention_bias.expand_as(residual), residual,
+                                                    self.hidden_dropout)
         # Layer norm post the self attention.
         layernorm_output = self.post_attention_layernorm(layernorm_input)
@@ -109,10 +107,6 @@ class BertLayer(nn.Module):
         # re-enable torch grad to enable fused optimization.
         with torch.enable_grad():
-            output = bias_dropout_add_func(
-                mlp_output,
-                mlp_bias.expand_as(residual),
-                residual,
-                self.hidden_dropout)
+            output = bias_dropout_add_func(mlp_output, mlp_bias.expand_as(residual), residual, self.hidden_dropout)
 
         return output
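The last two hunks only reflow the calls to bias_dropout_add_func; the function itself is unchanged by this commit. For context, a rough unfused reference of what a Megatron-style bias-dropout-add computes (a sketch of the usual semantics, not code taken from this diff):

import torch
import torch.nn.functional as F


def bias_dropout_add(x, bias, residual, prob, training):
    # Unfused reference: add the bias, apply dropout, then add the residual.
    # bias_dropout_add_fused_train / bias_dropout_add_fused_inference (imported
    # above from colossalai.kernel.jit) fuse these element-wise ops into a single
    # jit-compiled kernel; the torch.enable_grad() wrapper in the hunks above
    # re-enables autograd so that fused path can be used, per the original comment.
    out = F.dropout(x + bias, p=prob, training=training)
    return residual + out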