Mirror of https://github.com/hpcaitech/ColossalAI.git
[legacy] move communication and nn to legacy and refactor logger (#4671)
* [legacy] move communication to legacy (#4640)
* [legacy] refactor logger and clean up legacy codes (#4654)
* [legacy] make logger independent of gpc
* [legacy] make optim independent of registry
* [legacy] move test engine to legacy
* [legacy] move nn to legacy (#4656)
* [legacy] move nn to legacy
* [checkpointio] fix save hf config
* [test] remove useless rpc pp test
* [legacy] fix nn init
* [example] skip tutorial hybrid parallel example
* [devops] test doc check
* [devops] test doc check
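For code that still imports these modules from their old locations, the change below is purely a path move: `colossalai.nn`, `colossalai.nn.layer`, and related packages now live under `colossalai.legacy.nn`. A minimal sketch of how a caller could stay compatible with both layouts (the try/except fallback is illustrative only and not part of this commit):

# Prefer the legacy namespace introduced by this refactor; fall back to the
# pre-refactor paths on older ColossalAI releases. Illustrative sketch only.
try:
    from colossalai.legacy.nn import CrossEntropyLoss
    from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row
except ImportError:
    from colossalai.nn import CrossEntropyLoss
    from colossalai.nn.layer import Linear1D_Col, Linear1D_Row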
@@ -8,11 +8,11 @@ from torch.nn.parameter import Parameter
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
+from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
+from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row
+from colossalai.legacy.nn.layer.utils import divide
 from colossalai.legacy.registry import LAYERS, LOSSES, MODELS
-from colossalai.nn.layer.base_layer import ParallelLayer
-from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
-from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
-from colossalai.nn.layer.utils import divide
 from colossalai.utils import get_current_device
@@ -11,9 +11,9 @@ from colossalai import kernel
 from colossalai import nn as col_nn
 from colossalai.core import global_context as gpc
 from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
-from colossalai.nn.layer import Linear1D_Col, Linear1D_Row
-from colossalai.nn.layer.base_layer import ParallelLayer
-from colossalai.nn.layer.utils import ACT2FN, divide
-from colossalai.utils import checkpoint
+from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
+from colossalai.legacy.nn.layer.utils import ACT2FN, divide
+from colossalai.utils.activation_checkpoint import checkpoint
@@ -9,8 +9,8 @@ from colossalai import kernel
 from colossalai import nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.logging import get_dist_logger
-from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.pipeline.utils import partition_uniform

 from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D
@@ -1,5 +1,7 @@
 #!/bin/bash
 set -euxo pipefail

-pip install -r requirements.txt
-colossalai run --nproc_per_node 4 train.py --config config.py
+echo "legacy example"
+
+# pip install -r requirements.txt
+# colossalai run --nproc_per_node 4 train.py --config config.py
@@ -7,8 +7,8 @@ from tqdm import tqdm
 import colossalai
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn import CrossEntropyLoss
 from colossalai.logging import get_dist_logger
-from colossalai.nn import CrossEntropyLoss
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.pipeline.pipelinable import PipelinableContext
 from colossalai.utils import is_using_pp
@@ -1,33 +1,37 @@
-from colossalai.context.parallel_mode import ParallelMode
 import inspect
+
 import torch
 import torch.nn as nn
-import inspect
-from .layers import Embedding, BertLayer, BertDualHead, PreProcessor, VocabEmbedding
-from .layers.init_method import init_normal, output_init_normal
-from colossalai.core import global_context as gpc
+
+from colossalai.context import ParallelMode
 from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
 from colossalai.kernel import LayerNorm
-from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
+from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.logging import get_dist_logger
 from colossalai.pipeline.utils import partition_uniform
+
+from .layers import BertDualHead, BertLayer, Embedding, PreProcessor, VocabEmbedding
+from .layers.init_method import init_normal, output_init_normal


 class BertForPretrain(nn.Module):

-    def __init__(self,
-                 vocab_size,
-                 hidden_size,
-                 max_sequence_length,
-                 num_attention_heads,
-                 num_layers,
-                 add_binary_head,
-                 is_naive_fp16,
-                 num_tokentypes=2,
-                 dropout_prob=0.1,
-                 mlp_ratio=4,
-                 init_std=0.02,
-                 convert_fp16_to_fp32_in_softmax=False,
-                 ):
+    def __init__(
+        self,
+        vocab_size,
+        hidden_size,
+        max_sequence_length,
+        num_attention_heads,
+        num_layers,
+        add_binary_head,
+        is_naive_fp16,
+        num_tokentypes=2,
+        dropout_prob=0.1,
+        mlp_ratio=4,
+        init_std=0.02,
+        convert_fp16_to_fp32_in_softmax=False,
+    ):
         super().__init__()
         self.seq_parallel_size = gpc.get_world_size(ParallelMode.SEQUENCE)
         assert max_sequence_length % self.seq_parallel_size == 0, 'sequence length is not divisible by the sequence parallel size'
@@ -47,19 +51,19 @@ class BertForPretrain(nn.Module):
         self.bert_layers = nn.ModuleList()

         for i in range(num_layers):
-            bert_layer = BertLayer(layer_number=i+1,
+            bert_layer = BertLayer(layer_number=i + 1,
                                    hidden_size=hidden_size,
                                    num_attention_heads=num_attention_heads,
                                    attention_dropout=dropout_prob,
                                    mlp_ratio=mlp_ratio,
                                    hidden_dropout=dropout_prob,
                                    convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-                                   is_naive_fp16=is_naive_fp16
-                                   )
+                                   is_naive_fp16=is_naive_fp16)
             self.bert_layers.append(bert_layer)

         self.layer_norm = LayerNorm(hidden_size)
-        self.head = BertDualHead(hidden_size, self.embedding.word_embedding_weight.size(0),
+        self.head = BertDualHead(hidden_size,
+                                 self.embedding.word_embedding_weight.size(0),
                                  add_binary_head=add_binary_head)
         self.reset_parameters()
@@ -166,22 +170,20 @@ class PipelineBertForPretrain(nn.Module):
             end_idx = num_layers

         for i in range(start_idx, end_idx):
-            bert_layer = BertLayer(layer_number=i+1,
+            bert_layer = BertLayer(layer_number=i + 1,
                                    hidden_size=hidden_size,
                                    num_attention_heads=num_attention_heads,
                                    attention_dropout=dropout_prob,
                                    mlp_ratio=mlp_ratio,
                                    hidden_dropout=dropout_prob,
                                    convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-                                   is_naive_fp16=is_naive_fp16
-                                   )
+                                   is_naive_fp16=is_naive_fp16)
             self.bert_layers.append(bert_layer)

         if self.last_stage:
             self.word_embeddings = VocabEmbedding(vocab_size, hidden_size)
             self.layer_norm = LayerNorm(hidden_size)
-            self.head = BertDualHead(hidden_size, vocab_size,
-                                     add_binary_head=add_binary_head)
+            self.head = BertDualHead(hidden_size, vocab_size, add_binary_head=add_binary_head)
         self.reset_parameters()

     def _init_normal(self, tensor):
@@ -1,10 +1,12 @@
 import torch
 import torch.nn as nn
-from colossalai.nn.layer.parallel_sequence import TransformerSelfAttentionRing
-from colossalai.kernel.jit import bias_dropout_add_fused_train, bias_dropout_add_fused_inference

 from colossalai.kernel.cuda_native import LayerNorm
-from .mlp import TransformerMLP
+from colossalai.kernel.jit import bias_dropout_add_fused_inference, bias_dropout_add_fused_train
+from colossalai.legacy.nn.layer.parallel_sequence import TransformerSelfAttentionRing

 from .dropout import get_bias_dropout_add
+from .mlp import TransformerMLP


 def attention_mask_func(attention_scores, attention_mask):
@@ -48,8 +50,7 @@ class BertLayer(nn.Module):
                                                          layer_number=layer_number,
                                                          apply_query_key_layer_scaling=True,
                                                          convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-                                                         fp16=is_naive_fp16
-                                                         )
+                                                         fp16=is_naive_fp16)

         self.hidden_dropout = hidden_dropout
         self.bias_dropout_fusion = bias_dropout_fusion
@@ -89,11 +90,8 @@ class BertLayer(nn.Module):

         # re-enable torch grad to enable fused optimization.
         with torch.enable_grad():
-            layernorm_input = bias_dropout_add_func(
-                attention_output,
-                attention_bias.expand_as(residual),
-                residual,
-                self.hidden_dropout)
+            layernorm_input = bias_dropout_add_func(attention_output, attention_bias.expand_as(residual), residual,
+                                                    self.hidden_dropout)

         # Layer norm post the self attention.
         layernorm_output = self.post_attention_layernorm(layernorm_input)
@@ -109,10 +107,6 @@ class BertLayer(nn.Module):

         # re-enable torch grad to enable fused optimization.
         with torch.enable_grad():
-            output = bias_dropout_add_func(
-                mlp_output,
-                mlp_bias.expand_as(residual),
-                residual,
-                self.hidden_dropout)
+            output = bias_dropout_add_func(mlp_output, mlp_bias.expand_as(residual), residual, self.hidden_dropout)

         return output
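The `bias_dropout_add_func` calls compacted in the last two hunks follow the usual Megatron-style bias-dropout-add around each residual connection. A minimal, unfused reference of what that helper computes, assuming the standard semantics of the fused JIT kernels imported above (a sketch for orientation, not the jitted implementation shipped in `colossalai.kernel.jit`):

import torch
import torch.nn.functional as F


def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, residual: torch.Tensor,
                     prob: float, training: bool) -> torch.Tensor:
    # dropout(x + bias), then add the residual stream; the fused variants
    # (bias_dropout_add_fused_train / bias_dropout_add_fused_inference)
    # are assumed to compute the same thing in a single JIT-compiled kernel.
    out = F.dropout(x + bias, p=prob, training=training)
    return residual + out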