Mirror of https://github.com/hpcaitech/ColossalAI.git
[legacy] move communication and nn to legacy and refactor logger (#4671)
* [legacy] move communication to legacy (#4640)
* [legacy] refactor logger and clean up legacy codes (#4654)
* [legacy] make logger independent of gpc
* [legacy] make optim independent of registry
* [legacy] move test engine to legacy
* [legacy] move nn to legacy (#4656)
* [legacy] move nn to legacy
* [checkpointio] fix save hf config
* [test] remove useless rpc pp test
* [legacy] fix nn init
* [example] skip tutorial hybrid parallel example
* [devops] test doc check
* [devops] test doc check
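For code that still imports these modules from their old locations, the change below is purely a path move: `colossalai.nn`, `colossalai.nn.layer`, and related packages now live under `colossalai.legacy.nn`. A minimal sketch of how a caller could stay compatible with both layouts (the try/except fallback is illustrative only and not part of this commit):

# Prefer the legacy namespace introduced by this refactor; fall back to the
# pre-refactor paths on older ColossalAI releases. Illustrative sketch only.
try:
    from colossalai.legacy.nn import CrossEntropyLoss
    from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row
except ImportError:
    from colossalai.nn import CrossEntropyLoss
    from colossalai.nn.layer import Linear1D_Col, Linear1D_Row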
@@ -8,11 +8,11 @@ from torch.nn.parameter import Parameter
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
+from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
+from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row
+from colossalai.legacy.nn.layer.utils import divide
 from colossalai.legacy.registry import LAYERS, LOSSES, MODELS
-from colossalai.nn.layer.base_layer import ParallelLayer
-from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
-from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
-from colossalai.nn.layer.utils import divide
 from colossalai.utils import get_current_device
@@ -11,9 +11,9 @@ from colossalai import kernel
 from colossalai import nn as col_nn
 from colossalai.core import global_context as gpc
 from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
-from colossalai.nn.layer import Linear1D_Col, Linear1D_Row
-from colossalai.nn.layer.base_layer import ParallelLayer
-from colossalai.nn.layer.utils import ACT2FN, divide
-from colossalai.utils import checkpoint
+from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
+from colossalai.legacy.nn.layer.utils import ACT2FN, divide
+from colossalai.utils.activation_checkpoint import checkpoint
@@ -9,8 +9,8 @@ from colossalai import kernel
 from colossalai import nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.logging import get_dist_logger
-from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.pipeline.utils import partition_uniform

 from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D
@@ -1,5 +1,7 @@
 #!/bin/bash
 set -euxo pipefail

-pip install -r requirements.txt
-colossalai run --nproc_per_node 4 train.py --config config.py
+echo "legacy example"
+
+# pip install -r requirements.txt
+# colossalai run --nproc_per_node 4 train.py --config config.py
@@ -7,8 +7,8 @@ from tqdm import tqdm
 import colossalai
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn import CrossEntropyLoss
 from colossalai.logging import get_dist_logger
-from colossalai.nn import CrossEntropyLoss
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.pipeline.pipelinable import PipelinableContext
 from colossalai.utils import is_using_pp
@@ -1,33 +1,37 @@
-from colossalai.context.parallel_mode import ParallelMode
 import inspect
+
 import torch
 import torch.nn as nn
-import inspect
-from .layers import Embedding, BertLayer, BertDualHead, PreProcessor, VocabEmbedding
-from .layers.init_method import init_normal, output_init_normal
-from colossalai.core import global_context as gpc
+
+from colossalai.context import ParallelMode
 from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
 from colossalai.kernel import LayerNorm
-from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
+from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.logging import get_dist_logger
 from colossalai.pipeline.utils import partition_uniform
+
+from .layers import BertDualHead, BertLayer, Embedding, PreProcessor, VocabEmbedding
+from .layers.init_method import init_normal, output_init_normal


 class BertForPretrain(nn.Module):

-    def __init__(self,
-                 vocab_size,
-                 hidden_size,
-                 max_sequence_length,
-                 num_attention_heads,
-                 num_layers,
-                 add_binary_head,
-                 is_naive_fp16,
-                 num_tokentypes=2,
-                 dropout_prob=0.1,
-                 mlp_ratio=4,
-                 init_std=0.02,
-                 convert_fp16_to_fp32_in_softmax=False,
-                 ):
+    def __init__(
+        self,
+        vocab_size,
+        hidden_size,
+        max_sequence_length,
+        num_attention_heads,
+        num_layers,
+        add_binary_head,
+        is_naive_fp16,
+        num_tokentypes=2,
+        dropout_prob=0.1,
+        mlp_ratio=4,
+        init_std=0.02,
+        convert_fp16_to_fp32_in_softmax=False,
+    ):
         super().__init__()
         self.seq_parallel_size = gpc.get_world_size(ParallelMode.SEQUENCE)
         assert max_sequence_length % self.seq_parallel_size == 0, 'sequence length is not divisible by the sequence parallel size'
@@ -47,19 +51,19 @@ class BertForPretrain(nn.Module):
         self.bert_layers = nn.ModuleList()

         for i in range(num_layers):
-            bert_layer = BertLayer(layer_number=i+1,
+            bert_layer = BertLayer(layer_number=i + 1,
                                    hidden_size=hidden_size,
                                    num_attention_heads=num_attention_heads,
                                    attention_dropout=dropout_prob,
                                    mlp_ratio=mlp_ratio,
                                    hidden_dropout=dropout_prob,
                                    convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-                                   is_naive_fp16=is_naive_fp16
-                                   )
+                                   is_naive_fp16=is_naive_fp16)
             self.bert_layers.append(bert_layer)

         self.layer_norm = LayerNorm(hidden_size)
-        self.head = BertDualHead(hidden_size, self.embedding.word_embedding_weight.size(0),
+        self.head = BertDualHead(hidden_size,
+                                 self.embedding.word_embedding_weight.size(0),
                                  add_binary_head=add_binary_head)
         self.reset_parameters()
@@ -166,22 +170,20 @@ class PipelineBertForPretrain(nn.Module):
             end_idx = num_layers

         for i in range(start_idx, end_idx):
-            bert_layer = BertLayer(layer_number=i+1,
+            bert_layer = BertLayer(layer_number=i + 1,
                                    hidden_size=hidden_size,
                                    num_attention_heads=num_attention_heads,
                                    attention_dropout=dropout_prob,
                                    mlp_ratio=mlp_ratio,
                                    hidden_dropout=dropout_prob,
                                    convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-                                   is_naive_fp16=is_naive_fp16
-                                   )
+                                   is_naive_fp16=is_naive_fp16)
             self.bert_layers.append(bert_layer)

         if self.last_stage:
             self.word_embeddings = VocabEmbedding(vocab_size, hidden_size)
             self.layer_norm = LayerNorm(hidden_size)
-            self.head = BertDualHead(hidden_size, vocab_size,
-                                     add_binary_head=add_binary_head)
+            self.head = BertDualHead(hidden_size, vocab_size, add_binary_head=add_binary_head)
         self.reset_parameters()

     def _init_normal(self, tensor):
@@ -1,10 +1,12 @@
 import torch
 import torch.nn as nn
-from colossalai.nn.layer.parallel_sequence import TransformerSelfAttentionRing
-from colossalai.kernel.jit import bias_dropout_add_fused_train, bias_dropout_add_fused_inference

 from colossalai.kernel.cuda_native import LayerNorm
-from .mlp import TransformerMLP
+from colossalai.kernel.jit import bias_dropout_add_fused_inference, bias_dropout_add_fused_train
+from colossalai.legacy.nn.layer.parallel_sequence import TransformerSelfAttentionRing

 from .dropout import get_bias_dropout_add
+from .mlp import TransformerMLP


 def attention_mask_func(attention_scores, attention_mask):
@@ -48,8 +50,7 @@ class BertLayer(nn.Module):
                                                          layer_number=layer_number,
                                                          apply_query_key_layer_scaling=True,
                                                          convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-                                                         fp16=is_naive_fp16
-                                                         )
+                                                         fp16=is_naive_fp16)

         self.hidden_dropout = hidden_dropout
         self.bias_dropout_fusion = bias_dropout_fusion
@@ -89,11 +90,8 @@ class BertLayer(nn.Module):

         # re-enable torch grad to enable fused optimization.
         with torch.enable_grad():
-            layernorm_input = bias_dropout_add_func(
-                attention_output,
-                attention_bias.expand_as(residual),
-                residual,
-                self.hidden_dropout)
+            layernorm_input = bias_dropout_add_func(attention_output, attention_bias.expand_as(residual), residual,
+                                                    self.hidden_dropout)

         # Layer norm post the self attention.
         layernorm_output = self.post_attention_layernorm(layernorm_input)
@@ -109,10 +107,6 @@ class BertLayer(nn.Module):

         # re-enable torch grad to enable fused optimization.
         with torch.enable_grad():
-            output = bias_dropout_add_func(
-                mlp_output,
-                mlp_bias.expand_as(residual),
-                residual,
-                self.hidden_dropout)
+            output = bias_dropout_add_func(mlp_output, mlp_bias.expand_as(residual), residual, self.hidden_dropout)

         return output
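The `bias_dropout_add_func` calls compacted in the last two hunks follow the usual Megatron-style bias-dropout-add around each residual connection. A minimal, unfused reference of what that helper computes, assuming the standard semantics of the fused JIT kernels imported above (a sketch for orientation, not the jitted implementation shipped in `colossalai.kernel.jit`):

import torch
import torch.nn.functional as F


def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, residual: torch.Tensor,
                     prob: float, training: bool) -> torch.Tensor:
    # dropout(x + bias), then add the residual stream; the fused variants
    # (bias_dropout_add_fused_train / bias_dropout_add_fused_inference)
    # are assumed to compute the same thing in a single JIT-compiled kernel.
    out = F.dropout(x + bias, p=prob, training=training)
    return residual + out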