[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Repository: https://github.com/hpcaitech/ColossalAI.git (mirror)
Commit: 2a7fa2e7d0
Parent: 1016bb3257
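The hunks below are mechanical auto-fixes of the kind pre-commit hooks apply: import lists are sorted or consolidated, redundant blank lines are dropped, and long call sites are wrapped one argument per line. As a hedged sketch (the repository's actual .pre-commit-config.yaml and hook set are not shown in this commit), the same fixes can be reproduced locally through the pre-commit package's Python entry point:

# A minimal sketch, assuming pre-commit is installed and a .pre-commit-config.yaml
# sits at the repository root; the configured hooks decide which fixes are applied.
from pre_commit.main import main as pre_commit_main

# Equivalent to running `pre-commit run --all-files` on the command line.
exit_code = pre_commit_main(["run", "--all-files"])
print("pre-commit exit code:", exit_code)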
@@ -4,7 +4,7 @@ from .dropout import DropoutForParallelInput, DropoutForReplicatedInput
 from .embedding import Embedding1D, PaddingEmbedding, VocabParallelEmbedding1D
 from .linear import Linear1D_Col, Linear1D_Row, PaddingLMHead, VocabParallelLMHead1D
 from .loss import cross_entropy_1d
-from .normalization import FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm, CohereLayerNorm, FusedCohereLayerNorm
+from .normalization import CohereLayerNorm, FusedCohereLayerNorm, FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm
 from .parallel_module import ParallelModule
 from .qkv_fused_linear import FusedLinear1D_Col, GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
 
@@ -250,7 +250,6 @@ class FusedLayerNorm(BaseLayerNorm):
         return layernorm
 
 
-
 class CohereLayerNorm(BaseLayerNorm):
     r"""
     This is a wrapper around the transformers.models.cohere.CohereLayerNorm. It is meant to be used only with the from_native_module interface.
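The docstring above only names the from_native_module interface, so here is a hedged sketch of how such a wrapper is typically substituted for the native Hugging Face layer; the constructor arguments and the exact call signature are assumptions, not taken from this commit:

# A minimal sketch, assuming CohereLayerNorm.from_native_module follows the same
# pattern as the other shardformer norm wrappers (native module in, wrapped module out).
from transformers.models.cohere.modeling_cohere import CohereLayerNorm as HFCohereLayerNorm

from colossalai.shardformer.layer import CohereLayerNorm

native_norm = HFCohereLayerNorm(hidden_size=4096, eps=1e-5)  # illustrative sizes
wrapped_norm = CohereLayerNorm.from_native_module(native_norm)  # reuses the native weights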
@@ -3,22 +3,12 @@ import warnings
 from typing import List, Optional, Tuple, Union
 
 import torch
-import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn import CrossEntropyLoss
 from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
-)
-from transformers.models.cohere.modeling_cohere import (
-    CohereForCausalLM,
-    CohereModel,
-    StaticCache,
-    repeat_kv,
-)
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.models.cohere.modeling_cohere import CohereForCausalLM, CohereModel, StaticCache, repeat_kv
 from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
@@ -343,10 +333,9 @@ class CommandPipelineForwards:
             hidden_states = outputs.get("hidden_states")
             return {"hidden_states": hidden_states}
 
 
 def get_command_flash_attention_forward(shard_config, sp_mode, sp_group, sp_size):
-    from transformers.models.cohere.modeling_cohere import CohereAttention, apply_rotary_pos_emb
-    from transformers.models.cohere.modeling_cohere import repeat_kv
+    from transformers.models.cohere.modeling_cohere import CohereAttention, apply_rotary_pos_emb, repeat_kv
 
     def forward(
         self: CohereAttention,
@@ -728,7 +717,6 @@ def get_command_seq_parallel_attention_forward(sp_mode, sp_size, sp_group):
         else:
             attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
-
         attn_output = self.o_proj(attn_output)
 
         if not output_attentions:
@@ -7,12 +7,12 @@ from torch import Tensor
 from torch.nn import Module
 
 from colossalai.shardformer.layer import (
+    CohereLayerNorm,
     FusedCohereLayerNorm,
     Linear1D_Col,
     Linear1D_Row,
     PaddingEmbedding,
     PaddingLMHead,
-    CohereLayerNorm,
     VocabParallelEmbedding1D,
     VocabParallelLMHead1D,
 )
@@ -383,7 +383,9 @@ class CommandForCausalLMPolicy(CommandPolicy):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=CohereForCausalLM, new_forward=CommandPipelineForwards.command_for_causal_lm_forward, policy=policy
+                model_cls=CohereForCausalLM,
+                new_forward=CommandPipelineForwards.command_for_causal_lm_forward,
+                policy=policy,
             )
 
         return policy
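For context on where the set_pipeline_forward call above takes effect, a hedged sketch of how a Command/Cohere model is typically sharded so that this policy is selected; the launch call, plugin arguments, and checkpoint name are illustrative assumptions, not part of this commit:

# A minimal sketch, assuming a standard ColossalAI hybrid-parallel setup launched
# with torchrun; with pp_size > 1 the policy swaps the model forward for
# CommandPipelineForwards.command_for_causal_lm_forward.
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from transformers import AutoModelForCausalLM

colossalai.launch_from_torch()
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")  # assumed checkpoint
plugin = HybridParallelPlugin(tp_size=2, pp_size=2, num_microbatches=2, precision="bf16")  # illustrative sizes
booster = Booster(plugin=plugin)
model, *_ = booster.boost(model)  # shardformer applies the Command policies here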
@@ -16,8 +16,6 @@ if HAS_COMMAND:
     # ===============================
 
     def data_gen():
-
-
         input_ids = torch.Tensor(
             [
                 [1, 15043, 29892, 590, 11203, 338, 274, 1082, 1, 15043, 29892, 590, 11203, 338, 274, 1082],
@@ -79,10 +79,24 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         else:
             atol, rtol = 5e-3, 5e-3
         row_layer_grads = get_grad_tensors_for_check(
-            command_model, shard_command_model, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0, verbose=False
+            command_model,
+            shard_command_model,
+            row_layer_for_check,
+            tp_group,
+            atol=atol,
+            rtol=rtol,
+            dim=0,
+            verbose=False,
         )
         col_layer_grads = get_grad_tensors_for_check(
-            command_model, shard_command_model, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False
+            command_model,
+            shard_command_model,
+            col_layer_for_check,
+            tp_group,
+            atol=atol,
+            rtol=rtol,
+            dim=1,
+            verbose=False,
        )
         norm_layer_grads = get_grad_tensors_for_check(
             command_model,
@@ -121,7 +135,14 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         else:
             atol, rtol = 5e-3, 5e-3
         check_weight(
-            command_model, shard_command_model, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False
+            command_model,
+            shard_command_model,
+            col_layer_for_check,
+            tp_group,
+            atol=atol,
+            rtol=rtol,
+            dim=1,
+            verbose=False,
         )
 
     # check grads