[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Repository: https://github.com/hpcaitech/ColossalAI.git (mirror)
Commit: 2a7fa2e7d0
Parent: 1016bb3257
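The hunks below are mechanical auto-fixes of the kind pre-commit hooks apply: import lists are sorted or consolidated, redundant blank lines are dropped, and long call sites are wrapped one argument per line. As a hedged sketch (the repository's actual .pre-commit-config.yaml and hook set are not shown in this commit), the same fixes can be reproduced locally through the pre-commit package's Python entry point:

# A minimal sketch, assuming pre-commit is installed and a .pre-commit-config.yaml
# sits at the repository root; the configured hooks decide which fixes are applied.
from pre_commit.main import main as pre_commit_main

# Equivalent to running `pre-commit run --all-files` on the command line.
exit_code = pre_commit_main(["run", "--all-files"])
print("pre-commit exit code:", exit_code)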
@@ -4,7 +4,7 @@ from .dropout import DropoutForParallelInput, DropoutForReplicatedInput
 from .embedding import Embedding1D, PaddingEmbedding, VocabParallelEmbedding1D
 from .linear import Linear1D_Col, Linear1D_Row, PaddingLMHead, VocabParallelLMHead1D
 from .loss import cross_entropy_1d
-from .normalization import FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm, CohereLayerNorm, FusedCohereLayerNorm
+from .normalization import CohereLayerNorm, FusedCohereLayerNorm, FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm
 from .parallel_module import ParallelModule
 from .qkv_fused_linear import FusedLinear1D_Col, GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
 
@@ -250,7 +250,6 @@ class FusedLayerNorm(BaseLayerNorm):
         return layernorm
 
 
-
 class CohereLayerNorm(BaseLayerNorm):
     r"""
     This is a wrapper around the transformers.models.cohere.CohereLayerNorm. It is meant to be used only with the from_native_module interface.
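The docstring above only names the from_native_module interface, so here is a hedged sketch of how such a wrapper is typically substituted for the native Hugging Face layer; the constructor arguments and the exact call signature are assumptions, not taken from this commit:

# A minimal sketch, assuming CohereLayerNorm.from_native_module follows the same
# pattern as the other shardformer norm wrappers (native module in, wrapped module out).
from transformers.models.cohere.modeling_cohere import CohereLayerNorm as HFCohereLayerNorm

from colossalai.shardformer.layer import CohereLayerNorm

native_norm = HFCohereLayerNorm(hidden_size=4096, eps=1e-5)  # illustrative sizes
wrapped_norm = CohereLayerNorm.from_native_module(native_norm)  # reuses the native weights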
@@ -3,22 +3,12 @@ import warnings
 from typing import List, Optional, Tuple, Union
 
 import torch
-import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn import CrossEntropyLoss
 from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
-)
-from transformers.models.cohere.modeling_cohere import (
-    CohereForCausalLM,
-    CohereModel,
-    StaticCache,
-    repeat_kv,
-)
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.models.cohere.modeling_cohere import CohereForCausalLM, CohereModel, StaticCache, repeat_kv
 from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
@@ -343,10 +333,9 @@ class CommandPipelineForwards:
             hidden_states = outputs.get("hidden_states")
             return {"hidden_states": hidden_states}
 
 
 def get_command_flash_attention_forward(shard_config, sp_mode, sp_group, sp_size):
-    from transformers.models.cohere.modeling_cohere import CohereAttention, apply_rotary_pos_emb
-    from transformers.models.cohere.modeling_cohere import repeat_kv
+    from transformers.models.cohere.modeling_cohere import CohereAttention, apply_rotary_pos_emb, repeat_kv
 
     def forward(
         self: CohereAttention,
@@ -728,7 +717,6 @@ def get_command_seq_parallel_attention_forward(sp_mode, sp_size, sp_group):
         else:
             attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
-
         attn_output = self.o_proj(attn_output)
 
         if not output_attentions:
@@ -7,12 +7,12 @@ from torch import Tensor
 from torch.nn import Module
 
 from colossalai.shardformer.layer import (
+    CohereLayerNorm,
     FusedCohereLayerNorm,
     Linear1D_Col,
     Linear1D_Row,
     PaddingEmbedding,
     PaddingLMHead,
-    CohereLayerNorm,
     VocabParallelEmbedding1D,
     VocabParallelLMHead1D,
 )
@@ -383,7 +383,9 @@ class CommandForCausalLMPolicy(CommandPolicy):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=CohereForCausalLM, new_forward=CommandPipelineForwards.command_for_causal_lm_forward, policy=policy
+                model_cls=CohereForCausalLM,
+                new_forward=CommandPipelineForwards.command_for_causal_lm_forward,
+                policy=policy,
             )
 
         return policy
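For context on where the set_pipeline_forward call above takes effect, a hedged sketch of how a Command/Cohere model is typically sharded so that this policy is selected; the launch call, plugin arguments, and checkpoint name are illustrative assumptions, not part of this commit:

# A minimal sketch, assuming a standard ColossalAI hybrid-parallel setup launched
# with torchrun; with pp_size > 1 the policy swaps the model forward for
# CommandPipelineForwards.command_for_causal_lm_forward.
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from transformers import AutoModelForCausalLM

colossalai.launch_from_torch()
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")  # assumed checkpoint
plugin = HybridParallelPlugin(tp_size=2, pp_size=2, num_microbatches=2, precision="bf16")  # illustrative sizes
booster = Booster(plugin=plugin)
model, *_ = booster.boost(model)  # shardformer applies the Command policies here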
@@ -16,8 +16,6 @@ if HAS_COMMAND:
     # ===============================
 
     def data_gen():
-
-
         input_ids = torch.Tensor(
             [
                 [1, 15043, 29892, 590, 11203, 338, 274, 1082, 1, 15043, 29892, 590, 11203, 338, 274, 1082],
@@ -79,10 +79,24 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         else:
             atol, rtol = 5e-3, 5e-3
         row_layer_grads = get_grad_tensors_for_check(
-            command_model, shard_command_model, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0, verbose=False
+            command_model,
+            shard_command_model,
+            row_layer_for_check,
+            tp_group,
+            atol=atol,
+            rtol=rtol,
+            dim=0,
+            verbose=False,
         )
         col_layer_grads = get_grad_tensors_for_check(
-            command_model, shard_command_model, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False
+            command_model,
+            shard_command_model,
+            col_layer_for_check,
+            tp_group,
+            atol=atol,
+            rtol=rtol,
+            dim=1,
+            verbose=False,
        )
         norm_layer_grads = get_grad_tensors_for_check(
             command_model,
@@ -121,7 +135,14 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         else:
             atol, rtol = 5e-3, 5e-3
         check_weight(
-            command_model, shard_command_model, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False
+            command_model,
+            shard_command_model,
+            col_layer_for_check,
+            tp_group,
+            atol=atol,
+            rtol=rtol,
+            dim=1,
+            verbose=False,
         )
 
     # check grads