From 885210dc27d36d5f7f14c9bf2d0d1290ea69476e Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 28 Apr 2025 18:17:12 +0800
Subject: [PATCH 1/4] fix

---
 colossalai/shardformer/modeling/falcon.py | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/colossalai/shardformer/modeling/falcon.py b/colossalai/shardformer/modeling/falcon.py
index c2802063f..27461be04 100644
--- a/colossalai/shardformer/modeling/falcon.py
+++ b/colossalai/shardformer/modeling/falcon.py
@@ -1,17 +1,9 @@
-import math
-import warnings
 from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from transformers.modeling_attn_mask_utils import (
-    AttentionMaskConverter,
-    _prepare_4d_causal_attention_mask,
-    _prepare_4d_causal_attention_mask_for_sdpa,
-)
-from transformers.cache_utils import Cache, DynamicCache, StaticCache
 from transformers.modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
@@ -159,7 +151,7 @@ def get_tp_falcon_decoder_layer_forward():
                 and self.config.num_ln_in_parallel_attn == 1
             ):
                 mlp_layernorm_out = attention_layernorm_out
-
+
             outputs = attn_outputs[1:]
 
             # MLP.
@@ -215,7 +207,6 @@ class FalconPipelineForwards:
             logger.warning_once("use_cache=True is not supported for pipeline models at the moment.")
             use_cache = False
 
-        logger.warning_once("past_key_values is not supported for pipeline models at the moment.")
         past_key_values = None
 
 
@@ -251,7 +242,7 @@ class FalconPipelineForwards:
         # Compute alibi tensor: check build_alibi_tensor documentation
         alibi = None
         past_key_values_length = 0
-
+
         batch_size, seq_length, _ = hidden_states.shape
         if self.use_alibi:
             mask = (
@@ -262,7 +253,7 @@ class FalconPipelineForwards:
                 else attention_mask
             )
             alibi = build_alibi_tensor(mask, self.num_heads, dtype=hidden_states.dtype)
-
+
         if cache_position is None:
             cache_position = torch.arange(
                 past_key_values_length, past_key_values_length + seq_length, device=hidden_states.device
@@ -280,7 +271,7 @@ class FalconPipelineForwards:
         # attention_probs has shape batch_size x num_heads x N x N
         # head_mask has shape n_layer x batch x num_heads x N x N
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-
+
         # create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
 
@@ -319,7 +310,7 @@ class FalconPipelineForwards:
 
             hidden_states = outputs[0]
             if use_cache is True:
-                next_decoder_cache = outputs[1]
+                outputs[1]
 
             if output_attentions:
                 all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
@@ -332,7 +323,7 @@ class FalconPipelineForwards:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
         if stage_manager.is_last_stage():
-
+
            if not return_dict:
                 return tuple(
                     v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None

From cefdfc41250e397566a29ad769114081283fc28a Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Thu, 8 May 2025 17:46:54 +0800
Subject: [PATCH 2/4] add explanation

---
 colossalai/shardformer/modeling/falcon.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/colossalai/shardformer/modeling/falcon.py b/colossalai/shardformer/modeling/falcon.py
index 27461be04..4f1d0ccd8 100644
--- a/colossalai/shardformer/modeling/falcon.py
+++ b/colossalai/shardformer/modeling/falcon.py
@@ -111,7 +111,6 @@ def get_tp_falcon_decoder_layer_forward():
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
         **kwargs,
     ):
-
         residual = hidden_states
 
         if self.config.new_decoder_architecture and self.config.num_ln_in_parallel_attn == 2:
@@ -196,6 +195,8 @@ class FalconPipelineForwards:
         stage_index: Optional[List[int]] = None,
         shard_config: ShardConfig = None,
     ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
+        # Add cache_position and position_embeddings args for v4.51.3 transformers
+
         logger = logging.get_logger(__name__)
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -261,7 +262,8 @@ class FalconPipelineForwards:
 
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
-
+        # use the new version of causal mask construction.
+        # In v4.51.3, sdpa, eager and flash attention are merged into one class.
         causal_mask = self._update_causal_mask(
             attention_mask, hidden_states, cache_position, past_key_values, output_attentions, head_mask, alibi
         )

From 07349e00146d06ece045aa82baa9b5335452b966 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 14 May 2025 10:09:35 +0800
Subject: [PATCH 3/4] fix

---
 colossalai/shardformer/modeling/falcon.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/colossalai/shardformer/modeling/falcon.py b/colossalai/shardformer/modeling/falcon.py
index 4f1d0ccd8..d06f8db2c 100644
--- a/colossalai/shardformer/modeling/falcon.py
+++ b/colossalai/shardformer/modeling/falcon.py
@@ -108,11 +108,15 @@ def get_tp_falcon_decoder_layer_forward():
         use_cache: bool = False,
         output_attentions: bool = False,
         cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        position_embeddings: Optional[
+            Tuple[torch.Tensor, torch.Tensor]
+        ] = None,  # Add cache_position and position_embeddings args for v4.51.3 transformers
         **kwargs,
     ):
+
         residual = hidden_states
 
+        # same as v4.51.3 transformers
         if self.config.new_decoder_architecture and self.config.num_ln_in_parallel_attn == 2:
             attention_layernorm_out = self.ln_attn(hidden_states)
             mlp_layernorm_out = self.ln_mlp(hidden_states)
@@ -143,7 +147,7 @@ def get_tp_falcon_decoder_layer_forward():
                 attention_output, residual, self.config.attention_dropout, training=self.training
             )
             mlp_layernorm_out = self.post_attention_layernorm(residual)
-
+        # v4.51.3 transformers mlp
         if (
             self.config.new_decoder_architecture
             and self.config.parallel_attn
@@ -241,6 +245,7 @@ class FalconPipelineForwards:
         all_hidden_states = () if output_hidden_states else None
 
         # Compute alibi tensor: check build_alibi_tensor documentation
+        # alibi calculation is the same as in v4.51.3 transformers.
         alibi = None
         past_key_values_length = 0
 
@@ -274,10 +279,11 @@ class FalconPipelineForwards:
         # head_mask has shape n_layer x batch x num_heads x N x N
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
 
-        # create position embeddings to be shared across the decoder layers
+        # v4.51.3 create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
 
         start_idx, end_idx = stage_index[0], stage_index[1]
+        # keep past_key_values arg same as v4.51.3 transformers
         for i, block in enumerate(self.h[start_idx:end_idx], start=start_idx):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)

From 89917e247b4912fa8f9220e0abf5ffee1369fd08 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 14 May 2025 04:24:23 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/shardformer/modeling/falcon.py | 62 +++++++++++------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/colossalai/shardformer/modeling/falcon.py b/colossalai/shardformer/modeling/falcon.py
index 870c9272b..7a8aec37d 100644
--- a/colossalai/shardformer/modeling/falcon.py
+++ b/colossalai/shardformer/modeling/falcon.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import List, Optional, Tuple, Union
 
 import torch
@@ -21,7 +22,6 @@ from transformers.models.falcon.modeling_falcon import (
     build_alibi_tensor,
 )
 from transformers.utils import logging
-import warnings
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer.shard import ShardConfig
@@ -134,12 +134,12 @@ def get_tp_falcon_decoder_layer_forward():
             attention_mask=attention_mask,
             position_ids=position_ids,
             alibi=alibi,
-                head_mask=head_mask,
-                use_cache=use_cache,
-                output_attentions=output_attentions,
-                cache_position=cache_position,
-                position_embeddings=position_embeddings,
-            )
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
 
         attention_output = attn_outputs[0]
 
@@ -294,35 +294,35 @@ class FalconPipelineForwards:
 
             if self.gradient_checkpointing and self.training:
                 outputs = self._gradient_checkpointing_func(
-                        block.__call__,
-                        hidden_states,
-                        alibi,
-                        causal_mask,
-                        position_ids,
-                        head_mask[i],
-                        past_key_values,
-                        use_cache,
-                        output_attentions,
-                        cache_position,
-                        position_embeddings,
-                    )
+                    block.__call__,
+                    hidden_states,
+                    alibi,
+                    causal_mask,
+                    position_ids,
+                    head_mask[i],
+                    past_key_values,
+                    use_cache,
+                    output_attentions,
+                    cache_position,
+                    position_embeddings,
+                )
             else:
                 outputs = block(
-                        hidden_states,
-                        layer_past=past_key_values,
-                        attention_mask=causal_mask,
-                        position_ids=position_ids,
-                        head_mask=head_mask[i],
-                        use_cache=use_cache,
-                        output_attentions=output_attentions,
-                        alibi=alibi,
-                        cache_position=cache_position,
-                        position_embeddings=position_embeddings,
-                    )
+                    hidden_states,
+                    layer_past=past_key_values,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    head_mask=head_mask[i],
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                    alibi=alibi,
+                    cache_position=cache_position,
+                    position_embeddings=position_embeddings,
+                )
 
             hidden_states = outputs[0]
             if use_cache is True:
-                next_decoder_cache = outputs[1]
+                outputs[1]
 
             if output_attentions:
                 all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)