From efb2d98da019e7e59ce79b7cc2d6ea47edebd772 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 20 May 2025 08:17:45 +0000
Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/shardformer/modeling/t5.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/colossalai/shardformer/modeling/t5.py b/colossalai/shardformer/modeling/t5.py
index 5e119d6fc..dedbde4c0 100644
--- a/colossalai/shardformer/modeling/t5.py
+++ b/colossalai/shardformer/modeling/t5.py
@@ -119,7 +119,7 @@ class T5PipelineForwards:
         # initialize past_key_values with `None` if past does not exist
         if past_key_values is None:
             past_key_values = [None] * len(self.block)
-
+
         past_key_values_length = 0
         if cache_position is None:
             cache_position = torch.arange(
@@ -131,7 +131,7 @@ class T5PipelineForwards:
 
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+        self.get_extended_attention_mask(attention_mask, input_shape)
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
@@ -143,7 +143,7 @@ class T5PipelineForwards:
             encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
         else:
             encoder_extended_attention_mask = None
-
+
         if self.config.is_decoder:
             causal_mask = self._update_causal_mask(
                 attention_mask,
@@ -159,7 +159,6 @@ class T5PipelineForwards:
         else:
             causal_mask = None
 
-
         # Prepare head mask if needed
         head_mask = self.get_head_mask(head_mask, self.config.num_layers)
         cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
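
Context for the @@ -131 hunk: the auto-fix drops the unused `extended_attention_mask` binding while keeping the call; the decoder path in the later hunks binds `causal_mask` via `_update_causal_mask` instead, which appears to be why the earlier result is no longer referenced. For readers unfamiliar with what `get_extended_attention_mask` / `invert_attention_mask` compute, the sketch below is an illustrative, standalone approximation (a hypothetical helper, not the transformers or ColossalAI implementation): it shows how a 2D padding mask is broadcast into the additive attention bias those methods return.

    # Illustrative sketch only (hypothetical helper, not the upstream code).
    import torch

    def extend_attention_mask(attention_mask: torch.Tensor,
                              dtype: torch.dtype = torch.float32) -> torch.Tensor:
        # [batch, seq_len] of 1s (keep) / 0s (pad) -> [batch, 1, 1, seq_len] additive bias:
        # 0.0 where attention is allowed, a large negative value where it is masked out.
        extended = attention_mask[:, None, None, :].to(dtype)
        return (1.0 - extended) * torch.finfo(dtype).min

    if __name__ == "__main__":
        mask = torch.tensor([[1, 1, 1, 0]])   # last position is padding
        bias = extend_attention_mask(mask)
        print(bias.shape)                     # torch.Size([1, 1, 1, 4])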