[shardformer] fix flash attention, when mask is causal, just don't unpad it (#5084)

* fix flash attn

* fix

fix
This commit is contained in:
flybird11111
2023-11-22 16:00:07 +08:00
committed by GitHub
parent 75af66cd81
commit aae496631c
6 changed files with 16 additions and 8 deletions

View File

@@ -106,7 +106,10 @@ def get_whisper_flash_attention_forward():
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
)
flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool).contiguous())
attn_type = AttnMaskType.paddedcausal
if not torch.all(flash_attention_mask):
attn_type = AttnMaskType.paddedcausal
else:
attn_type = AttnMaskType.causal
attention = ColoAttention(
embed_dim=self.embed_dim, num_heads=self.num_heads, dropout=self.dropout, scale=self.scaling