[Kernel/Fix] Revise flash attention triton kernel API and add benchmark (#5301)

* fix decoding kernel pytest

* revise and add triton context attn benchmark
Yuanheng Zhao
2024-01-23 17:16:02 +08:00
committed by GitHub
parent 8e606ecc7e
commit 3da9993b0d
5 changed files with 116 additions and 15 deletions


@@ -87,7 +87,7 @@ class PagedAttention:
Transform 1D no_pad tensor into 2D padded tensor with shape [bsz,seq_len,num_heads,head_size]
"""
bsz = len(seq_lengths)
-        padded_tensor = torch.zeros(bsz, max_seq_len, num_heads, head_size)
+        padded_tensor = torch.zeros(bsz, max_seq_len, num_heads, head_size, dtype=tensor.dtype)
token_idx = 0
for i, seq_len in enumerate(seq_lengths):
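The change above matters because `torch.zeros` defaults to `float32`, so without `dtype=tensor.dtype` the padded output silently upcasts fp16/bf16 inputs. A minimal sketch of the padding transform, with hypothetical function and variable names (the surrounding class logic is not shown in the diff):

```python
import torch

def pad_sequences(tensor, seq_lengths, max_seq_len, num_heads, head_size):
    """Hypothetical standalone version of the padding helper: turn a 1D
    unpadded token tensor [total_tokens, num_heads, head_size] into a
    padded [bsz, max_seq_len, num_heads, head_size] tensor."""
    bsz = len(seq_lengths)
    # Inherit dtype from the input; torch.zeros would otherwise default
    # to float32 and upcast half-precision activations.
    padded_tensor = torch.zeros(
        bsz, max_seq_len, num_heads, head_size, dtype=tensor.dtype
    )
    token_idx = 0
    for i, seq_len in enumerate(seq_lengths):
        padded_tensor[i, :seq_len] = tensor[token_idx : token_idx + seq_len]
        token_idx += seq_len
    return padded_tensor

# Two sequences of lengths 2 and 3, packed into 5 tokens total.
x = torch.randn(5, 2, 4, dtype=torch.float16)
out = pad_sequences(x, seq_lengths=[2, 3], max_seq_len=4, num_heads=2, head_size=4)
print(out.dtype, tuple(out.shape))
```

With the fix, `out.dtype` matches the input (`torch.float16` here) instead of defaulting to `torch.float32`.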