Mirror of https://github.com/hpcaitech/ColossalAI.git
add paged-attention v2: support seq length split across thread blocks (#5707)
```diff
@@ -338,7 +338,8 @@ class NopadBaichuanAttention(ParallelModule):
             block_size,
             kv_seq_len,
             fd_inter_tensor.mid_output,
-            fd_inter_tensor.mid_output_lse,
+            fd_inter_tensor.exp_sums,
+            fd_inter_tensor.max_logits,
             self.alibi_slopes,
             sm_scale,
         )
```
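Context for the change: in the v2 scheme, the KV sequence for a single query token is split across multiple thread blocks. Each block attends over its own partition and writes a partial result into `fd_inter_tensor.mid_output`, along with that partition's softmax statistics, `exp_sums` and `max_logits` (which replace the single `mid_output_lse` log-sum-exp buffer at both call sites); a final reduction pass then merges the partials. Below is a minimal PyTorch sketch of such a merge step. The function name, tensor shapes, and the assumption that each partial output is already normalized within its partition are illustrative, not the kernel's actual layout.

```python
import torch

def combine_split_kv_partials(mid_output, exp_sums, max_logits):
    """Merge per-partition partial attention outputs into the final output.

    Hypothetical shapes, for illustration only:
      mid_output: [num_tokens, num_heads, num_partitions, head_dim]
          partial attention outputs, each normalized within its partition
      exp_sums:   [num_tokens, num_heads, num_partitions]
          per-partition softmax denominators, sum(exp(logit - partition_max))
      max_logits: [num_tokens, num_heads, num_partitions]
          per-partition maximum attention logits
    """
    # Rescaling against the global max keeps the merge numerically stable.
    global_max = max_logits.max(dim=-1, keepdim=True).values       # [T, H, 1]
    # Bring every partition's denominator onto the global scale.
    rescaled = exp_sums * torch.exp(max_logits - global_max)       # [T, H, P]
    # Each partition's share of the global softmax mass.
    weights = rescaled / rescaled.sum(dim=-1, keepdim=True)        # [T, H, P]
    # Weighted sum over the partition axis yields the final output.
    return (mid_output * weights.unsqueeze(-1)).sum(dim=2)         # [T, H, D]
```

This is the standard log-sum-exp trick applied across partitions: each block only needs its local max and denominator, so no block has to see the whole sequence before writing its partial result.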
```diff
@@ -596,7 +596,8 @@ class NopadLlamaAttention(LlamaAttention, ParallelModule):
             block_size,
             kv_seq_len,
             fd_inter_tensor.mid_output,
-            fd_inter_tensor.mid_output_lse,
+            fd_inter_tensor.exp_sums,
+            fd_inter_tensor.max_logits,
             None,
             sm_scale,
         )
```
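The Llama call site passes `None` where Baichuan passes `self.alibi_slopes`, since Llama uses rotary position embeddings rather than ALiBi biases. Both call sites rely on `fd_inter_tensor` carrying one slot per KV partition for every query token and head. A rough sketch of how such intermediates could be sized follows; the partition size of 512 and the helper name are assumptions for illustration, not ColossalAI's actual constants or API.

```python
import torch

# Assumed number of KV-cache tokens handled per thread block.
PARTITION_SIZE = 512

def alloc_fd_intermediates(num_tokens, num_heads, head_dim, max_seq_len,
                           dtype=torch.float32, device="cuda"):
    """Allocate hypothetical split-KV intermediates (illustrative shapes)."""
    # One partition per PARTITION_SIZE chunk of the longest sequence.
    num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
    mid_output = torch.empty(num_tokens, num_heads, num_partitions, head_dim,
                             dtype=dtype, device=device)
    exp_sums = torch.empty(num_tokens, num_heads, num_partitions,
                           dtype=dtype, device=device)
    max_logits = torch.empty(num_tokens, num_heads, num_partitions,
                             dtype=dtype, device=device)
    return mid_output, exp_sums, max_logits
```

Sizing against `max_seq_len` up front lets the buffers be reused across decoding steps instead of being reallocated as sequences grow.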