add paged-attention v2: support splitting seq length across thread blocks (#5707)

Steve Luo
2024-05-14 12:46:54 +08:00
committed by GitHub
parent 18d67d0e8e
commit 7806842f2d
8 changed files with 704 additions and 249 deletions
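The hunk below shows the call-site change: the attention layer now passes separate per-partition buffers (mid_output, exp_sums, max_logits) instead of the single mid_output_lse tensor, because in the v2 scheme each request's KV sequence is split into fixed-size partitions and each partition is processed by its own thread block. The following is a minimal PyTorch sketch of how such a per-partition workspace could be sized; the 512-token partition size, the function name, and the tensor shapes are illustrative assumptions, not values read from the commit.

import torch

# Illustrative partition size; the real kernel's value is a build-time choice.
PARTITION_SIZE = 512  # KV tokens handled by one thread block

def alloc_split_kv_workspace(bsz, num_heads, head_dim, max_kv_len, device="cuda"):
    """Allocate per-partition buffers for a split-KV decode kernel (sketch).

    Each thread block attends over one (sequence, head, partition) tile of the
    KV cache, so every partition needs its own partial output plus the softmax
    statistics required to merge the partials afterwards.
    """
    max_partitions = (max_kv_len + PARTITION_SIZE - 1) // PARTITION_SIZE
    mid_output = torch.empty(bsz, num_heads, max_partitions, head_dim, device=device)
    exp_sums = torch.empty(bsz, num_heads, max_partitions, device=device)
    max_logits = torch.empty(bsz, num_heads, max_partitions, device=device)
    return mid_output, exp_sums, max_logits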


@@ -338,7 +338,8 @@ class NopadBaichuanAttention(ParallelModule):
             block_size,
             kv_seq_len,
             fd_inter_tensor.mid_output,
-            fd_inter_tensor.mid_output_lse,
+            fd_inter_tensor.exp_sums,
+            fd_inter_tensor.max_logits,
             self.alibi_slopes,
             sm_scale,
         )
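Once every partition of a sequence has been processed, the partial results are combined with a numerically stable softmax merge: each partition's exp-sum is rescaled to the global max logit and used to weight that partition's partial output. Below is a minimal PyTorch sketch of that reduction, assuming mid_output holds partition-local outputs already normalized by their own exp-sums, shapes matching the workspace sketch above, and a single kv_seq_len shared by the whole batch; names and shapes are assumptions for illustration, not the kernel's actual interface.

def merge_partitions(mid_output, exp_sums, max_logits, kv_seq_len, partition_size=512):
    """Combine per-partition attention partials into the final output (sketch).

    mid_output : [bsz, num_heads, max_partitions, head_dim], each slice already
                 normalized by its partition-local exp-sum.
    exp_sums   : [bsz, num_heads, max_partitions], sum of exp(logit - local_max).
    max_logits : [bsz, num_heads, max_partitions], partition-local max logit.
    """
    num_partitions = (kv_seq_len + partition_size - 1) // partition_size
    m = max_logits[:, :, :num_partitions]            # [b, h, p]
    s = exp_sums[:, :, :num_partitions]              # [b, h, p]
    o = mid_output[:, :, :num_partitions, :]         # [b, h, p, d]

    global_max = m.max(dim=-1, keepdim=True).values  # [b, h, 1]
    # Rescale each partition's exp-sum to the global max before combining.
    rescaled = s * torch.exp(m - global_max)         # [b, h, p]
    weights = rescaled / rescaled.sum(dim=-1, keepdim=True)
    # Weighted sum of the per-partition partial outputs.
    return (o * weights.unsqueeze(-1)).sum(dim=2)    # [b, h, d]

This is the same log-sum-exp trick used inside a single softmax, applied across partitions: rescaling by exp(m - global_max) makes the per-partition weights comparable without ever materializing unnormalized exponentials at full sequence length.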