[Inference/Kernel] Add Paged Decoding kernel, sequence split within the same thread block (#5531)

* feat flash decoding for paged attention

* refactor flashdecodingattention

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: Steve Luo
Date: 2024-04-18 16:45:07 +08:00
Committed by: GitHub
Parent: 56b222eff8
Commit: be396ad6cc
15 changed files with 1765 additions and 211 deletions
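The core of the change is the new inference_ops.flash_decoding_attention kernel, whose call site is sketched (commented out) in the hunk below. The mid_output and mid_output_lse buffers passed to it point to the usual flash-decoding scheme: the cached KV sequence of each request is split into partitions, each partition produces a partial output o_i together with its log-sum-exp l_i, and the partials are then merged (here, according to the commit title, within the same thread block). The merge is the standard online-softmax combine; written for two partitions in LaTeX:

\ell = \log\left(e^{\ell_1} + e^{\ell_2}\right), \qquad
o = e^{\ell_1 - \ell}\, o_1 + e^{\ell_2 - \ell}\, o_2

The exact partition size and the in-block reduction layout are implementation details of the CUDA kernel and are not visible in this hunk.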


@@ -437,6 +437,19 @@ class NopadLlamaAttention(LlamaAttention):
                block_tables,
                high_precision,
            )
            # inference_ops.flash_decoding_attention(
            #     attn_output,
            #     query_states,
            #     k_cache,
            #     v_cache,
            #     sequence_lengths,
            #     block_tables,
            #     block_size,
            #     kv_seq_len,
            #     fd_inter_tensor.mid_output,
            #     fd_inter_tensor.mid_output_lse,
            #     sm_scale,
            # )
        else:
            if is_verifier:
                rotary_embedding(query_states, key_states, cos_sin[0], cos_sin[1])
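As a cross-check of the arguments in the commented-out call, here is a minimal pure-PyTorch emulation of paged flash decoding. It assumes query of shape [bsz, num_heads, head_dim], a paged k_cache/v_cache of shape [num_blocks, num_heads, block_size, head_dim], and block_tables of shape [bsz, max_blocks_per_seq]; the function name paged_flash_decoding_ref and the partition_size parameter are illustrative and not part of the kernel's API. The real kernel does the per-partition work and the log-sum-exp reduction inside one CUDA thread block; the Python loops here only mirror the math.

import torch

def paged_flash_decoding_ref(query, k_cache, v_cache, seq_lens, block_tables,
                             block_size, sm_scale, partition_size=4):
    # query:           [bsz, num_heads, head_dim]  (one decoding token per sequence)
    # k_cache/v_cache: [num_blocks, num_heads, block_size, head_dim]  (assumed layout)
    # block_tables:    [bsz, max_blocks_per_seq]   physical block ids per sequence
    bsz, num_heads, head_dim = query.shape
    out = torch.empty_like(query)
    for b in range(bsz):
        seq_len = int(seq_lens[b])
        num_blocks = (seq_len + block_size - 1) // block_size
        # Gather this sequence's KV from the paged cache via its block table.
        blocks = block_tables[b, :num_blocks].long()
        k = k_cache[blocks].transpose(0, 1).reshape(num_heads, -1, head_dim)[:, :seq_len]
        v = v_cache[blocks].transpose(0, 1).reshape(num_heads, -1, head_dim)[:, :seq_len]
        q = query[b].float()  # [num_heads, head_dim]
        # Flash-decoding split: each chunk yields a partial output and its
        # log-sum-exp (the role of mid_output / mid_output_lse), merged online.
        acc = torch.zeros(num_heads, head_dim, dtype=torch.float32, device=query.device)
        lse = torch.full((num_heads, 1), float("-inf"), device=query.device)
        for start in range(0, seq_len, partition_size * block_size):
            end = min(start + partition_size * block_size, seq_len)
            scores = (q.unsqueeze(1) @ k[:, start:end].float().transpose(1, 2)).squeeze(1) * sm_scale
            chunk_lse = torch.logsumexp(scores, dim=-1, keepdim=True)
            chunk_out = (torch.softmax(scores, dim=-1).unsqueeze(1) @ v[:, start:end].float()).squeeze(1)
            new_lse = torch.logaddexp(lse, chunk_lse)
            acc = acc * torch.exp(lse - new_lse) + chunk_out * torch.exp(chunk_lse - new_lse)
            lse = new_lse
        out[b] = acc.to(query.dtype)
    return out

Under the assumed cache layout, the result of this reference should match attn_output from the fused kernel up to numerical tolerance, which makes it a convenient baseline for unit-testing the new op.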