[kernel] Add triton kernel for context attention (FAv2) without padding (#5192)

* add context attn unpadded triton kernel

* test compatibility

* kv cache copy (testing)

* fix k/v cache copy

* fix kv cache copy and test

* fix boundary of block ptrs

* add support for GQA/MQA and testing

* fix import statement

---------

Co-authored-by: Round Heng <yuanhengzhao@Rounds-MacBook-Pro.local>
Author: Yuanheng Zhao
Date: 2024-01-03 14:41:35 +08:00
Committed by: FrankLeeeee
Parent: 4df8876fca
Commit: 07b5283b6a
3 changed files with 422 additions and 0 deletions


@@ -8,11 +8,13 @@ except ImportError:
 
 # There may exist import error even if we have triton installed.
 if HAS_TRITON:
+    from .context_attn_unpad import context_attention_unpadded
     from .fused_layernorm import layer_norm
     from .gptq_triton import gptq_fused_linear_triton
     from .softmax import softmax
 
     __all__ = [
+        "context_attention_unpadded",
         "softmax",
         "layer_norm",
         "gptq_fused_linear_triton",