[kernel] Add triton kernel for context attention (FAv2) without padding (#5192)

* add context attn unpadded triton kernel

* test compatibility

* kv cache copy (testing)

* fix k/v cache copy

* fix kv cache copy and test

* fix boundary of block ptrs

* add support for GQA/MQA and testing

* fix import statement

---------

Co-authored-by: Round Heng <yuanhengzhao@Rounds-MacBook-Pro.local>
Author: Yuanheng Zhao
Date: 2024-01-03 14:41:35 +08:00
Committed by: FrankLeeeee
Parent: 4df8876fca
Commit: 07b5283b6a
3 changed files with 422 additions and 0 deletions


@@ -8,11 +8,13 @@ except ImportError:
 
 # There may exist import error even if we have triton installed.
 if HAS_TRITON:
+    from .context_attn_unpad import context_attention_unpadded
     from .fused_layernorm import layer_norm
     from .gptq_triton import gptq_fused_linear_triton
     from .softmax import softmax
 
     __all__ = [
+        "context_attention_unpadded",
         "softmax",
         "layer_norm",
         "gptq_fused_linear_triton",