mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-01 01:06:00 +00:00
[Kernels]added flash-decoding of triton (#5063)
* added flash-decoding of triton based on lightllm kernel * add req * clean * clean * delete build.sh --------- Co-authored-by: cuiqing.li <lixx336@gmail.com>
This commit is contained in:
@@ -137,6 +137,7 @@ if HAS_TRITON:
|
||||
tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len)
|
||||
return
|
||||
else:
|
||||
# this function is modified from https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L11
|
||||
@triton.jit
|
||||
def _context_flash_attention_kernel_2(
|
||||
Q, K, V, sm_scale, Alibi, B_Start_Loc, B_Seqlen,
|
||||
|
50
colossalai/kernel/triton/flash_decoding.py
Normal file
50
colossalai/kernel/triton/flash_decoding.py
Normal file
@@ -0,0 +1,50 @@
|
||||
# adapted from https://github.com/ModelTC/lightllm/blob/ece7b43f8a6dfa74027adc77c2c176cff28c76c8/lightllm/models/llama/triton_kernel/flash_decoding.py
|
||||
import torch
|
||||
# lightllm is an optional dependency: when its flash-decoding kernels cannot be
# imported, this module still loads and HAS_LIGHTLLM_KERNEL tells callers that
# token_flash_decoding is unavailable.
try:
    from lightllm.models.llama.triton_kernel.flash_decoding_stage1 import flash_decode_stage1
    from lightllm.models.llama.triton_kernel.flash_decoding_stage2 import flash_decode_stage2

    HAS_LIGHTLLM_KERNEL = True
except Exception:
    # `Exception` (not a bare `except:`) keeps the best-effort fallback for any
    # import-time failure while letting KeyboardInterrupt / SystemExit propagate.
    print("install lightllm from https://github.com/ModelTC/lightllm/blob/ece7b43f8a6dfa74027adc77c2c176cff28c76c8")
    HAS_LIGHTLLM_KERNEL = False
|
||||
|
||||
|
||||
if HAS_LIGHTLLM_KERNEL:

    def token_flash_decoding(q, o_tensor, infer_state, q_head_num, head_dim, cache_k, cache_v):
        """Run lightllm's two-stage flash-decoding kernels for one decoding step.

        Stage 1 receives the query, the key/value caches and two float32 scratch
        buffers; stage 2 receives the scratch buffers and ``o_tensor``. Nothing is
        returned: the result is expected to be written into ``o_tensor`` by stage 2.

        Args:
            q: query tensor; viewed as ``(batch_size, q_head_num, head_dim)``.
            o_tensor: output tensor; viewed as ``(batch_size, q_head_num, head_dim)``
                and handed to stage 2.
            infer_state: object providing ``batch_size``, ``max_len_in_batch``,
                ``block_loc`` and ``seq_len``. The scratch buffers are cached on it
                as ``mid_o`` and ``mid_o_logexpsum``.
            q_head_num: number of query heads.
            head_dim: size of each head.
            cache_k: key cache, forwarded unchanged to stage 1.
            cache_v: value cache, forwarded unchanged to stage 1.
        """
        BLOCK_SEQ = 256
        batch_size = infer_state.batch_size
        max_len_in_batch = infer_state.max_len_in_batch

        calcu_shape1 = (batch_size, q_head_num, head_dim)

        # One scratch slot per BLOCK_SEQ-sized chunk of the longest sequence.
        block_seq_num = max_len_in_batch // BLOCK_SEQ + 1
        mid_o_shape = (batch_size, q_head_num, block_seq_num, head_dim)
        mid_o_logexpsum_shape = mid_o_shape[:3]

        mid_o = getattr(infer_state, "mid_o", None)
        mid_o_logexpsum = getattr(infer_state, "mid_o_logexpsum", None)

        # The buffers are cached on infer_state. If that object is reused across
        # calls (NOTE(review): confirm against the caller), buffers sized for an
        # earlier batch size or a shorter max_len_in_batch would be too small or
        # mis-shaped for the kernels. Reallocate whenever the cached shapes differ
        # from what a fresh allocation would produce.
        if (
            mid_o is None
            or mid_o_logexpsum is None
            or tuple(mid_o.shape) != mid_o_shape
            or tuple(mid_o_logexpsum.shape) != mid_o_logexpsum_shape
        ):
            mid_o = torch.empty(mid_o_shape, dtype=torch.float32, device="cuda")
            mid_o_logexpsum = torch.empty(mid_o_logexpsum_shape, dtype=torch.float32, device="cuda")
            infer_state.mid_o = mid_o
            infer_state.mid_o_logexpsum = mid_o_logexpsum

        flash_decode_stage1(
            q.view(calcu_shape1),
            cache_k,
            cache_v,
            infer_state.block_loc,
            infer_state.seq_len,
            infer_state.max_len_in_batch,
            mid_o,
            mid_o_logexpsum,
            BLOCK_SEQ,
        )
        flash_decode_stage2(
            mid_o,
            mid_o_logexpsum,
            infer_state.seq_len,
            o_tensor.view(calcu_shape1),
            BLOCK_SEQ,
        )
|
Reference in New Issue
Block a user