[Inference/Feat] Add convert_fp8 op for future fp8 tests (#5706)

* add convert_fp8 op for future fp8 tests

* rerun CI
Author: 傅剑寒
Date: 2024-05-10 18:39:54 +08:00
Committed by: GitHub
parent bfad39357b
commit 50104ab340
5 changed files with 197 additions and 10 deletions


@@ -75,6 +75,8 @@ void flash_decoding_attention(
     torch::Tensor& tmp_out_lse, // [num_tokens, num_heads, max_num_partitions]
     const c10::optional<torch::Tensor>& alibi_slopes, float scale);
 
+void convert_fp8(torch::Tensor& input, torch::Tensor& output);
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("decode_kv_cache_memcpy", &decode_kv_cache_memcpy,
         "Copy the GPU memory of kvcache during the decode stage.");
@@ -102,4 +104,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("flash_decoding_attention", &flash_decoding_attention,
         "Compute the attention between an input query and the cached "
         "keys/values using PagedAttention.");
+
+  m.def("convert_fp8", &convert_fp8,
+        "Convert input to fp8 output or convert fp8 input to output.");
 }
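
For reference, the binding added above is symmetric: depending on the dtypes of the two tensors, convert_fp8 either quantizes a floating-point input into an fp8 output or dequantizes an fp8 input back to floating point. A minimal usage sketch follows; the InferenceOpsLoader import path and the use of uint8 as the storage dtype for raw fp8 bytes are assumptions for illustration, not confirmed by this diff.

    import torch
    from colossalai.kernel.kernel_loader import InferenceOpsLoader  # assumed loader path

    inference_ops = InferenceOpsLoader().load()

    x = torch.randn(4, 64, dtype=torch.float16, device="cuda")
    x_fp8 = torch.empty_like(x, dtype=torch.uint8)  # assumption: fp8 bytes held in a uint8 tensor

    inference_ops.convert_fp8(x, x_fp8)       # quantize: fp16 input -> fp8 output
    x_back = torch.empty_like(x)
    inference_ops.convert_fp8(x_fp8, x_back)  # dequantize: fp8 input -> fp16 output

    # fp8 is lossy, so a round trip only agrees coarsely with the source tensor.
    assert torch.allclose(x, x_back, atol=0.5, rtol=0.25)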


@@ -17,6 +17,7 @@ class InferenceOpsCudaExtension(_CudaExtension):
                 "kernel/cuda/rms_layernorm_kernel.cu",
                 "kernel/cuda/get_cos_and_sin_kernel.cu",
                 "kernel/cuda/flash_decoding_attention_kernel.cu",
+                "kernel/cuda/convert_fp8_kernel.cu",
             ]
         ] + [self.pybind_abs_path("inference/inference.cpp")]
         return ret
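
Since the commit positions this op as groundwork for a future fp8 test, one plausible way such a test could check the kernel is against PyTorch's native fp8 dtypes (torch.float8_e4m3fn / torch.float8_e5m2, available since PyTorch 2.1). The sketch below is a hypothetical reference, not part of this change; which fp8 format the CUDA kernel actually targets is not specified in the diff.

    import torch

    def reference_fp8_roundtrip(x: torch.Tensor, fp8_dtype=torch.float8_e4m3fn) -> torch.Tensor:
        # Quantize to a native PyTorch fp8 dtype and cast back, approximating
        # what convert_fp8 followed by its inverse call should produce.
        return x.to(fp8_dtype).to(x.dtype)

    x = torch.randn(4, 64, dtype=torch.float16)
    ref = reference_fp8_roundtrip(x)
    # A future test could assert that the kernel's round trip stays close to this reference.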