Mirror of https://github.com/hpcaitech/ColossalAI.git
[Inference/Feat] Add convert_fp8 op for fp8 test in the future (#5706)
* add convert_fp8 op for fp8 test in the future
* rerun ci
@@ -75,6 +75,8 @@ void flash_decoding_attention(
     torch::Tensor& tmp_out_lse,  // [num_tokens, num_heads, max_num_partitions]
     const c10::optional<torch::Tensor>& alibi_slopes, float scale);
 
+void convert_fp8(torch::Tensor& input, torch::Tensor& output);
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("decode_kv_cache_memcpy", &decode_kv_cache_memcpy,
         "Copy the GPU memory of kvcache during the decode stage.");
@@ -102,4 +104,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("flash_decoding_attention", &flash_decoding_attention,
         "Compute the attention between an input query and the cached "
         "keys/values using PagedAttention.");
+
+  m.def("convert_fp8", &convert_fp8,
+        "Convert input to fp8 output or convert fp8 input to output.");
 }
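For context, here is a minimal usage sketch (not part of this commit) of how the newly bound op could be called from Python once the extension is compiled. Only the convert_fp8(input, output) signature comes from the diff above; the InferenceOpsLoader import path and the use of torch.uint8 as the fp8 storage dtype are assumptions.

# Hypothetical usage of the new binding; loader path and uint8 storage dtype are assumed.
import torch
from colossalai.kernel.kernel_loader import InferenceOpsLoader

inference_ops = InferenceOpsLoader().load()

x = torch.randn(16, 128, dtype=torch.float16, device="cuda")

x_fp8 = torch.empty_like(x, dtype=torch.uint8)   # fp8 values stored as raw bytes
inference_ops.convert_fp8(x, x_fp8)              # fp16 -> fp8

x_back = torch.empty_like(x)                     # same shape/dtype as the original
inference_ops.convert_fp8(x_fp8, x_back)         # fp8 -> fp16 (lossy round trip)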
@@ -17,6 +17,7 @@ class InferenceOpsCudaExtension(_CudaExtension):
                 "kernel/cuda/rms_layernorm_kernel.cu",
                 "kernel/cuda/get_cos_and_sin_kernel.cu",
                 "kernel/cuda/flash_decoding_attention_kernel.cu",
+                "kernel/cuda/convert_fp8_kernel.cu",
             ]
         ] + [self.pybind_abs_path("inference/inference.cpp")]
         return ret
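The Python hunk above is easier to follow when the surrounding list comprehension is spelled out: the kernel sources are mapped through a path helper and the pybind wrapper inference/inference.cpp is appended, so convert_fp8_kernel.cu gets compiled into the same inference_ops extension. A self-contained illustration of that pattern, with stand-in root paths in place of the extension's own helpers:

# Stand-alone sketch of the source-list pattern; CSRC_ROOT and PYBIND_ROOT are
# stand-ins for whatever self.csrc_abs_path / self.pybind_abs_path resolve to.
import os

CSRC_ROOT = "extensions/csrc"      # assumed base directory for kernel sources
PYBIND_ROOT = "extensions/pybind"  # assumed base directory for pybind wrappers

kernel_sources = [
    "kernel/cuda/rms_layernorm_kernel.cu",
    "kernel/cuda/get_cos_and_sin_kernel.cu",
    "kernel/cuda/flash_decoding_attention_kernel.cu",
    "kernel/cuda/convert_fp8_kernel.cu",  # the file registered by this commit
]

sources = [os.path.join(CSRC_ROOT, f) for f in kernel_sources] + [
    os.path.join(PYBIND_ROOT, "inference/inference.cpp")
]
print(sources)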