Mirror of https://github.com/hpcaitech/ColossalAI.git
[Inference/Feat] Add convert_fp8 op for fp8 test in the future (#5706)
* add convert_fp8 op for fp8 test in the future
* rerun ci
@@ -75,6 +75,8 @@ void flash_decoding_attention(
     torch::Tensor& tmp_out_lse,  // [num_tokens, num_heads, max_num_partitions]
     const c10::optional<torch::Tensor>& alibi_slopes, float scale);
 
+void convert_fp8(torch::Tensor& input, torch::Tensor& output);
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("decode_kv_cache_memcpy", &decode_kv_cache_memcpy,
         "Copy the GPU memory of kvcache during the decode stage.");
@@ -102,4 +104,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("flash_decoding_attention", &flash_decoding_attention,
         "Compute the attention between an input query and the cached "
         "keys/values using PagedAttention.");
+
+  m.def("convert_fp8", &convert_fp8,
+        "Convert input to fp8 output or convert fp8 input to output.");
 }
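For context, here is a minimal usage sketch (not part of this commit) of how the newly bound op could be called from Python once the extension is compiled. Only the convert_fp8(input, output) signature comes from the diff above; the InferenceOpsLoader import path and the use of torch.uint8 as the fp8 storage dtype are assumptions.

# Hypothetical usage of the new binding; loader path and uint8 storage dtype are assumed.
import torch
from colossalai.kernel.kernel_loader import InferenceOpsLoader

inference_ops = InferenceOpsLoader().load()

x = torch.randn(16, 128, dtype=torch.float16, device="cuda")

x_fp8 = torch.empty_like(x, dtype=torch.uint8)   # fp8 values stored as raw bytes
inference_ops.convert_fp8(x, x_fp8)              # fp16 -> fp8

x_back = torch.empty_like(x)                     # same shape/dtype as the original
inference_ops.convert_fp8(x_fp8, x_back)         # fp8 -> fp16 (lossy round trip)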
@@ -17,6 +17,7 @@ class InferenceOpsCudaExtension(_CudaExtension):
                 "kernel/cuda/rms_layernorm_kernel.cu",
                 "kernel/cuda/get_cos_and_sin_kernel.cu",
                 "kernel/cuda/flash_decoding_attention_kernel.cu",
+                "kernel/cuda/convert_fp8_kernel.cu",
             ]
         ] + [self.pybind_abs_path("inference/inference.cpp")]
         return ret
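The Python hunk above is easier to follow when the surrounding list comprehension is spelled out: the kernel sources are mapped through a path helper and the pybind wrapper inference/inference.cpp is appended, so convert_fp8_kernel.cu gets compiled into the same inference_ops extension. A self-contained illustration of that pattern, with stand-in root paths in place of the extension's own helpers:

# Stand-alone sketch of the source-list pattern; CSRC_ROOT and PYBIND_ROOT are
# stand-ins for whatever self.csrc_abs_path / self.pybind_abs_path resolve to.
import os

CSRC_ROOT = "extensions/csrc"      # assumed base directory for kernel sources
PYBIND_ROOT = "extensions/pybind"  # assumed base directory for pybind wrappers

kernel_sources = [
    "kernel/cuda/rms_layernorm_kernel.cu",
    "kernel/cuda/get_cos_and_sin_kernel.cu",
    "kernel/cuda/flash_decoding_attention_kernel.cu",
    "kernel/cuda/convert_fp8_kernel.cu",  # the file registered by this commit
]

sources = [os.path.join(CSRC_ROOT, f) for f in kernel_sources] + [
    os.path.join(PYBIND_ROOT, "inference/inference.cpp")
]
print(sources)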