[Inference]Add CUDA KVCache Kernel (#5406)

* add cuda KVCache kernel * annotation benchmark_kvcache_copy * add use cuda * fix import path * move benchmark scripts to example/ * rm benchmark codes in test_kv_cache_memcpy.py * rm redundancy codes * rm redundancy codes * pr was modified according to the review
2025-09-28 21:17:08 +00:00 · 2024-02-28 14:36:50 +08:00
parent 19061188c3
commit 600881a8ea
15 changed files with 348 additions and 75 deletions
--- a/extensions/inference/inference_ops_cuda.py
+++ b/extensions/inference/inference_ops_cuda.py
@@ -0,0 +1,30 @@
+from ..cuda_extension import _CudaExtension
+from ..utils import get_cuda_cc_flag
+
+
+class InferenceOpsCudaExtension(_CudaExtension):
+    def __init__(self):
+        super().__init__(name="inference_ops_cuda")
+
+    def sources_files(self):
+        ret = [
+            self.csrc_abs_path(fname)
+            for fname in [
+                "cuda/colossal_inference_C_frontend.cpp",
+                "cuda/decode_kv_cache_memcpy_kernel.cu",
+            ]
+        ]
+        return ret
+
+    def include_dirs(self):
+        ret = [self.get_cuda_home_include()]
+        return ret
+
+    def cxx_flags(self):
+        version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]
+        return ["-O3"] + version_dependent_macros
+
+    def nvcc_flags(self):
+        extra_cuda_flags = ["-lineinfo"]
+        extra_cuda_flags.extend(get_cuda_cc_flag())
+        return ["-O3", "--use_fast_math"] + extra_cuda_flags