mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-06 03:20:52 +00:00
[Inference]Add CUDA KVCache Kernel (#5406)
* Add CUDA KVCache kernel
* Annotate benchmark_kvcache_copy
* Add use_cuda option
* Fix import path
* Move benchmark scripts to example/
* Remove benchmark code from test_kv_cache_memcpy.py
* Remove redundant code
* Remove redundant code
* Revise the PR according to review feedback
This commit is contained in:
@@ -8,6 +8,7 @@ from .extensions import (
|
||||
FlashAttentionNpuExtension,
|
||||
FlashAttentionXformersCudaExtension,
|
||||
FusedOptimizerCudaExtension,
|
||||
InferenceOpsCudaExtension,
|
||||
LayerNormCudaExtension,
|
||||
MoeCudaExtension,
|
||||
ScaledMaskedSoftmaxCudaExtension,
|
||||
@@ -21,6 +22,7 @@ __all__ = [
|
||||
"LayerNormLoader",
|
||||
"MoeLoader",
|
||||
"FusedOptimizerLoader",
|
||||
"InferenceOpsLoader",
|
||||
"ScaledMaskedSoftmaxLoader",
|
||||
"ScaledUpperTriangleMaskedSoftmaxLoader",
|
||||
]
|
||||
@@ -97,6 +99,10 @@ class FusedOptimizerLoader(KernelLoader):
|
||||
REGISTRY = [FusedOptimizerCudaExtension]
|
||||
|
||||
|
||||
class InferenceOpsLoader(KernelLoader):
|
||||
REGISTRY = [InferenceOpsCudaExtension]
|
||||
|
||||
|
||||
class ScaledMaskedSoftmaxLoader(KernelLoader):
|
||||
REGISTRY = [ScaledMaskedSoftmaxCudaExtension]
|
||||
|
||||
|
Reference in New Issue
Block a user