Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-07 12:01:39 +00:00)
[Inference]Add CUDA KVCache Kernel (#5406)
* add CUDA KVCache kernel
* annotate benchmark_kvcache_copy
* add use-cuda option
* fix import path
* move benchmark scripts to example/
* remove benchmark code from test_kv_cache_memcpy.py
* remove redundant code
* revise the PR according to review comments
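For orientation, the sketch below shows in plain PyTorch what a blocked KV-cache copy performs during decoding: each sequence's newly generated key (and, analogously, value) is written into its slot of a block-structured cache. The function name, tensor names, and layouts here are illustrative assumptions, not the signature of the CUDA kernel added in this commit.

# Hypothetical pure-PyTorch reference for a blocked KV-cache copy during decoding.
# Assumed layouts: k [bsz, num_heads, head_dim]; k_cache [num_blocks, num_heads, block_size, head_dim];
# block_tables [bsz, max_blocks_per_seq]; sequence_lengths [bsz] (lengths include the new token).
import torch


def copy_kv_to_blocked_cache_ref(
    k: torch.Tensor,
    k_cache: torch.Tensor,
    sequence_lengths: torch.Tensor,
    block_tables: torch.Tensor,
) -> None:
    bsz = k.size(0)
    block_size = k_cache.size(2)
    for i in range(bsz):
        token_pos = int(sequence_lengths[i]) - 1                   # position of the newly generated token
        block_id = int(block_tables[i, token_pos // block_size])   # physical block holding that position
        slot = token_pos % block_size                              # offset inside the block
        k_cache[block_id, :, slot, :] = k[i]                       # scatter the new key into its cache slot

A dedicated CUDA kernel can perform this scatter for all sequences in a single launch rather than the per-sequence Python loop used in the reference above.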
@@ -4,6 +4,7 @@ from .flash_attention import (
     FlashAttentionNpuExtension,
     FlashAttentionXformersCudaExtension,
 )
+from .inference import InferenceOpsCudaExtension
 from .layernorm import LayerNormCudaExtension
 from .moe import MoeCudaExtension
 from .optimizer import FusedOptimizerCudaExtension
@@ -15,6 +16,7 @@ ALL_EXTENSIONS = [
     LayerNormCudaExtension,
     MoeCudaExtension,
     FusedOptimizerCudaExtension,
+    InferenceOpsCudaExtension,
     ScaledMaskedSoftmaxCudaExtension,
     ScaledUpperTriangleMaskedSoftmaxCudaExtension,
     FlashAttentionDaoCudaExtension,
@@ -28,6 +30,7 @@ __all__ = [
     "LayerNormCudaExtension",
     "MoeCudaExtension",
     "FusedOptimizerCudaExtension",
+    "InferenceOpsCudaExtension",
     "ScaledMaskedSoftmaxCudaExtension",
     "ScaledUpperTriangleMaskedSoftmaxCudaExtension",
     "FlashAttentionDaoCudaExtension",
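The diff itself only registers the new op set: InferenceOpsCudaExtension is imported, appended to ALL_EXTENSIONS, and re-exported through __all__. A minimal sanity check, assuming the package is importable under the top-level name extensions:

# Verify the new extension is registered alongside the existing CUDA extensions.
from extensions import ALL_EXTENSIONS, InferenceOpsCudaExtension

assert InferenceOpsCudaExtension in ALL_EXTENSIONS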