[Inference/Feat] Feat quant kvcache step2 (#5674)

2025-09-12 20:54:35 +00:00 · 2024-04-30 11:26:36 +08:00
parent 8ccb6714e7
commit 808ee6e4ad
4 changed files with 208 additions and 71 deletions
--- a/extensions/csrc/kernel/cuda/utils/vec_copy.h
+++ b/extensions/csrc/kernel/cuda/utils/vec_copy.h
@@ -11,10 +11,9 @@ namespace colossalAI {
 namespace cuda {
 namespace utils {

-template <typename T, int VecSize>
+template <typename T, int vec_size>
 __device__ __inline__ void copy_vector(T *dst, const T *src) {
-  using VT = typename common::VecTypeTrait<T, VecSize>::Type;
-  // Note(LiuYang): Here static_cast can't be used for cast between two pointer
+  using VT = typename common::VecTypeTrait<T, vec_size>::Type;
  *(reinterpret_cast<VT *>(dst)) = *(reinterpret_cast<const VT *>(src));
 }

@@ -33,9 +32,33 @@ __device__ __inline__ void copy_zero_vector(T *dst) {
  *(reinterpret_cast<VT *>(dst)) = funcs::CastFunctor<float, VT>()(0.0f);
 }

+// Implementation detail of copy(). C++ does not allow partial specialization
+// of function templates, so the choice between "convert" and "plain copy" is
+// made through this class template, which can be partially specialized.
+//
+// Generic case: load vec_size elements of SrcT as one vector, convert the
+// vector with funcs::CastFunctor, and store it as one vector of DstT.
+template <typename SrcT, typename DstT, int vec_size>
+struct VecCopyImpl {
+  static __device__ __inline__ void run(const SrcT *src, DstT *dst) {
+    using SrcVT = typename common::VecTypeTrait<SrcT, vec_size>::Type;
+    using DstVT = typename common::VecTypeTrait<DstT, vec_size>::Type;
+    // Note(LiuYang): static_cast cannot be used to cast between two unrelated
+    // pointer types, hence reinterpret_cast.
+    *(reinterpret_cast<DstVT *>(dst)) = funcs::CastFunctor<SrcVT, DstVT>()(
+        *(reinterpret_cast<const SrcVT *>(src)));
+  }
+};
+
+// Same element type on both sides: a single vector load/store with no
+// conversion, so no CastFunctor<VT, VT> is required.
+template <typename T, int vec_size>
+struct VecCopyImpl<T, T, vec_size> {
+  static __device__ __inline__ void run(const T *src, T *dst) {
+    using VT = typename common::VecTypeTrait<T, vec_size>::Type;
+    *(reinterpret_cast<VT *>(dst)) = *(reinterpret_cast<const VT *>(src));
+  }
+};
+
+// Copies vec_size elements from src to dst as one vector access, converting
+// from SrcT to DstT when the two types differ.
+//
+// Template parameters:
+//   SrcT, DstT - element types of the source and destination buffers.
+//   vec_size   - number of elements moved by one call.
+// Parameters:
+//   src - first of the vec_size source elements.
+//   dst - first of the vec_size destination elements.
+//
+// Both pointers are dereferenced through the vector types chosen by
+// common::VecTypeTrait. NOTE(review): they presumably have to be aligned to
+// those vector types - confirm against VecTypeTrait and the callers.
+template <typename SrcT, typename DstT, int vec_size>
+__device__ __inline__ void copy(const SrcT *src, DstT *dst) {
+  VecCopyImpl<SrcT, DstT, vec_size>::run(src, dst);
+}
+
+// Explicit specialization of copy() for eight floats (256 bits).
+// A single vector access is at most 128 bits wide, so the data is moved as
+// two consecutive float4 values: elements [0, 4) and then elements [4, 8).
+// src and dst are accessed as float4, i.e. they must be 16-byte aligned.
+template <>
+__device__ __inline__ void copy<float, float, 8>(const float *src, float *dst) {
+  const float4 *src_halves = reinterpret_cast<const float4 *>(src);
+  float4 *dst_halves = reinterpret_cast<float4 *>(dst);
+  dst_halves[0] = src_halves[0];
+  dst_halves[1] = src_halves[1];
+}
+
 template <typename T>
 int get_vec_size(const torch::Tensor &tensor) {
-  uint64_t address = reinterpret_cast<uint64_t>(tensor.data_ptr<T>());
+  uint64_t address = reinterpret_cast<uint64_t>(tensor.data_ptr());
  const int max_aligned_size = 128;
  const int dtype_size = sizeof(T) * 8;