mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-12 20:54:35 +00:00
[Inference/Feat] Feat quant kvcache step2 (#5674)
This commit is contained in:
@@ -11,10 +11,9 @@ namespace colossalAI {
|
||||
namespace cuda {
|
||||
namespace utils {
|
||||
|
||||
// Copies one vector of `vec_size` elements of type T from `src` to `dst` with
// a single assignment of the vector type common::VecTypeTrait<T, vec_size>.
//
// Template parameters:
//   T        - scalar element type.
//   vec_size - number of T elements moved by the one vector assignment.
// Parameters:
//   dst - destination; written through a pointer to the vector type.
//   src - source; read through a pointer to the vector type.
//
// NOTE(review): both pointers presumably have to satisfy the alignment of the
// vector type — confirm against the callers.
template <typename T, int vec_size>
__device__ __inline__ void copy_vector(T *dst, const T *src) {
  using VT = typename common::VecTypeTrait<T, vec_size>::Type;
  // Note(LiuYang): static_cast cannot convert between two unrelated pointer
  // types, so reinterpret_cast is used here.
  *(reinterpret_cast<VT *>(dst)) = *(reinterpret_cast<const VT *>(src));
}
|
||||
|
||||
@@ -33,9 +32,33 @@ __device__ __inline__ void copy_zero_vector(T *dst) {
|
||||
*(reinterpret_cast<VT *>(dst)) = funcs::CastFunctor<float, VT>()(0.0f);
|
||||
}
|
||||
|
||||
// Converting vector copy: reads one SrcT vector of `vec_size` elements from
// `src`, passes it through funcs::CastFunctor, and stores the resulting DstT
// vector to `dst`.
//
// Template parameters:
//   SrcT     - scalar element type of the source.
//   DstT     - scalar element type of the destination.
//   vec_size - number of elements handled by the one vector load/store.
//
// NOTE(review): `src` and `dst` presumably must be aligned for their
// respective vector types — confirm against the callers.
template <typename SrcT, typename DstT, int vec_size>
__device__ __inline__ void copy(const SrcT *src, DstT *dst) {
  using SrcVec = typename common::VecTypeTrait<SrcT, vec_size>::Type;
  using DstVec = typename common::VecTypeTrait<DstT, vec_size>::Type;

  // Note(LiuYang): static_cast cannot convert between two unrelated pointer
  // types, so the scalar pointers are reinterpreted as vector pointers.
  const SrcVec *src_vec = reinterpret_cast<const SrcVec *>(src);
  DstVec *dst_vec = reinterpret_cast<DstVec *>(dst);

  *dst_vec = funcs::CastFunctor<SrcVec, DstVec>()(*src_vec);
}
|
||||
|
||||
// Same-type vector copy: moves one vector of `vec_size` elements of type T
// from `src` to `dst` without any conversion.
//
// This is an overload rather than a `copy<T, T, vec_size>` partial
// specialization, because C++ does not permit partial specialization of
// function templates. Select it with `copy<T, vec_size>(src, dst)`; a call
// spelled with three template arguments cannot match this two-parameter
// template.
//
// NOTE(review): `src` and `dst` presumably must be aligned for the vector
// type — confirm against the callers.
template <typename T, int vec_size>
__device__ __inline__ void copy(const T *src, T *dst) {
  using VT = typename common::VecTypeTrait<T, vec_size>::Type;
  // reinterpret_cast is required: static_cast cannot convert between two
  // unrelated pointer types.
  *(reinterpret_cast<VT *>(dst)) = *(reinterpret_cast<const VT *>(src));
}
|
||||
|
||||
// float -> float copy of eight elements.
//
// The maximum memory alignment length is 128 bits, so the eight floats are
// moved as two float4 transfers: elements [0, 4) first, then elements [4, 8).
template <>
__device__ __inline__ void copy<float, float, 8>(const float *src, float *dst) {
  const float4 *src_quads = reinterpret_cast<const float4 *>(src);
  float4 *dst_quads = reinterpret_cast<float4 *>(dst);

  dst_quads[0] = src_quads[0];  // floats 0..3
  dst_quads[1] = src_quads[1];  // floats 4..7
}
|
||||
|
||||
template <typename T>
|
||||
int get_vec_size(const torch::Tensor &tensor) {
|
||||
uint64_t address = reinterpret_cast<uint64_t>(tensor.data_ptr<T>());
|
||||
uint64_t address = reinterpret_cast<uint64_t>(tensor.data_ptr());
|
||||
const int max_aligned_size = 128;
|
||||
const int dtype_size = sizeof(T) * 8;
|
||||
|
||||
|
Reference in New Issue
Block a user