diff --git a/extensions/csrc/cuda/activation_kernel.cu b/extensions/csrc/cuda/activation_kernel.cu
index 2745e5fbd..a65a3df8e 100644
--- a/extensions/csrc/cuda/activation_kernel.cu
+++ b/extensions/csrc/cuda/activation_kernel.cu
@@ -37,6 +37,8 @@ __global__ void act_and_mul_kernel(
 // silu(x[:half_1stdim]) * (x[half_1stdim:])
 torch::Tensor silu_and_mul(const torch::Tensor& ins)
 {
+  // Note(LiuYang): According to the torch docs, vec() may cost a lot, but I didn't find a better API
+  // to manipulate ins_shape, which is an IntArrayRef
   auto ins_shape = ins.sizes().vec();
 
   ins_shape[0] = ins_shape[0]/2;
@@ -44,18 +46,21 @@ torch::Tensor silu_and_mul(const torch::Tensor& ins)
     ins_shape.erase(ins_shape.begin());
   }
   auto outs = torch::zeros(ins_shape,ins.options());
-  auto outs_shape = ins.sizes().vec();
 
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   // Note(Liuyang): numel of ins must be divisible by 2
   int64_t numel = ((torch::numel(ins)) >> 1);
 
-  // TODO(LiuYang): Maybe we need to implement a function to get launch config
-  colossalAI::cuda::utils::NVGPUDevInfo dev_info(0);
-  auto config = colossalAI::cuda::utils::GetGPULaunchConfig1D(dev_info,numel,1);
-  dim3 grid = config.grid;
-  dim3 block = config.block;
+  // Note(LiuYang): For better performance in the special case where the input is [2, 64, 11008],
+  // this part is commented out for now, because computing a better config also costs a little time
+  // colossalAI::cuda::utils::NVGPUDevInfo dev_info(0);
+  // auto config = colossalAI::cuda::utils::GetGPULaunchConfig1D(dev_info,numel,1);
+  // dim3 grid = config.grid;
+  // dim3 block = config.block;
+
+  dim3 grid((numel+255)/256);
+  dim3 block(256);
 
   DISPATCH_FLOAT_HALF_AND_BFLOAT(
     ins.scalar_type(),
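
Note on the hardcoded launch config: below is a minimal standalone sketch of the arithmetic the new grid/block lines perform, using the [2, 64, 11008] shape mentioned in the comment. The ceil_div helper and the main() driver are illustrative only and are not part of this patch.

#include <cstdint>
#include <cstdio>

// Illustrative helper mirroring the (numel + 255) / 256 expression in the diff.
static inline int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }

int main() {
  // For an input of shape [2, 64, 11008], half of the elements are written out,
  // so numel = (2 * 64 * 11008) >> 1 = 704512.
  const int64_t numel = (2LL * 64 * 11008) >> 1;
  const int64_t block = 256;                     // fixed block size from the patch
  const int64_t grid  = ceil_div(numel, block);  // 2752 blocks for this shape
  std::printf("grid=%lld block=%lld\n", (long long)grid, (long long)block);
  return 0;
}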