[feature] add gptq for inference (#4754)

* [gptq] add gptq kernel (#4416) * add gptq * refactor code * fix tests * replace auto-gptq * rname inferance/quant * refactor test * add auto-gptq as an option * reset requirements * change assert and check auto-gptq * add import warnings * change test flash attn version * remove example * change requirements of flash_attn * modify tests * [skip ci] change requirements-test * [gptq] faster gptq cuda kernel (#4494) * [skip ci] add cuda kernels * add license * [skip ci] fix max_input_len * format files & change test size * [skip ci] * [gptq] add gptq tensor parallel (#4538) * add gptq tensor parallel * add gptq tp * delete print * add test gptq check * add test auto gptq check * [gptq] combine gptq and kv cache manager (#4706) * combine gptq and kv cache manager * add init bits * delete useless code * add model path * delete usless print and update test * delete usless import * move option gptq to shard config * change replace linear to shardformer * update bloom policy * delete useless code * fix import bug and delete uselss code * change colossalai/gptq to colossalai/quant/gptq * update import linear for tests * delete useless code and mv gptq_kernel to kernel directory * fix triton kernel * add triton import
2025-09-15 22:19:38 +00:00 · 2023-09-22 11:02:50 +08:00
parent 1e0e080837
commit 946ab56c48
30 changed files with 3120 additions and 2 deletions
--- a/colossalai/kernel/triton/gptq_triton.py
+++ b/colossalai/kernel/triton/gptq_triton.py
@@ -0,0 +1,541 @@
+# Adapted from AutoGPTQ auto_gptq: https://github.com/PanQiWei/AutoGPTQ
+
+import torch
+import triton
+import triton.language as tl
+from auto_gptq.nn_modules.triton_utils import custom_autotune
+
+
+@triton.jit
+def tanh(x):
+    """Hyperbolic tangent written through the sigmoid: tanh(x) = 2 * sigmoid(2 * x) - 1."""
+    doubled = 2 * x
+    return 2 * tl.sigmoid(doubled) - 1
+
+
+@triton.jit
+def cosh(x):
+    """Hyperbolic cosine, (e^x + e^-x) / 2, where e^-x is obtained as 1 / e^x."""
+    grow = tl.exp(x)
+    decay = 1.0 / grow
+    return (grow + decay) * 0.5
+
+
+# Triton implementations of the most commonly used activations.
+# See for instance http://arxiv.org/abs/1606.08415 for an overview.
+
+
+# ReLU
+@triton.jit
+def relu(x):
+    """
+    ReLU_ activation: entries with x >= 0 pass through, all others become 0.0
+
+    .. _ReLU: https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
+    """
+    keep = x >= 0
+    return tl.where(keep, x, 0.0)
+
+
+@triton.jit
+def squared_relu(x):
+    """
+    Squared ReLU activation from the Primer_ paper: x * x where x > 0, else 0.0
+
+    .. _Primer: https://arxiv.org/abs/2109.08668
+    """
+    squared = x * x
+    positive = x > 0.0
+    return tl.where(positive, squared, 0.0)
+
+
+@triton.jit
+def star_relu(x):
+    """
+    Star ReLU activation from the "MetaFormer Baselines for Vision"_ paper:
+    0.8944 * relu(x)^2 - 0.4472 with the scale and shift fixed as constants.
+
+    .. _ "MetaFormer Baselines for Vision": https://arxiv.org/pdf/2210.13452.pdf
+    """
+    squared = x * x
+    positive = x > 0.0
+    gated = tl.where(positive, squared, 0.0)
+    return 0.8944 * gated - 0.4472
+
+
+# Leaky ReLU
+@triton.jit
+def leaky_relu(x):
+    """
+    LeakyReLU_ activation with a fixed negative slope of 0.01
+
+    .. _LeakyReLU: https://pytorch.org/docs/stable/generated/torch.nn.LeakyReLU.html
+    """
+    leaked = 0.01 * x
+    non_negative = x >= 0.0
+    return tl.where(non_negative, x, leaked)
+
+
+@triton.jit
+def gelu(x):
+    """
+    GeLU_ activation - Gaussian error linear unit (tanh approximation):
+    0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
+
+    .. _GeLU: https://arxiv.org/pdf/1606.08415.pdf
+    """
+    # sqrt(2 / pi) written as a literal: this file defines no module-level
+    # `_kAlpha`, so referencing that name made the ACT_TYPE == 2 path fail
+    # as soon as the kernel was compiled.
+    k_alpha = 0.7978845608028654
+    return 0.5 * x * (1 + tanh(k_alpha * (x + 0.044715 * x * x * x)))
+
+
+@triton.jit
+def smelu(x):
+    """
+    SmeLU_ activation -  Smooth ReLU with beta=2.0
+
+    Quadratic (x + beta)^2 / (4 * beta) where |x| <= beta, identity where
+    x >= beta, and 0.0 everywhere else.
+
+    .. _SmeLU: https://arxiv.org/pdf/2202.06499.pdf
+    """
+    beta = 2.0
+
+    # value used outside the transition band: x above it, 0.0 below it
+    outside_band = tl.where(x >= beta, x, 0.0)
+    shifted = x + beta
+    inside_band = shifted * shifted / (4.0 * beta)
+    in_band = tl.abs(x) <= beta
+    return tl.where(in_band, inside_band, outside_band)
+
+
+@triton.jit
+def silu(x):
+    """SiLU activation: x multiplied by sigmoid(x)."""
+    gate = tl.sigmoid(x)
+    return x * gate
+
+
+@custom_autotune.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=8
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8}, num_stages=3, num_warps=8
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=4
+        ),
+    ],
+    key=["M", "N", "K"],
+    nearest_power_of_two=True,
+    prune_configs_by={
+        "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
+        "perf_model": None,
+        "top_k": None,
+    },
+)
+@triton.jit
+def cai_gptq_matmul_248_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    scales_ptr,
+    zeros_ptr,
+    bias_ptr,
+    residual_ptr,
+    M,
+    N,
+    K,
+    bits,
+    maxq,
+    gptq_group_size,
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_scales,
+    stride_zeros,
+    QKV_FUSED: tl.constexpr,
+    ADD_BIAS: tl.constexpr,
+    ADD_RESIDUAL: tl.constexpr,
+    ACT_TYPE: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """
+    Fused GPTQ matmul: unpack and dequantize B block by block, accumulate A x B
+    in float32, then optionally add a bias, apply an activation and add a
+    residual before storing the tile to C.
+
+    Shapes as stated by the original authors (nothing here checks them):
+    A is of shape (M, K) float16
+    B is of shape (K//8, N) int32
+    C is of shape (M, N) float16
+    scales is of shape (G, N) float16
+    zeros is of shape (G, N) float16
+
+    NOTE(review): the code packs 32 // bits values per word, so "K//8" matches
+    bits == 4 only. zeros is shifted and masked like a packed integer and is
+    indexed by column offs_bn // (32 // bits), so it is presumably a packed
+    integer tensor rather than float16 -- confirm against the caller.
+
+    Compile-time switches:
+    ADD_BIAS     - add bias (indexed per output column) to the accumulator
+    ACT_TYPE     - 1: relu, 2: gelu, 3: silu, any other value: no activation
+    ADD_RESIDUAL - add the residual tile, addressed with the strides of C
+    QKV_FUSED    - accepted but not read by the kernel body
+
+    The 1D launch grid may contain several consecutive sets of
+    cdiv(M, BLOCK_SIZE_M) * cdiv(N, BLOCK_SIZE_N) programs; the index of the
+    set (qkv_offset) offsets the B, scales, zeros, bias and C pointers.
+    """
+    # number of quantized values packed into one 32-bit word
+    infearure_per_bits = 32 // bits
+
+    pid = tl.program_id(axis=0)
+    NK = K
+
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_k = tl.cdiv(NK, BLOCK_SIZE_K)
+    # which set of (num_pid_m * num_pid_n) tiles this program belongs to,
+    # and its position inside that set
+    qkv_offset = pid // (num_pid_m * num_pid_n)
+    pid = pid % (num_pid_m * num_pid_n)
+    # map the in-set pid to a (pid_m, pid_n) tile, taking up to GROUP_SIZE_M
+    # row tiles together before moving on to the next group of rows
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
+
+    # only the row index is bounded; the K direction of A is not masked
+    a_mask = offs_am[:, None] < M
+    # infearure_per_bits consecutive k indices point at the same packed word
+    # of B (offs_k // infearure_per_bits); the shift below picks the field
+    b_ptrs = (
+        b_ptr
+        + qkv_offset * N * NK // infearure_per_bits
+        + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
+    )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
+    # per-column base pointers; the group row is added when loading
+    scales_ptrs = scales_ptr + qkv_offset * NK * N // gptq_group_size + offs_bn[None, :]
+    zeros_ptrs = (
+        zeros_ptr
+        + qkv_offset * NK * N // gptq_group_size // infearure_per_bits
+        + (offs_bn[None, :] // infearure_per_bits)
+    )
+
+    # bit offset of each value inside its 32-bit word: along K for B,
+    # along N for the zeros
+    shifter = (offs_k % infearure_per_bits) * bits
+    zeros_shifter = (offs_bn % infearure_per_bits) * bits
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    # quantization-group row for every k of the current K block
+    g_idx_base = tl.arange(0, BLOCK_SIZE_K)
+    g_idx_base = g_idx_base // gptq_group_size
+    g_idx = g_idx_base
+
+    # NOTE(review): these loads are repeated with the same g_idx at the top of
+    # the first loop iteration, so the values computed here are overwritten
+    # before they are used.
+    scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+    zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+    zeros = (zeros >> zeros_shifter[None, :]) & maxq
+    zeros = zeros + 1
+
+    for k in range(0, num_pid_k):
+        # Fetch the scales and zeros of the groups covered by this K block.
+        # NOTE(review): scales, zeros and b are loaded without a mask --
+        # presumably N and K are multiples of the block sizes; confirm.
+        scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+        zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+        zeros = (zeros >> zeros_shifter[None, :]) & maxq
+        # the unpacked zero point is used with an offset of one
+        # (presumably the GPTQ storage convention -- confirm)
+        zeros = zeros + 1
+        # rows at or beyond M are read as 0.0
+        a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
+        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
+        # Now we need to unpack b (which is N-bit values) into 32-bit values
+        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
+        b = (b - zeros).to(tl.float16) * scales  # Scale and shift
+        accumulator += tl.dot(a, b)
+
+        # NOTE(review): A advances by BLOCK_SIZE_K elements without multiplying
+        # by stride_ak, which is a full K step only when stride_ak == 1.
+        a_ptrs += BLOCK_SIZE_K
+        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
+        # group rows for the next K block
+        g_idx = g_idx_base + ((k + 1) * BLOCK_SIZE_K) // gptq_group_size
+
+    c_ptrs = c_ptr + qkv_offset * M * N + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
+    c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
+
+    if ADD_BIAS:
+        bias_mask = offs_bn < N
+        # NOTE(review): offs_bn is shifted in place, so the residual addressing
+        # below also sees the qkv_offset * N shift, but only when ADD_BIAS is
+        # set; c_ptrs and c_mask were computed before the shift.
+        offs_bn += qkv_offset * N
+        bias_ptrs = bias_ptr + stride_cn * offs_bn
+        bias = tl.load(bias_ptrs, mask=bias_mask, other=0.0)  # (BLOCK_SIZE_N,)
+        accumulator += bias[None, :]
+
+    if ACT_TYPE == 1:
+        accumulator = relu(accumulator)
+    elif ACT_TYPE == 2:
+        accumulator = gelu(accumulator)
+    elif ACT_TYPE == 3:
+        accumulator = silu(accumulator)
+
+    if ADD_RESIDUAL:
+        # the residual is addressed with the strides of C and has no
+        # qkv_offset * M * N term of its own
+        residual_ptrs = residual_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
+        res = tl.load(residual_ptrs, mask=c_mask, other=0.0)
+        accumulator += res
+
+    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
+@custom_autotune.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=8
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8}, num_stages=3, num_warps=8
+        ),
+        triton.Config(
+            {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=4
+        ),
+    ],
+    key=["M", "N", "K"],
+    nearest_power_of_two=True,
+    prune_configs_by={
+        "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
+        "perf_model": None,
+        "top_k": None,
+    },
+)
+@triton.jit
+def cai_gptq_idx_matmul_248_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    scales_ptr,
+    zeros_ptr,
+    idx_ptr,
+    bias_ptr,
+    residual_ptr,
+    M,
+    N,
+    K,
+    bits,
+    maxq,
+    gptq_group_size,
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_scales,
+    stride_zeros,
+    QKV_FUSED: tl.constexpr,
+    ADD_BIAS: tl.constexpr,
+    ADD_RESIDUAL: tl.constexpr,
+    ACT_TYPE: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """
+    Fused GPTQ matmul that reads the quantization group of every input
+    feature from idx_ptr instead of deriving it from the feature index.
+    B is unpacked and dequantized block by block, A x B is accumulated in
+    float32, then bias, activation and residual are optionally applied before
+    the tile is stored to C.
+
+    Shapes as stated by the original authors (nothing here checks them):
+    A is of shape (M, K) float16
+    B is of shape (K//8, N) int32
+    C is of shape (M, N) float16
+    scales is of shape (G, N) float16
+    zeros is of shape (G, N) float16
+
+    NOTE(review): the code packs 32 // bits values per word, so "K//8" matches
+    bits == 4 only. zeros is shifted and masked like a packed integer and is
+    indexed by column offs_bn // (32 // bits), so it is presumably a packed
+    integer tensor rather than float16 -- confirm against the caller.
+
+    Compile-time switches:
+    ADD_BIAS     - add bias (indexed per output column) to the accumulator
+    ACT_TYPE     - 1: relu, 2: gelu, 3: silu, any other value: no activation
+    ADD_RESIDUAL - add the residual tile, addressed with the strides of C
+    QKV_FUSED    - accepted but not read by the kernel body
+
+    The 1D launch grid may contain several consecutive sets of
+    cdiv(M, BLOCK_SIZE_M) * cdiv(N, BLOCK_SIZE_N) programs; the index of the
+    set (qkv_offset) offsets the B, scales, bias and C pointers.
+    """
+    # number of quantized values packed into one 32-bit word
+    infearure_per_bits = 32 // bits
+
+    pid = tl.program_id(axis=0)
+    # the reduction length is always the full K, also in the fused QKV case
+    NK = K
+
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_k = tl.cdiv(NK, BLOCK_SIZE_K)
+    # which set of (num_pid_m * num_pid_n) tiles this program belongs to,
+    # and its position inside that set
+    qkv_offset = pid // (num_pid_m * num_pid_n)
+    pid = pid % (num_pid_m * num_pid_n)
+    # map the in-set pid to a (pid_m, pid_n) tile, taking up to GROUP_SIZE_M
+    # row tiles together before moving on to the next group of rows
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
+
+    # only the row index is bounded; the K direction of A is not masked
+    a_mask = offs_am[:, None] < M
+    # infearure_per_bits consecutive k indices point at the same packed word
+    # of B (offs_k // infearure_per_bits); the shift below picks the field
+    b_ptrs = (
+        b_ptr
+        + qkv_offset * N * NK // infearure_per_bits
+        + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
+    )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
+    # per-column base pointers; the group row is added when loading
+    scales_ptrs = scales_ptr + qkv_offset * NK * N // gptq_group_size + offs_bn[None, :]
+    zeros_ptrs = (
+        zeros_ptr
+        + qkv_offset * NK * N // gptq_group_size // infearure_per_bits
+        + (offs_bn[None, :] // infearure_per_bits)
+    )
+
+    # bit offset of each value inside its 32-bit word: along K for B,
+    # along N for the zeros
+    shifter = (offs_k % infearure_per_bits) * bits
+    zeros_shifter = (offs_bn % infearure_per_bits) * bits
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    # group indices are read from idx_ptr, one per k of the current K block;
+    # no qkv_offset is applied to idx_ptr
+    g_ptrs = idx_ptr + offs_k
+    g_idx = tl.load(g_ptrs)
+    # NOTE(review): this reassignment replaces the zeros_ptrs built above and
+    # drops its qkv_offset term, while scales_ptrs keeps the matching term --
+    # confirm this is intended for the fused QKV case.
+    zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
+
+    # NOTE(review): g_idx and scales are loaded again at the top of the first
+    # loop iteration, so the values loaded before the loop are overwritten
+    # before they are used.
+    scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+
+    for k in range(0, num_pid_k):
+        # Fetch the group indices of this K block, then the scales and zeros
+        # of those groups.
+        # NOTE(review): g_idx, scales, zeros and b are loaded without a mask --
+        # presumably N and K are multiples of the block sizes; confirm.
+        g_idx = tl.load(g_ptrs)
+        scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+        zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+
+        zeros = (zeros >> zeros_shifter[None, :]) & maxq
+        # the unpacked zero point is used with an offset of one
+        # (presumably the GPTQ storage convention -- confirm)
+        zeros = zeros + 1
+
+        # rows at or beyond M are read as 0.0
+        a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
+        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
+        # Now we need to unpack b (which is N-bit values) into 32-bit values
+        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
+        b = (b - zeros).to(tl.float16) * scales  # Scale and shift
+        accumulator += tl.dot(a, b)
+
+        # NOTE(review): A advances by BLOCK_SIZE_K elements without multiplying
+        # by stride_ak, which is a full K step only when stride_ak == 1.
+        a_ptrs += BLOCK_SIZE_K
+        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
+        g_ptrs += BLOCK_SIZE_K
+
+    c_ptrs = c_ptr + qkv_offset * M * N + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
+    c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
+
+    if ADD_BIAS:
+        bias_mask = offs_bn < N
+        # NOTE(review): offs_bn is shifted in place, so the residual addressing
+        # below also sees the qkv_offset * N shift, but only when ADD_BIAS is
+        # set; c_ptrs and c_mask were computed before the shift.
+        offs_bn += qkv_offset * N
+        bias_ptrs = bias_ptr + stride_cn * offs_bn
+        bias = tl.load(bias_ptrs, mask=bias_mask, other=0.0)  # (BLOCK_SIZE_N,)
+        accumulator += bias[None, :]
+
+    if ACT_TYPE == 1:
+        accumulator = relu(accumulator)
+    elif ACT_TYPE == 2:
+        accumulator = gelu(accumulator)
+    elif ACT_TYPE == 3:
+        accumulator = silu(accumulator)
+
+    if ADD_RESIDUAL:
+        # the residual is addressed with the strides of C and has no
+        # qkv_offset * M * N term of its own
+        residual_ptrs = residual_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
+        res = tl.load(residual_ptrs, mask=c_mask, other=0.0)
+        accumulator += res
+
+    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
+def gptq_fused_linear_triton(
+    input,
+    qweight,
+    scales,
+    qzeros,
+    bias,
+    residual,
+    bits,
+    maxq,
+    gptq_group_size,
+    qkv_fused,
+    add_bias,
+    add_residual,
+    g_idx=None,
+    act_type=0,
+):
+    """
+    Launch the fused GPTQ linear Triton kernel and return its float16 output.
+
+    Args:
+        input: CUDA activation tensor; shape[0] is passed as M, shape[1] as K.
+        qweight: CUDA packed weight tensor; shape[1] is passed as N.
+        scales: CUDA tensor of quantization scales.
+        qzeros: CUDA tensor of quantization zero points.
+        bias: forwarded to the kernel, which reads it only when add_bias is set.
+        residual: forwarded to the kernel, which reads it only when
+            add_residual is set.
+        bits: forwarded to the kernel as the width of one quantized value.
+        maxq: forwarded to the kernel as the mask applied after unpacking.
+        gptq_group_size: forwarded to the kernel as the quantization group size.
+        qkv_fused: when truthy, the grid and the output rows are tripled and the
+            result is returned as a (3, M, N) view.
+        add_bias: enables the bias addition in the kernel.
+        add_residual: enables the residual addition in the kernel.
+        g_idx: optional group-index tensor; when given, the kernel variant that
+            reads group indices from memory is launched.
+        act_type: activation selector forwarded as ACT_TYPE.
+
+    Returns:
+        A float16 tensor of shape (M, N), or (3, M, N) when qkv_fused is truthy.
+    """
+    assert input.is_cuda, "input is not in cuda"
+    assert qweight.is_cuda, "qweight is not in cuda"
+    assert scales.is_cuda, "scales is not in cuda"
+    assert qzeros.is_cuda, "qzeros is not in cuda"
+
+    with torch.cuda.device(input.device):
+        rows = input.shape[0]
+        out_features = qweight.shape[1]
+        # the fused QKV path runs three stacked problems in a single launch
+        num_problems = 3 if qkv_fused else 1
+
+        def grid(META):
+            tiles_m = triton.cdiv(rows, META["BLOCK_SIZE_M"])
+            tiles_n = triton.cdiv(out_features, META["BLOCK_SIZE_N"])
+            return (tiles_m * tiles_n * num_problems,)
+
+        output = torch.empty((rows * num_problems, out_features), device=input.device, dtype=torch.float16)
+
+        # arguments shared by both kernel variants, in signature order after
+        # the tensor pointers
+        shared_args = (
+            rows,
+            out_features,
+            input.shape[1],
+            bits,
+            maxq,
+            gptq_group_size,
+            input.stride(0),
+            input.stride(1),
+            qweight.stride(0),
+            qweight.stride(1),
+            output.stride(0),
+            output.stride(1),
+            scales.stride(0),
+            qzeros.stride(0),
+        )
+        switches = {
+            "QKV_FUSED": qkv_fused,
+            "ADD_BIAS": add_bias,
+            "ADD_RESIDUAL": add_residual,
+            "ACT_TYPE": act_type,
+        }
+
+        if g_idx is None:
+            cai_gptq_matmul_248_kernel[grid](
+                input, qweight, output, scales, qzeros, bias, residual, *shared_args, **switches
+            )
+        else:
+            cai_gptq_idx_matmul_248_kernel[grid](
+                input, qweight, output, scales, qzeros, g_idx, bias, residual, *shared_args, **switches
+            )
+
+        if not qkv_fused:
+            return output
+        return output.view(3, rows, out_features)