Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-23 02:20:49 +00:00
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
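The diff below is almost entirely mechanical reformatting from the updated hooks: double quotes instead of single quotes, spaces around the bounds of sliced expressions, and long calls, asserts, and decorators re-wrapped at the opening parenthesis. As a rough way to reproduce the effect locally (a sketch assuming the black formatter is one of the configured hooks and that a 120-character line length is used; both are inferences from the rewritten lines, not taken from the configuration files in this commit):

    import black

    old_style = "TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')\n"
    old_style += "k = qkv[:, :, d_model:d_model * 2]\n"

    # format_str applies the same rewrite the commit hook performs on the staged files
    print(black.format_str(old_style, mode=black.Mode(line_length=120)))
    # TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")
    # k = qkv[:, :, d_model : d_model * 2]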
@@ -1,19 +1,18 @@
 import math

 import numpy as np
 import torch
 from torch.nn import functional as F


 def torch_context_attention(xq, xk, xv, bs, seqlen, num_head, head_dim):
-    '''
-    adepted from https://github.com/ModelTC/lightllm/blob/main/lightllm/models/bloom/triton_kernel/context_flashattention_nopad.py#L253
-    '''
+    """
+    adepted from https://github.com/ModelTC/lightllm/blob/main/lightllm/models/bloom/triton_kernel/context_flashattention_nopad.py#L253
+    """
     xq = xq.view(bs, seqlen, num_head, head_dim)
     xk = xk.view(bs, seqlen, num_head, head_dim)
     xv = xv.view(bs, seqlen, num_head, head_dim)
     mask = torch.tril(torch.ones(seqlen, seqlen), diagonal=0).unsqueeze(0).unsqueeze(0).cuda()
-    mask[mask == 0.] = -100000000.0
+    mask[mask == 0.0] = -100000000.0
     mask = mask.repeat(bs, num_head, 1, 1)
     keys = xk
     values = xv
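The helper in this hunk is the plain-PyTorch reference for causal ("context") attention that the Triton kernels are checked against, and the hunk only shows its first half. As a self-contained sketch of the same idea, written from the visible lines and standard attention math rather than copied from the repository:

    import math

    import torch


    def naive_context_attention(xq, xk, xv, bs, seqlen, num_head, head_dim):
        # inputs are flat (bs * seqlen, num_head, head_dim); regroup to (bs, num_head, seqlen, head_dim)
        xq = xq.view(bs, seqlen, num_head, head_dim).transpose(1, 2)
        xk = xk.view(bs, seqlen, num_head, head_dim).transpose(1, 2)
        xv = xv.view(bs, seqlen, num_head, head_dim).transpose(1, 2)

        # causal mask: a query may only attend to itself and earlier positions
        mask = torch.tril(torch.ones(seqlen, seqlen, device=xq.device)).view(1, 1, seqlen, seqlen)

        scores = torch.matmul(xq, xk.transpose(-1, -2)) / math.sqrt(head_dim)
        scores = scores.masked_fill(mask == 0, float("-inf"))
        prob = torch.softmax(scores, dim=-1)

        out = torch.matmul(prob, xv)  # (bs, num_head, seqlen, head_dim)
        return out.transpose(1, 2).reshape(bs * seqlen, num_head, head_dim)


    bs, seqlen, num_head, head_dim = 2, 16, 4, 32
    q = torch.randn(bs * seqlen, num_head, head_dim)
    print(naive_context_attention(q, q.clone(), q.clone(), bs, seqlen, num_head, head_dim).shape)
    # torch.Size([32, 4, 32])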
@@ -1,27 +1,24 @@
 import math

 import pytest
 import torch
 from packaging import version
 from torch import nn
 from torch.nn import functional as F

 try:
     import triton
     import triton.language as tl
     pass

     from colossalai.kernel.triton import bloom_context_attn_fwd
     from tests.test_infer_ops.triton.kernel_utils import torch_context_attention

     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON,
-                    reason="triton requires cuda version to be higher than 11.4")
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
 def test_bloom_context_attention():
     bs = 4
     head_num = 8
@@ -46,8 +43,9 @@ def test_bloom_context_attention():

     torch_out = torch_context_attention(query.clone(), k.clone(), v.clone(), bs, seq_len, head_num, head_dim)

-    assert torch.allclose(torch_out.cpu(), o.cpu(), rtol=1e-3,
-                          atol=1e-2), "outputs from triton and torch are not matched"
+    assert torch.allclose(
+        torch_out.cpu(), o.cpu(), rtol=1e-3, atol=1e-2
+    ), "outputs from triton and torch are not matched"


 if __name__ == "__main__":
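Every test file in this commit repeats the same guard: Triton-dependent imports live in a try/except that sets HAS_TRITON, and the test is skipped when Triton is absent or the CUDA runtime is too old. A condensed, standalone sketch of that pattern (the test body is hypothetical; the extra torch.version.cuda None-check is my addition for CPU-only builds and is not in the diff):

    import pytest
    import torch
    from packaging import version

    try:
        import triton  # any Triton-dependent import belongs inside this block

        HAS_TRITON = True
    except ImportError:
        HAS_TRITON = False
        print("please install triton from https://github.com/openai/triton")

    # torch.version.cuda is None on CPU-only builds, so guard the parse as well
    TRITON_CUDA_SUPPORT = torch.version.cuda is not None and version.parse(torch.version.cuda) > version.parse("11.4")


    @pytest.mark.skipif(
        not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
    )
    def test_guarded_kernel():
        assert triton is not None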
@@ -1,25 +1,24 @@
 import pytest
 import torch
 from packaging import version
 from torch import nn

 try:
     import triton
     import triton.language as tl
     pass

     from colossalai.kernel.triton.copy_kv_cache_dest import copy_kv_cache_to_dest

     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON,
-                    reason="triton requires cuda version to be higher than 11.4")
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
 def test_kv_cache_copy_op():

     B_NTX = 32 * 2048
     head_num = 8
     head_dim = 64
@@ -31,8 +30,9 @@ def test_kv_cache_copy_op():

     copy_kv_cache_to_dest(cache, dest_index, dest_data)

-    assert torch.allclose(cache.cpu(), dest_data.cpu(), rtol=1e-3,
-                          atol=1e-3), "copy_kv_cache_to_dest outputs from triton and torch are not matched"
+    assert torch.allclose(
+        cache.cpu(), dest_data.cpu(), rtol=1e-3, atol=1e-3
+    ), "copy_kv_cache_to_dest outputs from triton and torch are not matched"


 if __name__ == "__main__":
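copy_kv_cache_to_dest scatters freshly computed key/value rows into a preallocated cache buffer at the slots named by dest_index, which is what the assertion above checks. In plain PyTorch the reference behaviour amounts to an index assignment; a minimal sketch with the shapes suggested by B_NTX, head_num and head_dim (my reading of the kernel's contract, not code from the repository):

    import torch

    B_NTX, head_num, head_dim = 32 * 2048, 8, 64

    cache = torch.randn(B_NTX, head_num, head_dim)  # source rows, e.g. newly computed K or V entries
    dest_index = torch.randperm(B_NTX)              # destination slot for each source row
    dest_data = torch.zeros_like(cache)             # preallocated KV-cache buffer

    # reference behaviour: row i of `cache` is written to slot dest_index[i] of the buffer
    dest_data[dest_index] = cache

    assert torch.allclose(dest_data[dest_index], cache)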
@@ -6,30 +6,29 @@ from colossalai.kernel.triton import layer_norm
 from colossalai.testing.utils import parameterize

 try:
     import triton
     import triton.language as tl
     pass

     from colossalai.kernel.triton.fused_layernorm import _layer_norm_fwd_fused
     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON,
-                    reason="triton requires cuda version to be higher than 11.4")
-@parameterize('M', [2, 4, 8, 16])
-@parameterize('N', [64, 128])
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
+@parameterize("M", [2, 4, 8, 16])
+@parameterize("N", [64, 128])
 def test_layer_norm(M, N):
     dtype = torch.float16
     eps = 1e-5
     x_shape = (M, N)
     w_shape = (x_shape[-1],)
-    weight = torch.rand(w_shape, dtype=dtype, device='cuda')
-    bias = torch.rand(w_shape, dtype=dtype, device='cuda')
-    x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda')
+    weight = torch.rand(w_shape, dtype=dtype, device="cuda")
+    bias = torch.rand(w_shape, dtype=dtype, device="cuda")
+    x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device="cuda")

     y_triton = layer_norm(x, weight, bias, eps)
     y_torch = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype)
@@ -1,27 +1,24 @@
 import math

 import pytest
 import torch
 from packaging import version
 from torch import nn
 from torch.nn import functional as F

 try:
     import triton
     import triton.language as tl
     pass

     from colossalai.kernel.triton import llama_context_attn_fwd
     from tests.test_infer_ops.triton.kernel_utils import torch_context_attention

     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON,
-                    reason="triton requires cuda version to be higher than 11.4")
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
 def test_llama_context_attention():
     bs = 4
     head_num = 8
@@ -45,8 +42,9 @@ def test_llama_context_attention():

     torch_out = torch_context_attention(query.clone(), k.clone(), v.clone(), bs, seq_len, head_num, head_dim)

-    assert torch.allclose(torch_out.cpu(), o.cpu(), rtol=1e-3,
-                          atol=1e-3), "outputs from triton and torch are not matched"
+    assert torch.allclose(
+        torch_out.cpu(), o.cpu(), rtol=1e-3, atol=1e-3
+    ), "outputs from triton and torch are not matched"


 if __name__ == "__main__":
@@ -1,14 +1,12 @@
 # Adapted from ModelTC https://github.com/ModelTC/lightllm

 import time

 import pytest
 import torch
 from packaging import version

 try:
     import triton
     import triton.language as tl
     pass

     from colossalai.kernel.triton.rotary_embedding_kernel import rotary_embedding_fwd

@@ -17,13 +15,13 @@ except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")


 def torch_rotary_emb(x, cos, sin):
     seq_len, h, dim = x.shape
-    x0 = x[:, :, 0:dim // 2]
-    x1 = x[:, :, dim // 2:dim]
+    x0 = x[:, :, 0 : dim // 2]
+    x1 = x[:, :, dim // 2 : dim]
     cos = cos.view((seq_len, 1, dim // 2))
     sin = sin.view((seq_len, 1, dim // 2))
     o0 = x0 * cos - x1 * sin
@@ -31,8 +29,9 @@ def torch_rotary_emb(x, cos, sin):
     return torch.cat((o0, o1), dim=-1)


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON,
-                    reason="triton requires cuda version to be higher than 11.4")
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
 def test_rotary_emb():
     SEQ_LEN = 1
     HEAD_NUM = 32
@@ -40,10 +39,10 @@ def test_rotary_emb():
     dtype = torch.half
     # create data
     x_shape = (SEQ_LEN, HEAD_NUM, HEAD_DIM)
-    x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda')
+    x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device="cuda")
     cos_shape = (SEQ_LEN, HEAD_DIM // 2)
-    cos = -1.2 + 0.5 * torch.randn(cos_shape, dtype=dtype, device='cuda')
-    sin = -2.0 + 0.5 * torch.randn(cos_shape, dtype=dtype, device='cuda')
+    cos = -1.2 + 0.5 * torch.randn(cos_shape, dtype=dtype, device="cuda")
+    sin = -2.0 + 0.5 * torch.randn(cos_shape, dtype=dtype, device="cuda")
     # forward pass
     y_torch = torch_rotary_emb(x, cos, sin)
     rotary_embedding_fwd(x, cos, sin)
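torch_rotary_emb above is the eager reference for the Triton rotary-embedding kernel; the hunk hides the second half of the rotation. A complete minimal sketch of the same computation (the o1 line is my assumption of the standard rotary formula, x0 * sin + x1 * cos, since it lies outside the visible hunk):

    import torch


    def rotary_emb_reference(x, cos, sin):
        # x: (seq_len, num_head, head_dim); cos/sin: (seq_len, head_dim // 2)
        seq_len, h, dim = x.shape
        x0, x1 = x[:, :, : dim // 2], x[:, :, dim // 2 :]
        cos = cos.view(seq_len, 1, dim // 2)
        sin = sin.view(seq_len, 1, dim // 2)
        o0 = x0 * cos - x1 * sin
        o1 = x0 * sin + x1 * cos
        return torch.cat((o0, o1), dim=-1)


    x = torch.randn(1, 32, 128)
    cos, sin = torch.randn(1, 64), torch.randn(1, 64)
    print(rotary_emb_reference(x, cos, sin).shape)  # torch.Size([1, 32, 128])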
@@ -1,24 +1,27 @@
 import pytest
-from packaging import version
 import torch
 from torch import nn
 import torch.nn.functional as F
+from packaging import version

 try:
     import triton
     import triton.language as tl
-    from colossalai.kernel.triton.self_attention_nofusion import self_attention_compute_using_triton

     from colossalai.kernel.triton.qkv_matmul_kernel import qkv_gemm_4d_kernel
+    from colossalai.kernel.triton.self_attention_nofusion import self_attention_compute_using_triton

     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")

-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4")

+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
 def test_qkv_matmul():
-    qkv = torch.randn((4, 24, 64*3), device="cuda", dtype=torch.float16)
+    qkv = torch.randn((4, 24, 64 * 3), device="cuda", dtype=torch.float16)
     scale = 1.2
     head_size = 32
     batches = qkv.shape[0]
@@ -26,7 +29,7 @@ def test_qkv_matmul():
     num_of_heads = d_model // head_size

     q = qkv[:, :, :d_model]
-    k = qkv[:, :, d_model:d_model * 2]
+    k = qkv[:, :, d_model : d_model * 2]

     q = q.view(batches, -1, num_of_heads, head_size)
     k = k.view(batches, -1, num_of_heads, head_size)
@@ -36,29 +39,40 @@ def test_qkv_matmul():
     k = torch.transpose(k, 1, 2).contiguous()
     k = torch.transpose(k, 2, 3).contiguous()

-    torch_ouput = torch.einsum('bnij,bnjk->bnik', q, k)
+    torch_ouput = torch.einsum("bnij,bnjk->bnik", q, k)
     torch_ouput *= 1.2

     q, k = q_copy, k_copy
     batches, M, H, K = q.shape
     N = k.shape[1]
-    score_output = torch.empty(
-        (batches, H, M, N), device=q.device, dtype=q.dtype)
+    score_output = torch.empty((batches, H, M, N), device=q.device, dtype=q.dtype)

     grid = lambda meta: (
         batches,
         H,
-        triton.cdiv(M, meta["BLOCK_SIZE_M"]) *
-        triton.cdiv(N, meta["BLOCK_SIZE_N"]),
+        triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(N, meta["BLOCK_SIZE_N"]),
     )

     K = q.shape[3]
     qkv_gemm_4d_kernel[grid](
-        q, k, score_output,
-        M, N, K,
-        q.stride(0), q.stride(2), q.stride(1), q.stride(3),
-        k.stride(0), k.stride(2), k.stride(3), k.stride(1),
-        score_output.stride(0), score_output.stride(1), score_output.stride(2), score_output.stride(3),
+        q,
+        k,
+        score_output,
+        M,
+        N,
+        K,
+        q.stride(0),
+        q.stride(2),
+        q.stride(1),
+        q.stride(3),
+        k.stride(0),
+        k.stride(2),
+        k.stride(3),
+        k.stride(1),
+        score_output.stride(0),
+        score_output.stride(1),
+        score_output.stride(2),
+        score_output.stride(3),
         scale=scale,
         # currently manually setting, later on we can use auto-tune config to match best setting
         BLOCK_SIZE_M=64,
@@ -69,21 +83,16 @@ def test_qkv_matmul():

     check = torch.allclose(torch_ouput.cpu(), score_output.cpu(), rtol=1e-3, atol=1e-5)
     assert check is True, "the outputs of triton and torch are not matched"


-def self_attention_compute_using_torch(qkv,
-                                       input_mask,
-                                       scale,
-                                       head_size
-                                       ):
-
+def self_attention_compute_using_torch(qkv, input_mask, scale, head_size):
     batches = qkv.shape[0]
     d_model = qkv.shape[-1] // 3
     num_of_heads = d_model // head_size


     q = qkv[:, :, :d_model]
-    k = qkv[:, :, d_model:d_model * 2]
-    v = qkv[:, :, d_model * 2:]
+    k = qkv[:, :, d_model : d_model * 2]
+    v = qkv[:, :, d_model * 2 :]
     q = q.view(batches, -1, num_of_heads, head_size)
     k = k.view(batches, -1, num_of_heads, head_size)
     v = v.view(batches, -1, num_of_heads, head_size)
@@ -94,37 +103,36 @@ def self_attention_compute_using_torch(qkv,

     k = torch.transpose(k, -1, -2).contiguous()

-    score_output = torch.einsum('bnij,bnjk->bnik', q, k)
+    score_output = torch.einsum("bnij,bnjk->bnik", q, k)
     score_output *= scale

-    softmax_output = F.softmax(score_output, dim = -1)
-    res = torch.einsum('bnij,bnjk->bnik', softmax_output, v)
+    softmax_output = F.softmax(score_output, dim=-1)
+    res = torch.einsum("bnij,bnjk->bnik", softmax_output, v)
     res = torch.transpose(res, 1, 2)
     res = res.contiguous()


     return res.view(batches, -1, d_model), score_output, softmax_output

-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4")
-def test_self_atttention_test():
-
-    qkv = torch.randn((4, 24, 64*3), device="cuda", dtype=torch.float16)
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
+def test_self_atttention_test():
+    qkv = torch.randn((4, 24, 64 * 3), device="cuda", dtype=torch.float16)
     data_output_torch, score_output_torch, softmax_output_torch = self_attention_compute_using_torch(
-        qkv.clone(),
-        input_mask = None,
-        scale = 1.2,
-        head_size = 32
-        )
+        qkv.clone(), input_mask=None, scale=1.2, head_size=32
+    )

     data_output_triton = self_attention_compute_using_triton(
-        qkv.clone(),
-        alibi=None,
-        head_size=32,
-        scale=1.2,
-        input_mask=None,
-        layer_past=None,
-        use_flash=False,
-        triangular=True)
+        qkv.clone(),
+        alibi=None,
+        head_size=32,
+        scale=1.2,
+        input_mask=None,
+        layer_past=None,
+        use_flash=False,
+        triangular=True,
+    )

     check = torch.allclose(data_output_triton.cpu(), data_output_torch.cpu(), rtol=1e-4, atol=1e-2)
     assert check is True, "the triton output is not matched with torch output"
@@ -132,4 +140,4 @@ def test_self_atttention_test():

 if __name__ == "__main__":
     test_qkv_matmul()
-    test_self_atttention_test()
+    test_self_atttention_test()
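The grid lambda that gets reflowed in test_qkv_matmul is the standard Triton launch idiom: it receives the kernel's meta-parameters and returns one program count per launch axis, so the grid can depend on tunable block sizes. A minimal self-contained sketch of the same pattern with a toy kernel (hypothetical names, not the repository's qkv_gemm_4d_kernel):

    import torch
    import triton
    import triton.language as tl


    @triton.jit
    def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
        pid = tl.program_id(axis=0)
        offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        mask = offsets < n_elements
        x = tl.load(x_ptr + offsets, mask=mask)
        y = tl.load(y_ptr + offsets, mask=mask)
        tl.store(out_ptr + offsets, x + y, mask=mask)


    x = torch.randn(4096, device="cuda")
    y = torch.randn(4096, device="cuda")
    out = torch.empty_like(x)

    # the lambda sees the meta-parameters (here BLOCK_SIZE) and returns the launch shape
    grid = lambda meta: (triton.cdiv(x.numel(), meta["BLOCK_SIZE"]),)
    add_kernel[grid](x, y, out, x.numel(), BLOCK_SIZE=256)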
@@ -1,30 +1,31 @@
 import pytest
-from packaging import version
 import torch
+from packaging import version
 from torch import nn


 try:
     import triton
     import triton.language as tl
     from colossalai.kernel.triton.softmax import softmax

     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")

-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4")

+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
 def test_softmax_op():
     data_samples = [
-        torch.randn((3, 4, 5, 32), device = "cuda", dtype = torch.float32),
-        torch.randn((320, 320, 78), device = "cuda", dtype = torch.float32),
-        torch.randn((2345, 4, 5, 64), device = "cuda", dtype = torch.float16)
-    ]
+        torch.randn((3, 4, 5, 32), device="cuda", dtype=torch.float32),
+        torch.randn((320, 320, 78), device="cuda", dtype=torch.float32),
+        torch.randn((2345, 4, 5, 64), device="cuda", dtype=torch.float16),
+    ]

     for data in data_samples:
-        module = nn.Softmax(dim = -1)
+        module = nn.Softmax(dim=-1)
         data_torch_out = module(data)
         data_triton_out = softmax(data)
         check = torch.allclose(data_torch_out.cpu(), data_triton_out.cpu(), rtol=1e-3, atol=1e-3)
@@ -32,4 +33,4 @@ def test_softmax_op():


 if __name__ == "__main__":
-    test_softmax_op()
+    test_softmax_op()
@@ -5,16 +5,16 @@ import torch
 from packaging import version

 try:
     import triton
     import triton.language as tl
     pass

     from colossalai.kernel.triton.token_attention_kernel import token_attn_fwd_1

     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")


 def torch_attn(xq, xk, bs, seqlen, num_head, head_dim):
@@ -23,8 +23,9 @@ def torch_attn(xq, xk, bs, seqlen, num_head, head_dim):
     keys = xk
     xq = xq.transpose(1, 2)
     keys = keys.transpose(1, 2)
-    scores = (torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(head_dim)).squeeze().transpose(0, 1).reshape(
-        num_head, -1)
+    scores = (
+        (torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(head_dim)).squeeze().transpose(0, 1).reshape(num_head, -1)
+    )
     return scores


@@ -37,10 +38,11 @@ def torch_attn_1(xq, xk, seqlen, num_head, head_dim):
     return logics


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON,
-                    reason="triton requires cuda version to be higher than 11.4")
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
 def test_attn_1():
     import time
     pass

     batch_size, seq_len, head_num, head_dim = 17, 1025, 12, 128

@@ -1,20 +1,18 @@
 import math

 import pytest
 import torch
 from packaging import version

 try:
     import triton
     import triton.language as tl
     pass

     from colossalai.kernel.triton.token_attention_kernel import token_attn_fwd_2

     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")


 def torch_attn(V, P, bs, seqlen, num_head, head_dim):
@@ -25,19 +23,23 @@ def torch_attn(V, P, bs, seqlen, num_head, head_dim):
     return attn_out


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON,
-                    reason="triton requires cuda version to be higher than 11.4")
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
 def test_token_attn_2():
     import time
     pass

     batch_size, seq_len, head_num, head_dim = 17, 1025, 12, 128
     dtype = torch.float16

     V = torch.empty((batch_size * seq_len, head_num, head_dim), dtype=dtype, device="cuda").normal_(mean=0.1, std=10)
-    Prob = torch.empty(
-        (head_num, batch_size * seq_len), dtype=dtype,
-        device="cuda").normal_(mean=0.4, std=0.2).reshape(head_num, batch_size,
-                                                          seq_len).softmax(-1).reshape(head_num, batch_size * seq_len)
+    Prob = (
+        torch.empty((head_num, batch_size * seq_len), dtype=dtype, device="cuda")
+        .normal_(mean=0.4, std=0.2)
+        .reshape(head_num, batch_size, seq_len)
+        .softmax(-1)
+        .reshape(head_num, batch_size * seq_len)
+    )
     attn_out = torch.empty((batch_size, head_num, head_dim), dtype=dtype, device="cuda")

     kv_cache_start_loc = torch.zeros((batch_size,), dtype=torch.int32, device="cuda")
@@ -1,20 +1,18 @@
 import time

 import pytest
 import torch
 from packaging import version

 try:
     import triton
     import triton.language as tl
     pass

     from colossalai.kernel.triton.token_attention_kernel import token_attention_fwd

     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")


 def torch_att(xq, xk, xv, bs, seqlen, num_head, head_dim):
@@ -29,10 +27,10 @@ def torch_att(xq, xk, xv, bs, seqlen, num_head, head_dim):
     return torch.sum(prob * xv, dim=1, keepdim=False)


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON,
-                    reason="triton requires cuda version to be higher than 11.4")
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
 def test():

     Z, head_num, seq_len, head_dim = 22, 112 // 8, 2048, 128
     dtype = torch.float16
     q = torch.empty((Z, head_num, head_dim), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
@@ -3,22 +3,22 @@ import torch
 from packaging import version

 try:
     import triton
     import triton.language as tl
     pass

     from colossalai.kernel.triton.token_attention_kernel import token_attn_softmax_fwd

     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
     print("please install triton from https://github.com/openai/triton")

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON,
-                    reason="triton requires cuda version to be higher than 11.4")
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
 def test_softmax():

     import torch

     batch_size, seq_len, head_num, head_dim = 4, 1025, 12, 128