[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
2025-09-03 10:06:44 +00:00 · 2023-09-19 14:20:26 +08:00
parent 3c6b831c26
commit 079bf3cb26
1268 changed files with 50037 additions and 38444 deletions
--- a/colossalai/kernel/triton/fused_layernorm.py
+++ b/colossalai/kernel/triton/fused_layernorm.py
@@ -3,6 +3,7 @@ import torch
 try:
    import triton
    import triton.language as tl
+
    HAS_TRITON = True
 except ImportError:
    HAS_TRITON = False
@@ -14,13 +15,13 @@ if HAS_TRITON:

    @triton.jit
    def _layer_norm_fwd_fused(
-        X,    # pointer to the input
-        Y,    # pointer to the output
-        W,    # pointer to the weights
-        B,    # pointer to the biases
-        stride,    # how much to increase the pointer when moving by 1 row
-        N,    # number of columns in X
-        eps,    # epsilon to avoid division by zero
+        X,  # pointer to the input
+        Y,  # pointer to the output
+        W,  # pointer to the weights
+        B,  # pointer to the biases
+        stride,  # how much to increase the pointer when moving by 1 row
+        N,  # number of columns in X
+        eps,  # epsilon to avoid division by zero
        BLOCK_SIZE: tl.constexpr,
    ):
        # Map the program id to the row of X and Y it should compute.
@@ -32,15 +33,15 @@ if HAS_TRITON:
        _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
        for off in range(0, N, BLOCK_SIZE):
            cols = off + tl.arange(0, BLOCK_SIZE)
-            a = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
+            a = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
            _mean += a
        mean = tl.sum(_mean, axis=0) / N
        # Compute variance
        _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
        for off in range(0, N, BLOCK_SIZE):
            cols = off + tl.arange(0, BLOCK_SIZE)
-            x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
-            x = tl.where(cols < N, x - mean, 0.)
+            x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
+            x = tl.where(cols < N, x - mean, 0.0)
            _var += x * x
        var = tl.sum(_var, axis=0) / N
        rstd = 1 / tl.sqrt(var + eps)
@@ -50,7 +51,7 @@ if HAS_TRITON:
            mask = cols < N
            w = tl.load(W + cols, mask=mask)
            b = tl.load(B + cols, mask=mask)
-            x = tl.load(X + cols, mask=mask, other=0.).to(tl.float32)
+            x = tl.load(X + cols, mask=mask, other=0.0).to(tl.float32)
            x_hat = (x - mean) * rstd
            y = x_hat * w + b
            # Write output
@@ -71,13 +72,7 @@ if HAS_TRITON:
        # heuristics for number of warps
        num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
        # enqueue kernel
-        _layer_norm_fwd_fused[(M,)](x_arg,
-                                    y,
-                                    weight,
-                                    bias,
-                                    x_arg.stride(0),
-                                    N,
-                                    eps,
-                                    BLOCK_SIZE=BLOCK_SIZE,
-                                    num_warps=num_warps)
+        _layer_norm_fwd_fused[(M,)](
+            x_arg, y, weight, bias, x_arg.stride(0), N, eps, BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps
+        )
        return y