Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-10 21:40:02 +00:00)
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
@@ -1,17 +1,12 @@
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F

from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.logging import get_dist_logger

from .cross_entropy import vocab_cross_entropy


class BertLoss(nn.Module):

    def forward(self, lm_loss, sop_logits, loss_mask, sentence_order):
        lm_loss_ = lm_loss.float()
        loss_mask = loss_mask.float()
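The hunk ends just inside BertLoss.forward. For orientation, here is a minimal sketch of how a masked-LM loss and a sentence-order-prediction (SOP) loss are typically combined from these four inputs; everything past the two .float() casts is an assumption for illustration, not the file's actual continuation.

import torch
import torch.nn.functional as F

def bert_style_loss(lm_loss, sop_logits, loss_mask, sentence_order):
    # Average the per-token LM loss over the positions selected by loss_mask.
    lm_loss_ = lm_loss.float()
    loss_mask = loss_mask.float()
    lm_loss_ = torch.sum(lm_loss_.view(-1) * loss_mask.view(-1)) / loss_mask.sum()
    # SOP is a plain 2-way cross entropy over the pooled logits (assumed layout: [batch, 2]).
    sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(), sentence_order.view(-1), ignore_index=-1)
    return lm_loss_ + sop_loss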
@@ -1,11 +1,8 @@
import torch
from torch.cuda.amp import custom_bwd, custom_fwd

from colossalai.legacy.context.parallel_mode import ParallelMode


class _VocabCrossEntropy(torch.autograd.Function):

    @staticmethod
    @custom_fwd
    def forward(ctx, vocab_parallel_logits, target):
@@ -59,7 +56,7 @@ class _VocabCrossEntropy(torch.autograd.Function):

        # Add the gradient from matching classes.
        arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device)
-        grad_2d[arange_1d, masked_target_1d] -= (1.0 - target_mask.view(-1).float())
+        grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float()

        # Finally elementwise multiplication with the output gradients.
        grad_input.mul_(grad_output.unsqueeze(dim=-1))
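The changed line above is the usual cross-entropy gradient, softmax(logits) minus a one-hot at the target index; target_mask keeps the subtraction from being applied when the target id falls outside this rank's slice of the vocabulary. A small single-process check of that identity (toy shapes, no tensor parallelism; names here are illustrative only):

import torch

logits = torch.randn(4, 10, requires_grad=True)
target = torch.randint(0, 10, (4,))

loss = torch.nn.functional.cross_entropy(logits, target, reduction="sum")
loss.backward()

# Manual gradient: softmax over the vocab dimension, minus 1.0 at each target index.
manual = torch.softmax(logits.detach(), dim=-1)
manual[torch.arange(4), target] -= 1.0

print(torch.allclose(logits.grad, manual, atol=1e-6))  # expected: True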
@@ -1,11 +1,9 @@
import torch


def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
-    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
-        numerator, denominator)
+    assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator)


def divide(numerator, denominator):
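A quick illustration of ensure_divisibility as defined above (numbers are hypothetical); divide, continued in the next hunk, presumably performs this check before returning the integer quotient:

ensure_divisibility(30522, 2)   # passes silently: 30522 % 2 == 0
ensure_divisibility(7, 3)       # raises AssertionError: "7 is not divisible by 3"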
@@ -15,8 +13,7 @@ def divide(numerator, denominator):
    return numerator // denominator


-def split_tensor_along_last_dim(tensor, num_partitions,
-                                contiguous_split_chunks=False):
+def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
    """Split a tensor along its last dimension.
    Arguments:
        tensor: input tensor.
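The hunk cuts off inside the docstring. For reference, a minimal sketch of what such a helper conventionally does (chunk the last dimension with torch.split, optionally forcing contiguous chunks); the body and the name split_last_dim below are assumptions for illustration, not necessarily the file's exact implementation:

import torch

def split_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
    # Per-chunk size along the last dimension; assumes it divides evenly.
    last_dim = tensor.dim() - 1
    last_dim_size = tensor.size()[last_dim] // num_partitions
    # torch.split returns views into the original storage.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    if contiguous_split_chunks:
        # Copy each chunk into its own contiguous memory if requested.
        return tuple(chunk.contiguous() for chunk in tensor_list)
    return tensor_list

x = torch.arange(12.0).reshape(2, 6)
a, b, c = split_last_dim(x, 3)
print(a.shape)  # torch.Size([2, 2])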
@@ -38,12 +35,11 @@ def split_tensor_along_last_dim(tensor, num_partitions,

class VocabUtility:
    """Split the vocabulary into `world_size` chunks amd return the
-        first and last index of the vocabulary belonging to the `rank`
-        partition: Note that indices in [fist, last)"""
+    first and last index of the vocabulary belonging to the `rank`
+    partition: Note that indices in [fist, last)"""

    @staticmethod
-    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
-                                                  rank, world_size):
+    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size):
        index_f = rank * per_partition_vocab_size
        index_l = index_f + per_partition_vocab_size
        return index_f, index_l
@@ -51,5 +47,4 @@ class VocabUtility:
    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
        per_partition_vocab_size = divide(global_vocab_size, world_size)
-        return VocabUtility.vocab_range_from_per_partition_vocab_size(
-            per_partition_vocab_size, rank, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size)
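A quick worked example of the two class methods above (hypothetical sizes, with VocabUtility and divide in scope): a 50304-token vocabulary split across 8 tensor-parallel ranks gives each rank 6288 consecutive token ids.

global_vocab_size, world_size = 50304, 8  # hypothetical sizes for illustration
for rank in range(world_size):
    first, last = VocabUtility.vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size)
    # Each rank owns the half-open id range [first, last).
    print(rank, first, last)  # rank 0 -> 0 6288, rank 1 -> 6288 12576, ...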