Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-06 11:32:10 +00:00.
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
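The hunks below are mechanical style fixes: judging by the patterns (double-quote normalization, backslash continuations replaced with parentheses, magic trailing commas), the updated hooks presumably include black and isort. A minimal self-contained sketch of the transformation, using illustrative names rather than the file's own:

from enum import Enum


class ParallelMode(Enum):
    GLOBAL = "global"


host, port, mode = "127.0.0.1", 29500, ParallelMode.GLOBAL

# Old style: single quotes and backslash line continuations.
init_method = f'tcp://[{host}]:{port}'
assert isinstance(mode, ParallelMode), \
    f'expected ParallelMode, got {type(mode)}'

# After black: double quotes and parenthesized continuations, same semantics.
init_method = f"tcp://[{host}]:{port}"
assert isinstance(
    mode, ParallelMode
), f"expected ParallelMode, got {type(mode)}"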
@@ -4,7 +4,6 @@
 import random
 import socket
 from collections import Counter
-from threading import local
 from typing import Union
 
 import numpy as np
@@ -95,8 +94,9 @@ class ParallelContext(metaclass=SingletonMeta):
 
     @staticmethod
     def _check_parallel_mode(parallel_mode: ParallelMode):
-        assert isinstance(parallel_mode, ParallelMode), \
-            f'expected the argument parallel_mode to be of enum ParallelMode, but got {type(parallel_mode)}'
+        assert isinstance(
+            parallel_mode, ParallelMode
+        ), f"expected the argument parallel_mode to be of enum ParallelMode, but got {type(parallel_mode)}"
 
     def get_global_rank(self):
         """Returns the global rank of the current device.
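A side note on the new assert layout (an observation about Python, not part of the commit): black parenthesizes only the condition and leaves the message outside. Parenthesizing the whole condition-message pair instead would build a two-element tuple, which is always truthy, so the assertion could never fail:

# Safe: parentheses wrap the condition only; the message stays outside.
assert isinstance(
    "text", str
), "expected a str"

# Broken: the parentheses create a (condition, message) tuple, which is
# truthy, so this assert silently passes (CPython emits a SyntaxWarning).
assert (isinstance(42, str), "expected a str")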
@@ -239,8 +239,10 @@ class ParallelContext(metaclass=SingletonMeta):
 
     def is_pipeline_last_stage(self, ignore_virtual=False):
         if not ignore_virtual:
-            if self.virtual_pipeline_parallel_size \
-                    is not None and self.virtual_pipeline_parallel_rank != self.virtual_pipeline_parallel_size - 1:
+            if (
+                self.virtual_pipeline_parallel_size is not None
+                and self.virtual_pipeline_parallel_rank != self.virtual_pipeline_parallel_size - 1
+            ):
                 return False
         return self.is_last_rank(ParallelMode.PIPELINE)
 
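For readers outside the codebase: with an interleaved (virtual) pipeline schedule each rank owns several model chunks, and only the final chunk on the final pipeline rank is the true last stage. A standalone sketch of the same logic, with hypothetical names rather than the ColossalAI API:

from typing import Optional


def is_pipeline_last_stage(
    pp_rank: int,
    pp_size: int,
    vpp_rank: int = 0,
    vpp_size: Optional[int] = None,
    ignore_virtual: bool = False,
) -> bool:
    if not ignore_virtual and vpp_size is not None and vpp_rank != vpp_size - 1:
        return False  # an earlier model chunk, even on the last rank
    return pp_rank == pp_size - 1


# 4 pipeline ranks, 2 chunks per rank: only chunk 1 on rank 3 is the last stage.
assert is_pipeline_last_stage(pp_rank=3, pp_size=4, vpp_rank=1, vpp_size=2)
assert not is_pipeline_last_stage(pp_rank=3, pp_size=4, vpp_rank=0, vpp_size=2)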
@@ -371,12 +373,12 @@ class ParallelContext(metaclass=SingletonMeta):
             port (str): the master port for distributed training
         """
         # initialize the default process group
-        init_method = f'tcp://[{host}]:{port}'
+        init_method = f"tcp://[{host}]:{port}"
         dist.init_process_group(rank=rank, world_size=world_size, backend=backend, init_method=init_method)
 
         # None will give the default global process group for pytorch dist operations
         ranks = list(range(world_size))
-        cpu_group = dist.new_group(ranks, backend='gloo') if dist.get_backend() != 'gloo' else None
+        cpu_group = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else None
         self._register_dist(rank, world_size, dist.GroupMember.WORLD, cpu_group, ranks, ParallelMode.GLOBAL)
         self.add_global_rank(ParallelMode.GLOBAL, rank)
 
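The separate gloo group exists because NCCL collectives operate only on CUDA tensors; keeping a gloo group over the same ranks allows CPU-tensor communication (the bracketed host in the init URL follows the convention for IPv6 literals). A minimal sketch of the same pattern, assuming a multi-process launch such as torchrun with the usual environment variables; it will not run as a single process:

import torch.distributed as dist

# Default group on the fast backend (NCCL, for CUDA tensors).
dist.init_process_group(backend="nccl", init_method="env://")

# Secondary gloo group over the same ranks for CPU-tensor collectives,
# mirroring the cpu_group fallback in the hunk above.
ranks = list(range(dist.get_world_size()))
cpu_group = dist.new_group(ranks, backend="gloo")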
@@ -398,10 +400,11 @@ class ParallelContext(metaclass=SingletonMeta):
         pps = self.pipeline_parallel_size
         tps = self.tensor_parallel_size
         ws = self.world_size
-        assert ws == dps * pps * \
-            tps, f"Expected the world size {ws} to be equal to data" \
-                 f" parallel size ({dps}) * pipeline parallel size " \
-                 f"({pps}) * tensor parallel size ({tps})"
+        assert ws == dps * pps * tps, (
+            f"Expected the world size {ws} to be equal to data"
+            f" parallel size ({dps}) * pipeline parallel size "
+            f"({pps}) * tensor parallel size ({tps})"
+        )
 
     def _set_parallel_size_from_config(self, config: dict, key: str, attr_name: str):
         if key in config:
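The assertion encodes the invariant that the three parallel sizes factor the world size; the data parallel size is then derived rather than user-set (see the comment in a later hunk). A worked example in plain arithmetic, not ColossalAI API:

# 32 GPUs with pipeline size 4 and tensor size 2 leave a data size of 4.
world_size, pipeline_parallel_size, tensor_parallel_size = 32, 4, 2
data_parallel_size = world_size // (pipeline_parallel_size * tensor_parallel_size)
assert world_size == data_parallel_size * pipeline_parallel_size * tensor_parallel_size
print(data_parallel_size)  # 4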
@@ -409,10 +412,11 @@ class ParallelContext(metaclass=SingletonMeta):
             if isinstance(ele, int):
                 setattr(self, attr_name, ele)
             elif isinstance(ele, dict):
-                setattr(self, attr_name, ele['size'])
+                setattr(self, attr_name, ele["size"])
             else:
                 raise NotImplementedError(
-                    f'{"Parallel configuration does not support this kind of argument, please use int or dict"}')
+                    f'{"Parallel configuration does not support this kind of argument, please use int or dict"}'
+                )
 
     def init_parallel_groups(self):
         """Initializes the parallel groups.
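_set_parallel_size_from_config accepts each entry either as a plain int or as a dict carrying a "size" key; extra keys such as "mode" are consumed by later hunks. A sketch of both accepted shapes, assuming a ColossalAI-style parallel config (the "1d" mode value is illustrative):

# Both shapes pass the isinstance branches above.
parallel = dict(
    pipeline=2,                      # plain int
    tensor=dict(size=4, mode="1d"),  # dict: "size" is required, "mode" optional
)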
@@ -427,10 +431,10 @@ class ParallelContext(metaclass=SingletonMeta):
         self.world_size = world_size
 
         # set parallel size as attributes for global context
-        parallel_config = self.config.get('parallel', None)
+        parallel_config = self.config.get("parallel", None)
         if parallel_config is not None:
-            self._set_parallel_size_from_config(parallel_config, 'pipeline', 'pipeline_parallel_size')
-            self._set_parallel_size_from_config(parallel_config, 'tensor', 'tensor_parallel_size')
+            self._set_parallel_size_from_config(parallel_config, "pipeline", "pipeline_parallel_size")
+            self._set_parallel_size_from_config(parallel_config, "tensor", "tensor_parallel_size")
 
         # the user should not set the data parallel size manually
         # instead, it should be calculated based on other parallel config
@@ -438,33 +442,33 @@ class ParallelContext(metaclass=SingletonMeta):
 
         # get the tensor parallel mode and check
         tensor_parallel_mode = None
-        if parallel_config is not None and 'tensor' in \
-                parallel_config and 'mode' in parallel_config['tensor']:
-            tensor_parallel_mode = parallel_config['tensor']['mode']
-            assert tensor_parallel_mode in ALLOWED_MODES, \
-                f"mode in the parallel config must be set to one of {ALLOWED_MODES}"
+        if parallel_config is not None and "tensor" in parallel_config and "mode" in parallel_config["tensor"]:
+            tensor_parallel_mode = parallel_config["tensor"]["mode"]
+            assert (
+                tensor_parallel_mode in ALLOWED_MODES
+            ), f"mode in the parallel config must be set to one of {ALLOWED_MODES}"
         env.mode = tensor_parallel_mode
 
         self.check_sanity()
 
         pg_init = []
         # LSG: init data parallel process group for compatibility with other parallel module such as zero
-        pg_init.append(dict(type=INITIALIZER_MAPPING['data']))
+        pg_init.append(dict(type=INITIALIZER_MAPPING["data"]))
 
         # LSG: init model parallel process group for compatibility with amp and clip grad
-        pg_init.append(dict(type=INITIALIZER_MAPPING['model']))
+        pg_init.append(dict(type=INITIALIZER_MAPPING["model"]))
 
         if self.pipeline_parallel_size > 1:
-            pg_init.append(dict(type=INITIALIZER_MAPPING['pipeline']))
-        pg_init.append(dict(type=INITIALIZER_MAPPING['tensor']))
+            pg_init.append(dict(type=INITIALIZER_MAPPING["pipeline"]))
+        pg_init.append(dict(type=INITIALIZER_MAPPING["tensor"]))
 
         # init specific tensor parallel group
         if tensor_parallel_mode is not None:
-            tensor_parallel_cfg = parallel_config['tensor'].copy()
+            tensor_parallel_cfg = parallel_config["tensor"].copy()
 
             # remove duplicate parameters
-            tensor_parallel_cfg.pop('mode')
-            tensor_parallel_cfg.pop('size')
+            tensor_parallel_cfg.pop("mode")
+            tensor_parallel_cfg.pop("size")
 
             # add this config to initialize later
             pg_init.append(dict(type=INITIALIZER_MAPPING[tensor_parallel_mode.lower()], **tensor_parallel_cfg))
@@ -472,11 +476,16 @@ class ParallelContext(metaclass=SingletonMeta):
         # run initialization of different process groups
         for initializer_cfg in pg_init:
             cfg = initializer_cfg.copy()
-            initializer_type = cfg.pop('type')
-            initializer = DIST_GROUP_INITIALIZER.get_module(initializer_type)(rank, world_size, self.config,
-                                                                              self.data_parallel_size,
-                                                                              self.pipeline_parallel_size,
-                                                                              self.tensor_parallel_size, **cfg)
+            initializer_type = cfg.pop("type")
+            initializer = DIST_GROUP_INITIALIZER.get_module(initializer_type)(
+                rank,
+                world_size,
+                self.config,
+                self.data_parallel_size,
+                self.pipeline_parallel_size,
+                self.tensor_parallel_size,
+                **cfg,
+            )
             parallel_setting = initializer.init_dist_group()
             if isinstance(parallel_setting, list):
                 for args in parallel_setting:
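DIST_GROUP_INITIALIZER acts as a registry: the loop pops the "type" key and instantiates the class registered under that name, passing the remaining keys as keyword arguments. A minimal illustrative registry in that spirit (ColossalAI's own implementation differs):

class Registry:
    """Maps names to classes so configs can select implementations by string."""

    def __init__(self):
        self._modules = {}

    def register_module(self, cls):
        self._modules[cls.__name__] = cls
        return cls

    def get_module(self, name):
        return self._modules[name]


DIST_GROUP_INITIALIZER = Registry()


@DIST_GROUP_INITIALIZER.register_module
class Initializer_Data:
    def __init__(self, rank, world_size, **kwargs):
        self.rank, self.world_size = rank, world_size


cfg = dict(type="Initializer_Data")
initializer = DIST_GROUP_INITIALIZER.get_module(cfg.pop("type"))(0, 8)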
@@ -497,8 +506,7 @@ class ParallelContext(metaclass=SingletonMeta):
         return parallel_mode in self._groups
 
     def destroy(self):
-        """Destroys the current distributed parallel environment.
-        """
+        """Destroys the current distributed parallel environment."""
        for mode, group in self._groups.items():
             if mode is not ParallelMode.GLOBAL:
                 dist.destroy_process_group(group)
@@ -519,7 +527,7 @@ class ParallelContext(metaclass=SingletonMeta):
 
         torch.cuda.set_device(device_ordinal)
         if self._verbose:
-            self._logger.info(f'process rank {global_rank} is bound to device {device_ordinal}')
+            self._logger.info(f"process rank {global_rank} is bound to device {device_ordinal}")
 
     def set_seed(self, seed: int):
         """Sets seeds for all random libraries.
@@ -552,21 +560,25 @@ class ParallelContext(metaclass=SingletonMeta):
 
             set_mode(ParallelMode.DATA)
             seeds = get_seeds()
-            seed_str = ', '.join([f'{k}: {v}' for k, v in seeds.items()])
+            seed_str = ", ".join([f"{k}: {v}" for k, v in seeds.items()])
 
             if self._verbose:
-                self._logger.info(f"initialized seed on rank {global_rank}, "
-                                  f"numpy: {seed}, python random: {seed}, {seed_str},"
-                                  f"the default parallel seed is {ParallelMode.DATA}.")
+                self._logger.info(
+                    f"initialized seed on rank {global_rank}, "
+                    f"numpy: {seed}, python random: {seed}, {seed_str},"
+                    f"the default parallel seed is {ParallelMode.DATA}."
+                )
         else:
             if self._verbose:
                 self._logger.info(
                     f"initialized seed on rank {global_rank}, "
                     f"numpy: {seed}, python random: {seed}, pytorch: {seed}",
-                    ranks=[0])
+                    ranks=[0],
+                )
                 self._logger.info(
-                    'WARNING: CUDA is not available, thus CUDA RNG cannot be used to track CUDA random number states',
-                    ranks=[0])
+                    "WARNING: CUDA is not available, thus CUDA RNG cannot be used to track CUDA random number states",
+                    ranks=[0],
+                )
 
     def set_virtual_pipeline_parallel_size(self, size):
         self.virtual_pipeline_parallel_size = size
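As the log messages suggest, "all random libraries" here covers Python's random, NumPy, and PyTorch. A standalone sketch of that kind of seeding (illustrative only; the ColossalAI method additionally tracks per-parallel-mode CUDA seeds via get_seeds and set_mode):

import random

import numpy as np
import torch


def set_seed(seed: int) -> None:
    random.seed(seed)        # python random
    np.random.seed(seed)     # numpy
    torch.manual_seed(seed)  # pytorch CPU RNG
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # pytorch CUDA RNGs on all devices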