[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format
2025-09-04 18:40:28 +00:00 · 2023-09-19 14:20:26 +08:00
parent 3c6b831c26
commit 079bf3cb26
1268 changed files with 50037 additions and 38444 deletions
--- a/colossalai/zero/gemini/placement_policy.py
+++ b/colossalai/zero/gemini/placement_policy.py
@@ -17,10 +17,9 @@ from .memory_tracer import ChunkMemStatsCollector
 class PlacementPolicy(ABC):
    need_mem_stats: bool = False

-    def __init__(self,
-                 chunk_manager: ChunkManager,
-                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
-                 **kwargs) -> None:
+    def __init__(
+        self, chunk_manager: ChunkManager, mem_stats_collector: Optional[ChunkMemStatsCollector] = None, **kwargs
+    ) -> None:
        self.chunk_manager = chunk_manager
        self.mem_stats_collector: Optional[ChunkMemStatsCollector] = mem_stats_collector

@@ -29,23 +28,25 @@ class PlacementPolicy(ABC):
        raise NotImplementedError

    @abstractmethod
-    def setup_grads_device(self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor,
-                                                                                    torch.device]) -> None:
+    def setup_grads_device(
+        self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor, torch.device]
+    ) -> None:
        raise NotImplementedError


 class StaticPlacementPolicy(PlacementPolicy):
-
-    def __init__(self,
-                 chunk_manager: ChunkManager,
-                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
-                 shard_param_frac: float = 1.0,
-                 offload_optim_frac: float = 0.0,
-                 offload_param_frac: float = 0.0,
-                 **kwargs) -> None:
+    def __init__(
+        self,
+        chunk_manager: ChunkManager,
+        mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
+        shard_param_frac: float = 1.0,
+        offload_optim_frac: float = 0.0,
+        offload_param_frac: float = 0.0,
+        **kwargs,
+    ) -> None:
        super().__init__(chunk_manager, mem_stats_collector=mem_stats_collector)
        if offload_param_frac > 0.0 and (shard_param_frac != 1.0 or offload_optim_frac != 1.0):
-            warnings.warn('offload_param_frac is ignored when shard_param_frac != 1.0 or offload_optim_frac != 1.0')
+            warnings.warn("offload_param_frac is ignored when shard_param_frac != 1.0 or offload_optim_frac != 1.0")
            offload_param_frac = 0.0
        self.shard_param_frac = shard_param_frac
        self.offload_optim_frac = offload_optim_frac
@@ -66,13 +67,14 @@ class StaticPlacementPolicy(PlacementPolicy):
        for chunk in can_evict_chunks:
            if can_offload_chunk_mem <= self.keep_cuda_chunk_mem:
                break
-            self.chunk_manager.move_chunk(chunk, torch.device('cpu'))
+            self.chunk_manager.move_chunk(chunk, torch.device("cpu"))
            # real saved mem is shard_mem, for simplicity we use chunk_mem
            can_offload_chunk_mem -= chunk.chunk_mem
        return 0, 0.0

-    def setup_grads_device(self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor,
-                                                                                    torch.device]) -> None:
+    def setup_grads_device(
+        self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor, torch.device]
+    ) -> None:
        total_chunk_mem = sum(self.chunk_manager.get_chunk(p).chunk_mem for p in params)

        offload_optim_chunk_mem = total_chunk_mem * self.offload_optim_frac
@@ -85,7 +87,7 @@ class StaticPlacementPolicy(PlacementPolicy):
            if chunk.keep_gathered or offloaded_optim_chunk_mem >= offload_optim_chunk_mem:
                device = get_current_device()
            else:
-                device = torch.device('cpu')
+                device = torch.device("cpu")
                # real offloaded mem is chunk.shard_mem, for simplicity we use chunk mem here
                offloaded_optim_chunk_mem += chunk.chunk_mem
            for p in params:
@@ -97,12 +99,14 @@ class StaticPlacementPolicy(PlacementPolicy):
 class AutoPlacementPolicy(PlacementPolicy):
    need_mem_stats: bool = True

-    def __init__(self,
-                 chunk_manager: ChunkManager,
-                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
-                 warmup_non_model_data_ratio: float = 0.8,
-                 steady_cuda_cap_ratio: float = 0.9,
-                 **kwargs) -> None:
+    def __init__(
+        self,
+        chunk_manager: ChunkManager,
+        mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
+        warmup_non_model_data_ratio: float = 0.8,
+        steady_cuda_cap_ratio: float = 0.9,
+        **kwargs,
+    ) -> None:
        super().__init__(chunk_manager, mem_stats_collector=mem_stats_collector)
        # model data will use 1-_warmup_non_model_data_ratio CUDA memory in warmup phase
        # you can set them by AutoPlacementPolicy.set_warmup_non_model_data_ratio()
@@ -110,13 +114,15 @@ class AutoPlacementPolicy(PlacementPolicy):
        self._warmup_non_model_data_ratio = warmup_non_model_data_ratio
        self._steady_cuda_cap_ratio = steady_cuda_cap_ratio

-    def evict_tensors(self,
-                      can_evict_chunks: List[Chunk],
-                      cuda_demand: int = 0,
-                      warmup: bool = True,
-                      compute_list: Optional[List[Tuple[Chunk, ...]]] = None,
-                      compute_idx: int = 0,
-                      **kwargs) -> Tuple[int, float]:
+    def evict_tensors(
+        self,
+        can_evict_chunks: List[Chunk],
+        cuda_demand: int = 0,
+        warmup: bool = True,
+        compute_list: Optional[List[Tuple[Chunk, ...]]] = None,
+        compute_idx: int = 0,
+        **kwargs,
+    ) -> Tuple[int, float]:
        """
        Evict tensors from CUDA device.

@@ -135,13 +141,13 @@ class AutoPlacementPolicy(PlacementPolicy):
        """
        start = time()
        cuda_capacity = colo_device_memory_capacity(get_current_device())
-        used_cuda_model_data = self.chunk_manager.total_mem['cuda']
+        used_cuda_model_data = self.chunk_manager.total_mem["cuda"]
        if warmup:
            # We designate a part of CUDA memory for model data in warmup iterations.
            max_cuda_non_model_data_per_period = cuda_capacity * self._warmup_non_model_data_ratio
        else:
            # max non-model-data cuda memory consumption of this sampling moment and the next sampling moment.
-            max_cuda_non_model_data_per_period = self.mem_stats_collector.next_period_non_model_data_usage('cuda')
+            max_cuda_non_model_data_per_period = self.mem_stats_collector.next_period_non_model_data_usage("cuda")
            cuda_capacity *= self._steady_cuda_cap_ratio
        total_cuda_model_data = cuda_capacity - max_cuda_non_model_data_per_period
        avail_cuda_model_data = total_cuda_model_data - used_cuda_model_data
@@ -160,11 +166,13 @@ class AutoPlacementPolicy(PlacementPolicy):
                    break

                self.chunk_manager.release_chunk(chunk)
-                self.chunk_manager.move_chunk(chunk, torch.device('cpu'))
+                self.chunk_manager.move_chunk(chunk, torch.device("cpu"))
                freed_cuda_model_data += chunk.chunk_mem
            if freed_cuda_model_data < to_free_cuda_model_data:
-                raise RuntimeError(f"Adjust layout failed! No enough CUDA memory! "
-                                   f"Need {to_free_cuda_model_data}, freed {freed_cuda_model_data}")
+                raise RuntimeError(
+                    f"Adjust layout failed! No enough CUDA memory! "
+                    f"Need {to_free_cuda_model_data}, freed {freed_cuda_model_data}"
+                )
        return freed_cuda_model_data, time() - start

    @staticmethod
@@ -178,8 +186,9 @@ class AutoPlacementPolicy(PlacementPolicy):
        next_compute_idx = sorted(next_compute_idx.items(), key=lambda pair: pair[1], reverse=True)
        return [t for (t, idx) in next_compute_idx]

-    def setup_grads_device(self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor,
-                                                                                    torch.device]) -> None:
+    def setup_grads_device(
+        self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor, torch.device]
+    ) -> None:
        for p in params:
            chunk = self.chunk_manager.get_chunk(p)
            # init offload optim settings
@@ -187,13 +196,13 @@ class AutoPlacementPolicy(PlacementPolicy):
            if chunk.keep_gathered:
                grads_device_map[p] = get_current_device()
            else:
-                grads_device_map[p] = torch.device('cpu')
+                grads_device_map[p] = torch.device("cpu")


 class PlacementPolicyFactory:
    policies: Dict[str, Type[PlacementPolicy]] = {
-        'auto': AutoPlacementPolicy,
-        'static': StaticPlacementPolicy,
+        "auto": AutoPlacementPolicy,
+        "static": StaticPlacementPolicy,
    }

    @staticmethod