Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-10 05:20:33 +00:00
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
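For context, the kind of .pre-commit-config.yaml change the commit message describes might look like the sketch below. The config file itself is not shown in this diff, so the hook repositories, revisions, and the cuda exclusion pattern are all assumptions rather than the repository's actual settings:

# Hypothetical sketch only: hook revs and the exclude pattern are assumptions.
repos:
  - repo: https://github.com/psf/black
    rev: 23.9.1
    hooks:
      - id: black
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v16.0.6
    hooks:
      - id: clang-format
        # "[misc] ignore cuda for clang-format": skip CUDA sources
        exclude: \.(cu|cuh)$

The Python hunks below are the mechanical result of running such hooks over the tree: quote normalization, import collapsing, and signature reflowing.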
colossalai/pipeline/schedule/__init__.py
@@ -3,7 +3,7 @@ from .interleaved_pp import InterleavedSchedule
 from .one_f_one_b import OneForwardOneBackwardSchedule

 __all__ = [
-    'PipelineSchedule',
-    'OneForwardOneBackwardSchedule',
-    'InterleavedSchedule',
+    "PipelineSchedule",
+    "OneForwardOneBackwardSchedule",
+    "InterleavedSchedule",
 ]
colossalai/pipeline/schedule/_utils.py
@@ -4,24 +4,15 @@ from typing import Any, List, Optional, Tuple
 import torch
 import torch.cuda
 from torch.nn import Module
-from torch.utils._pytree import (
-    SUPPORTED_NODES,
-    LeafSpec,
-    TreeSpec,
-    _is_leaf,
-    _register_pytree_node,
-    tree_flatten,
-    tree_map,
-    tree_unflatten,
-)
+from torch.utils._pytree import SUPPORTED_NODES, TreeSpec, _register_pytree_node, tree_flatten, tree_map, tree_unflatten


 # this registration is for torch under version 1.13.1 and may be removed in the future
-def _odict_flatten(d: 'OrderedDict[Any, Any]') -> Tuple[List[Any], Any]:
+def _odict_flatten(d: "OrderedDict[Any, Any]") -> Tuple[List[Any], Any]:
     return list(d.values()), list(d.keys())


-def _odict_unflatten(values: List[Any], context: Any) -> 'OrderedDict[Any, Any]':
+def _odict_unflatten(values: List[Any], context: Any) -> "OrderedDict[Any, Any]":
     return OrderedDict((key, value) for key, value in zip(context, values))


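As an aside, a minimal sketch of how the two helpers above are wired up, using the private _register_pytree_node API that the collapsed import pulls in. The registration call is implied by the comment but not shown in this hunk, so treat it as an illustration for torch versions below 1.13.1; newer releases register OrderedDict themselves.

# Sketch: register OrderedDict as a pytree node on old torch (< 1.13.1).
from collections import OrderedDict
from typing import Any, List, Tuple

from torch.utils._pytree import _register_pytree_node, tree_flatten, tree_unflatten


def _odict_flatten(d: "OrderedDict[Any, Any]") -> Tuple[List[Any], Any]:
    # children are the values; the keys become the reconstruction context
    return list(d.values()), list(d.keys())


def _odict_unflatten(values: List[Any], context: Any) -> "OrderedDict[Any, Any]":
    # zip the saved keys back onto the flattened values
    return OrderedDict((key, value) for key, value in zip(context, values))


_register_pytree_node(OrderedDict, _odict_flatten, _odict_unflatten)

# round-trip check
od = OrderedDict(a=1, b=2)
leaves, spec = tree_flatten(od)  # leaves == [1, 2]
assert tree_unflatten(leaves, spec) == od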
@@ -45,7 +36,7 @@ def tree_flatten_hf(pytree: Any) -> Tuple[List[Any], TreeSpec]:

     # Recursively flatten the children
     result: List[Any] = []
-    children_specs: List['TreeSpec'] = []
+    children_specs: List["TreeSpec"] = []
     for child in child_pytrees:
         flat, child_spec = tree_flatten_hf(child)
         result += flat
@@ -87,7 +78,7 @@ def get_batch_size(batch: Any) -> int:
     for data in data_list:
         if isinstance(data, torch.Tensor):
             return data.size(0)
-    raise RuntimeError('No tensor found in the batch')
+    raise RuntimeError("No tensor found in the batch")


 def get_micro_batch(batch: Any, start: int, micro_batch_size: int) -> Any:
@@ -104,7 +95,7 @@ def get_micro_batch(batch: Any, start: int, micro_batch_size: int) -> Any:

     def _get_tensor_slice(x: Any):
         if isinstance(x, torch.Tensor):
-            return x[start:start + micro_batch_size]
+            return x[start : start + micro_batch_size]
         return x

     return tree_map(_get_tensor_slice, batch)
@@ -175,7 +166,7 @@ def merge_batch(data: List[Any], batch_size_dim=0) -> Any:

     for elem_batch in zip(*flattened_data):
         if isinstance(elem_batch[0], torch.Tensor):
-            if len(elem_batch[0].shape) == 0: # set loss to None in pipeline outputs
+            if len(elem_batch[0].shape) == 0:  # set loss to None in pipeline outputs
                 merged_data.append(None)
             else:
                 merged_data.append(torch.cat(elem_batch, dim=batch_size_dim))
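Taken together, the three utilities touched above implement microbatch slicing and re-assembly over arbitrary pytree batches. A small usage sketch follows; the module path and the unflattening behavior of merge_batch are inferred from this diff, not shown in it:

# Sketch: slice a pytree batch into microbatches and merge outputs back.
import torch

from colossalai.pipeline.schedule._utils import get_batch_size, get_micro_batch, merge_batch

batch = {"input_ids": torch.randn(8, 16), "labels": torch.randn(8)}
assert get_batch_size(batch) == 8  # dim 0 of the first tensor leaf

micro = get_micro_batch(batch, start=0, micro_batch_size=2)
assert micro["input_ids"].shape == (2, 16)  # every tensor leaf is sliced

# assuming merge_batch unflattens back to the input structure:
merged = merge_batch([{"h": torch.randn(2, 4)}, {"h": torch.randn(2, 4)}])
assert merged["h"].shape == (4, 4)  # concatenated along batch_size_dim=0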
colossalai/pipeline/schedule/base.py
@@ -8,17 +8,18 @@ from colossalai.pipeline.stage_manager import PipelineStageManager


 class PipelineSchedule:
-
     def __init__(self, stage_manager: PipelineStageManager) -> None:
         self.stage_manager = stage_manager

-    def forward_backward_step(self,
-                              model: Module,
-                              data_iter: Iterable,
-                              criterion: Callable[[Any, Any], Tensor],
-                              optimizer: Optional[OptimizerWrapper] = None,
-                              return_loss: bool = False,
-                              return_outputs: bool = False) -> dict:
+    def forward_backward_step(
+        self,
+        model: Module,
+        data_iter: Iterable,
+        criterion: Callable[[Any, Any], Tensor],
+        optimizer: Optional[OptimizerWrapper] = None,
+        return_loss: bool = False,
+        return_outputs: bool = False,
+    ) -> dict:
         """Forward and backward step for pipeline training.

         Args:
colossalai/pipeline/schedule/interleaved_pp.py
@@ -16,11 +16,11 @@ from .base import PipelineSchedule


 class InterleavedSchedule(PipelineSchedule):
-
     def __init__(self, num_microbatches: int, num_model_chunks: int, stage_manager: PipelineStageManager) -> None:
         self.num_model_chunks = num_model_chunks
-        assert num_microbatches % self.num_model_chunks == 0, \
-            "Number of microbatches should be an integer multiple of number of model chunks"
+        assert (
+            num_microbatches % self.num_model_chunks == 0
+        ), "Number of microbatches should be an integer multiple of number of model chunks"
         super().__init__(stage_manager)
         self.comm = PipelineP2PCommunication(stage_manager)
         self.num_microbatches = num_microbatches
@@ -42,8 +42,7 @@ class InterleavedSchedule(PipelineSchedule):
         self.batch = batch
         self.batch_size = get_batch_size(batch)
         self.microbatch_offset = [0 for _ in range(self.num_model_chunks)]
-        assert self.batch_size % self.num_microbatches == 0, \
-            "Batch size should be divisible by the number of microbatches"
+        assert self.batch_size % self.num_microbatches == 0, "Batch size should be divisible by the number of microbatches"
         self.microbatch_size = self.batch_size // self.num_microbatches

     def load_micro_batch(self, model_chunk_id: int) -> Any:
@@ -72,7 +71,7 @@ class InterleavedSchedule(PipelineSchedule):
         microbatch_id_in_group = (microbatch_id) % (self.stage_manager.num_stages * self.num_model_chunks)
         model_chunk_id = microbatch_id_in_group // self.stage_manager.num_stages
         if not forward:
-            model_chunk_id = (self.num_model_chunks - model_chunk_id - 1)
+            model_chunk_id = self.num_model_chunks - model_chunk_id - 1
         return model_chunk_id

     def is_first_stage(self, model_chunk_id: int) -> bool:
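The hunk above is where the interleaved schedule maps a microbatch index to a model chunk; a standalone worked example of that mapping:

# Standalone mirror of get_model_chunk_id's arithmetic (logic copied from the hunk above).
def get_model_chunk_id(microbatch_id: int, num_stages: int, num_model_chunks: int, forward: bool) -> int:
    microbatch_id_in_group = microbatch_id % (num_stages * num_model_chunks)
    model_chunk_id = microbatch_id_in_group // num_stages
    if not forward:
        # the backward pass visits the chunks in reverse order
        model_chunk_id = num_model_chunks - model_chunk_id - 1
    return model_chunk_id


# with 2 stages and 2 chunks, each group of 4 microbatches splits evenly:
assert [get_model_chunk_id(i, 2, 2, forward=True) for i in range(4)] == [0, 0, 1, 1]
assert [get_model_chunk_id(i, 2, 2, forward=False) for i in range(4)] == [1, 1, 0, 0]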
@@ -161,13 +160,15 @@ class InterleavedSchedule(PipelineSchedule):
         if not self.is_first_stage(model_chunk_id):
             self.comm.send_backward(input_object, prev_rank)

-    def forward_step(self,
-                     model_chunk: Module,
-                     model_chunk_id: int,
-                     input_obj: Optional[dict],
-                     criterion: Callable,
-                     accum_loss: Optional[torch.Tensor] = None,
-                     outputs: Optional[List[Any]] = None) -> Union[torch.Tensor, dict]:
+    def forward_step(
+        self,
+        model_chunk: Module,
+        model_chunk_id: int,
+        input_obj: Optional[dict],
+        criterion: Callable,
+        accum_loss: Optional[torch.Tensor] = None,
+        outputs: Optional[List[Any]] = None,
+    ) -> Union[torch.Tensor, dict]:
         """Forward one step of the pipeline
         Args:
             model (Module): Model Chunk to be run
@@ -195,8 +196,13 @@ class InterleavedSchedule(PipelineSchedule):
         else:
             return output_obj

-    def backward_step(self, optimizer: OptimizerWrapper, input_obj: Optional[dict],
-                      output_obj: Union[dict, torch.Tensor], output_obj_grad: Optional[dict]) -> Optional[dict]:
+    def backward_step(
+        self,
+        optimizer: OptimizerWrapper,
+        input_obj: Optional[dict],
+        output_obj: Union[dict, torch.Tensor],
+        output_obj_grad: Optional[dict],
+    ) -> Optional[dict]:
         """Backward one step of the pipeline

         Args:
@@ -235,13 +241,15 @@ class InterleavedSchedule(PipelineSchedule):
                 input_obj_grad[k] = v.grad
         return input_obj_grad

-    def forward_backward_step(self,
-                              model_chunk: Module,
-                              data_iter: Iterable,
-                              criterion: Callable[..., Any],
-                              optimizer: Optional[OptimizerWrapper] = None,
-                              return_loss: bool = False,
-                              return_outputs: bool = False) -> dict:
+    def forward_backward_step(
+        self,
+        model_chunk: Module,
+        data_iter: Iterable,
+        criterion: Callable[..., Any],
+        optimizer: Optional[OptimizerWrapper] = None,
+        return_loss: bool = False,
+        return_outputs: bool = False,
+    ) -> dict:
         """Runs interleaved 1F1B schedule, with communication between pipeline stages.

         Args:
@@ -321,7 +329,7 @@ class InterleavedSchedule(PipelineSchedule):
         # Run 1F1B in steady state.
         for i in range(num_microbatches_remaining):
             model_chunk_id = self.get_model_chunk_id(i + num_warmup_microbatches, forward=True)
-            last_iteration = (i == (num_microbatches_remaining - 1))
+            last_iteration = i == (num_microbatches_remaining - 1)

             output_obj = self.forward_step(model_chunk, model_chunk_id, input_obj, criterion, accum_loss, outputs)
             if forward_only:
@@ -369,4 +377,4 @@ class InterleavedSchedule(PipelineSchedule):

         if outputs is not None:
             outputs = merge_batch(outputs)
-        return {'loss': accum_loss, 'outputs': outputs}
+        return {"loss": accum_loss, "outputs": outputs}
colossalai/pipeline/schedule/one_f_one_b.py
@@ -25,11 +25,12 @@ from .base import PipelineSchedule


 class OneForwardOneBackwardSchedule(PipelineSchedule):
-
-    def __init__(self,
-                 stage_manager: PipelineStageManager,
-                 num_microbatches: Optional[int] = None,
-                 microbatch_size: Optional[int] = None) -> None:
+    def __init__(
+        self,
+        stage_manager: PipelineStageManager,
+        num_microbatches: Optional[int] = None,
+        microbatch_size: Optional[int] = None,
+    ) -> None:
         """1F1B pipeline schedule.

         Args:
@@ -38,8 +39,9 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
             microbatch_size (Optional[int], optional): Microbatch size. If num_microbatches is provided, this will be ignored. Defaults to None.
         """
         super().__init__(stage_manager)
-        assert num_microbatches is not None or microbatch_size is not None, \
-            "Either num_microbatches or microbatch_size should be provided"
+        assert (
+            num_microbatches is not None or microbatch_size is not None
+        ), "Either num_microbatches or microbatch_size should be provided"
         self.comm = PipelineP2PCommunication(stage_manager)
         self.num_microbatches = num_microbatches
         self.microbatch_size = microbatch_size
@@ -62,12 +64,12 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
         self.batch_size = get_batch_size(batch)
         self.microbatch_offset = 0
         if not self._use_microbatch_size:
-            assert self.batch_size % self.num_microbatches == 0, \
-                "Batch size should be divisible by the number of microbatches"
+            assert (
+                self.batch_size % self.num_microbatches == 0
+            ), "Batch size should be divisible by the number of microbatches"
             self.microbatch_size = self.batch_size // self.num_microbatches
         else:
-            assert self.batch_size % self.microbatch_size == 0, \
-                "Batch size should be divisible by the microbatch size"
+            assert self.batch_size % self.microbatch_size == 0, "Batch size should be divisible by the microbatch size"
             self.num_microbatches = self.batch_size // self.microbatch_size

     def load_micro_batch(self) -> Any:
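The two branches above are simple integer arithmetic; a quick standalone check of both directions:

# Derive microbatch_size from num_microbatches, or the reverse.
batch_size = 8

num_microbatches = 4
assert batch_size % num_microbatches == 0, "Batch size should be divisible by the number of microbatches"
microbatch_size = batch_size // num_microbatches  # -> 2

microbatch_size = 2
assert batch_size % microbatch_size == 0, "Batch size should be divisible by the microbatch size"
num_microbatches = batch_size // microbatch_size  # -> 4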
@@ -136,12 +138,14 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
         if not self.stage_manager.is_first_stage():
             self.comm.send_backward(input_object, prev_rank)

-    def forward_step(self,
-                     model: Module,
-                     input_obj: Optional[dict],
-                     criterion: Callable,
-                     accum_loss: Optional[torch.Tensor] = None,
-                     outputs: Optional[List[Any]] = None) -> Union[torch.Tensor, dict]:
+    def forward_step(
+        self,
+        model: Module,
+        input_obj: Optional[dict],
+        criterion: Callable,
+        accum_loss: Optional[torch.Tensor] = None,
+        outputs: Optional[List[Any]] = None,
+    ) -> Union[torch.Tensor, dict]:
         """Forward one step of the pipeline

         Args:
@@ -159,7 +163,6 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
         # for a non-first stage, input_obj is the output of the previous stage and must be a dict
         output_obj = model_forward(model, micro_batch, input_obj)
         if self.stage_manager.is_last_stage():
-
             loss = criterion(output_obj, micro_batch) / self.num_microbatches
             if accum_loss is not None:
                 accum_loss.add_(loss.detach())
@@ -169,8 +172,13 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
         else:
             return output_obj

-    def backward_step(self, optimizer: OptimizerWrapper, input_obj: Optional[dict],
-                      output_obj: Union[dict, torch.Tensor], output_obj_grad: Optional[dict]) -> Optional[dict]:
+    def backward_step(
+        self,
+        optimizer: OptimizerWrapper,
+        input_obj: Optional[dict],
+        output_obj: Union[dict, torch.Tensor],
+        output_obj_grad: Optional[dict],
+    ) -> Optional[dict]:
         """Backward one step of the pipeline

         Args:
@@ -208,13 +216,15 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
                 input_obj_grad[k] = v.grad
         return input_obj_grad

-    def forward_backward_step(self,
-                              model: Module,
-                              data_iter: Iterable,
-                              criterion: Callable[..., Any],
-                              optimizer: Optional[OptimizerWrapper] = None,
-                              return_loss: bool = False,
-                              return_outputs: bool = False) -> dict:
+    def forward_backward_step(
+        self,
+        model: Module,
+        data_iter: Iterable,
+        criterion: Callable[..., Any],
+        optimizer: Optional[OptimizerWrapper] = None,
+        return_loss: bool = False,
+        return_outputs: bool = False,
+    ) -> dict:
         """Runs non-interleaved 1F1B schedule, with communication between pipeline stages.

         Args:
@@ -273,7 +283,7 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):

         # Run 1F1B in steady state.
         for i in range(num_microbatches_remaining):
-            last_iteration = (i == (num_microbatches_remaining - 1))
+            last_iteration = i == (num_microbatches_remaining - 1)

             output_obj = self.forward_step(model, input_obj, criterion, accum_loss, outputs)
             if forward_only:
@@ -316,5 +326,5 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
         if outputs is not None:
             if isinstance(model, ModelWrapper):
                 model = model.unwrap()
-            outputs = merge_batch(outputs, getattr(model, 'batch_size_dim', 0))
-        return {'loss': accum_loss, 'outputs': outputs}
+            outputs = merge_batch(outputs, getattr(model, "batch_size_dim", 0))
+        return {"loss": accum_loss, "outputs": outputs}
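Finally, a hedged sketch of how the reformatted forward_backward_step is typically driven. The distributed setup that produces real instances is omitted: stage_manager, model, dataloader, and optimizer below are placeholders, not objects constructed anywhere in this diff.

# Sketch only: assumes an already-initialized pipeline context.
from colossalai.pipeline.schedule import OneForwardOneBackwardSchedule

schedule = OneForwardOneBackwardSchedule(stage_manager, num_microbatches=4)

result = schedule.forward_backward_step(
    model,
    iter(dataloader),
    criterion=lambda outputs, inputs: outputs["loss"],  # last stage computes the loss
    optimizer=optimizer,
    return_loss=True,
    return_outputs=True,
)
# result["loss"]: loss accumulated over all microbatches (last stage only);
# result["outputs"]: merged per-microbatch outputs, with scalar entries set
# to None by merge_batch as noted in the _utils hunk above.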