[legacy] move engine to legacy (#4560)

* [legacy] move engine to legacy * [example] fix seq parallel example * [example] fix seq parallel example * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [example] update seq parallel requirements
2025-09-08 04:24:47 +00:00 · 2023-09-04 11:33:40 +08:00
parent 89fe027787
commit 8accecd55b
39 changed files with 93 additions and 105 deletions
--- a/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py
+++ b/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py
@@ -0,0 +1,291 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from typing import Any, Iterable, Tuple, Union
+
+import torch.nn as nn
+from torch import Tensor
+from torch.nn.parallel.distributed import DistributedDataParallel
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.utils.data import DataLoader
+
+from colossalai.legacy.engine import BaseGradientHandler
+from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.utils import conditional_context
+
+
+class GradAccumOptimizer(ColossalaiOptimizer):
+    """Optimizer wrapper that postpones parameter updates until enough backward passes were accumulated.
+
+    Every call to :meth:`backward` or :meth:`backward_by_grad` advances an internal counter.
+    :meth:`step` and :meth:`clip_grad_norm` do nothing until that counter reaches ``accumulate_size``;
+    :meth:`step` then resets the counter. :meth:`zero_grad` only clears gradients while the counter is zero.
+
+    Args:
+        optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
+        accumulate_size (int): The number of steps to accumulate gradients.
+        model (:class:`torch.nn.Module`):
+            Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
+    """
+
+    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
+        super().__init__(optim)
+
+        # number of backward passes recorded since the last real optimizer step
+        self.accumulate_step = 0
+        self.accumulate_size = accumulate_size
+
+        # pytorch ddp all-reduces gradients automatically, so remember whether the model is
+        # DDP-wrapped in order to suppress that on intermediate accumulation steps
+        self.model = model
+        self.is_torch_ddp = isinstance(self.model, DistributedDataParallel)
+
+    def zero_grad(self, *args, **kwargs) -> None:
+        """
+        Reset the gradients, but only at the start of an accumulation cycle.
+
+        Args:
+            *args: positional arguments for the optimizer wrapped
+            **kwargs: keyword arguments for the optimizer wrapped
+        """
+
+        if self.accumulate_step != 0:
+            # a cycle is in progress: the gradients gathered so far must be kept
+            return
+        self.optim.zero_grad(*args, **kwargs)
+
+    def step(self, *args, **kwargs) -> None:
+        """
+        Update the model parameters once the accumulation cycle is complete.
+
+        Returns ``None`` without touching the wrapped optimizer while fewer than ``accumulate_size``
+        backward passes were recorded; otherwise resets the counter and returns the result of the
+        wrapped optimizer's ``step``.
+
+        Args:
+            *args: positional arguments for the optimizer wrapped
+            **kwargs: keyword arguments for the optimizer wrapped
+        """
+
+        if self.accumulate_step >= self.accumulate_size:
+            self.accumulate_step = 0
+            return self.optim.step(*args, **kwargs)
+        return None
+
+    def clip_grad_norm(self, model: nn.Module, max_norm: float) -> None:
+        """
+        Clip gradients by norm; skipped until the accumulation cycle is complete.
+
+        Args:
+            model (:class:`torch.nn.Module`): a torch module instance
+            max_norm (float): the max norm for gradient clipping
+        """
+
+        if self.accumulate_step >= self.accumulate_size:
+            self.optim.clip_grad_norm(model, max_norm)
+
+    def backward(self, loss: Tensor) -> None:
+        """Execute backward pass on the loss divided by ``accumulate_size``.
+
+        Args:
+            loss (:class:`torch.Tensor`): the loss value.
+        """
+
+        self.accumulate_step += 1
+
+        if not self.is_torch_ddp:
+            self.optim.backward(loss / self.accumulate_size)
+            return
+
+        # under DDP, gradient synchronisation is only wanted on the last pass of the cycle
+        skip_sync = self.accumulate_step < self.accumulate_size
+        with conditional_context(self.model.no_sync(), enable=skip_sync):
+            self.optim.backward(loss / self.accumulate_size)
+
+    def backward_by_grad(self, tensor: Tensor, grad: Tensor) -> None:
+        """Execute backward pass given the gradients of the output.
+
+        Unlike :meth:`backward`, nothing is rescaled here: ``tensor`` and ``grad`` are forwarded as given.
+
+        Args:
+            tensor (:class:`torch.Tensor`): the output tensor to back-propagate from.
+            grad (:class:`torch.Tensor`): the output gradient.
+        """
+
+        self.accumulate_step += 1
+
+        if self.is_torch_ddp and self.accumulate_step < self.accumulate_size:
+            # intermediate pass of a DDP model: keep the gradients local
+            with self.model.no_sync():
+                self.optim.backward_by_grad(tensor, grad)
+        else:
+            self.optim.backward_by_grad(tensor, grad)
+
+
+class GradAccumDataloader:
+    """A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.
+
+    Attributes that are not defined on the wrapper itself are looked up on the wrapped dataloader.
+
+    Note:
+        The dataloader would drop the last incomplete steps for gradient accumulation.
+        For example, if a dataloader has 10 batches of data and accumulate size is 4. The model parameters will
+        be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
+        Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
+        (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
+
+    Args:
+        dataloader (``Iterable``): Your dataloader object for gradient accumulation.
+        accumulate_size (int): The number of steps to accumulate gradients.
+    """
+
+    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
+        self.dataloader = dataloader
+        # only loaders that are not a torch DataLoader get their leftover batches drained
+        self.consume_remain_data = not isinstance(dataloader, DataLoader)
+        # largest multiple of accumulate_size that does not exceed the wrapped loader's length
+        total_steps = len(dataloader)
+        self.steps_per_epoch = total_steps - total_steps % accumulate_size
+
+    def __getattr__(self, __name: str) -> Any:
+        """Forward lookups of attributes missing on the wrapper to the wrapped dataloader.
+
+        Raises:
+            AttributeError: if the wrapper has no ``dataloader`` yet, or the wrapped object lacks the attribute.
+        """
+        # ``__getattr__`` only runs when normal lookup failed. If ``dataloader`` itself is what is
+        # missing (instance built without ``__init__``, e.g. by copy/pickle), reading
+        # ``self.dataloader`` below would re-enter this method forever, so fail cleanly instead.
+        if __name == 'dataloader':
+            raise AttributeError(f'{type(self).__name__!r} object has no attribute {__name!r}')
+        return getattr(self.dataloader, __name)
+
+    def __len__(self) -> int:
+        """Return the number of batches yielded per epoch (a multiple of ``accumulate_size``)."""
+        return self.steps_per_epoch
+
+    def __iter__(self) -> Iterable:
+        """Start a new epoch: reset the step counter and re-create the wrapped iterator."""
+        self._cur_step = 0
+        self._dataiter = iter(self.dataloader)
+        return self
+
+    def __next__(self) -> Union[Tensor, Tuple[Tensor]]:
+        """Return the next batch, stopping after ``steps_per_epoch`` batches.
+
+        Raises:
+            StopIteration: once ``steps_per_epoch`` batches were produced in the current epoch.
+        """
+        if self._cur_step >= self.steps_per_epoch:
+            raise StopIteration
+
+        self._cur_step += 1
+        data = next(self._dataiter)
+
+        if self._cur_step == self.steps_per_epoch and self.consume_remain_data:
+            # this is to handle non standard pytorch dataloader such as dali dataloader:
+            # read and discard whatever is left in the wrapped iterator
+            while True:
+                try:
+                    next(self._dataiter)
+                except StopIteration:
+                    break
+        return data
+
+
+class GradAccumLrSchedulerByStep(_LRScheduler):
+    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
+    before accumulation size is reached.
+
+    The base class initializer is intentionally not called: all scheduler state lives in the wrapped
+    object, and attributes that are not found on the wrapper are looked up on it.
+
+    Args:
+        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
+            Your ``lr_scheduler`` object for gradient accumulation.
+        accumulate_size (int): The number of steps to accumulate gradients.
+    """
+
+    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
+        self.lr_scheduler = lr_scheduler
+        self.accumulate_size = accumulate_size
+        # number of step() calls seen since the wrapped scheduler was last stepped
+        self.accumulate_step = 0
+
+    @staticmethod
+    def compute_effective_steps_per_epoch(dataloader: Iterable, accumulate_size: int) -> int:
+        """
+        Computes the number of effective training iterations. An effective iteration is defined
+        as the aggregation of <accumulate_size> iterations. For example, if accumulate_size = 4,
+        then 4 iterations are considered as one effective iteration.
+
+        Args:
+            dataloader (``Iterable``): Your dataloader object for gradient accumulation.
+            accumulate_size (int): The number of steps to accumulate gradients.
+
+        Returns:
+            int: ``len(dataloader)`` divided by ``accumulate_size``, rounded down.
+        """
+        return len(dataloader) // accumulate_size
+
+    def __getattr__(self, __name: str) -> Any:
+        """Forward lookups of attributes missing on the wrapper to the wrapped scheduler.
+
+        Raises:
+            AttributeError: if the wrapper has no ``lr_scheduler`` yet, or the wrapped object lacks the attribute.
+        """
+        # ``__getattr__`` only runs when normal lookup failed. If ``lr_scheduler`` itself is what is
+        # missing (instance built without ``__init__``, e.g. by copy/pickle), reading
+        # ``self.lr_scheduler`` below would re-enter this method forever, so fail cleanly instead.
+        if __name == 'lr_scheduler':
+            raise AttributeError(f'{type(self).__name__!r} object has no attribute {__name!r}')
+        return getattr(self.lr_scheduler, __name)
+
+    def step(self, *args, **kwargs) -> None:
+        """
+        Update the learning rate on every ``accumulate_size``-th call; other calls only count.
+
+        Args:
+            *args: positional arguments for the lr scheduler wrapped.
+            **kwargs: keyword arguments for the lr scheduler wrapped.
+        """
+        self.accumulate_step += 1
+        if self.accumulate_step >= self.accumulate_size:
+            self.accumulate_step = 0
+            self.lr_scheduler.step(*args, **kwargs)
+
+    def get_lr(self) -> list:
+        """
+        Compute the next learning rate.
+
+        Returns:
+            The value of the wrapped scheduler's ``get_lr()``, passed through unchanged
+            (for the stock PyTorch schedulers this is a list of learning rates).
+        """
+
+        return self.lr_scheduler.get_lr()
+
+    def get_last_lr(self) -> list:
+        """
+        Returns the current learning rate.
+
+        Returns:
+            The value of the wrapped scheduler's ``get_last_lr()``, passed through unchanged
+            (for the stock PyTorch schedulers this is a list of learning rates).
+        """
+
+        return self.lr_scheduler.get_last_lr()
+
+    def print_lr(self, *args, **kwargs) -> None:
+        """
+        Print the learning rate.
+
+        Args:
+            *args: positional arguments for the lr scheduler wrapped.
+            **kwargs: keyword arguments for the lr scheduler wrapped.
+        """
+        self.lr_scheduler.print_lr(*args, **kwargs)
+
+    def state_dict(self) -> dict:
+        """
+        Returns the states of the lr scheduler as dictionary.
+
+        Only the wrapped scheduler's state is returned; the wrapper's own accumulation counter is not included.
+
+        Returns:
+            dict: the states of the lr scheduler.
+        """
+        return self.lr_scheduler.state_dict()
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        """
+        Load the states of the lr scheduler from a dictionary object.
+
+        Args:
+            state_dict (dict): the states of the lr scheduler, handed to the wrapped scheduler as is.
+        """
+        self.lr_scheduler.load_state_dict(state_dict)
+
+
+class GradAccumGradientHandler:
+    r"""Gradient handler wrapper that invokes the wrapped handler only once per accumulation cycle.
+
+    Each call to :meth:`handle_gradient` is counted; the wrapped handler runs on every
+    ``accumulate_size``-th call and the counter then starts over.
+
+    Args:
+        grad_handler (:class:`colossalai.legacy.engine.BaseGradientHandler`):
+            Your ``gradient_handler`` object for gradient accumulation, would be called when achieving `accumulate_size`.
+        accumulate_size (int): The number of steps to accumulate gradients.
+
+    More details about ``gradient_handlers`` could be found in
+    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
+
+    """
+
+    def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
+        # NOTE(review): this type check is an assert, so it is skipped when Python runs with -O
+        assert isinstance(grad_handler, BaseGradientHandler), \
+            f'expected grad_handler to be type BaseGradientHandler, but got {type(grad_handler)}'
+
+        # number of handle_gradient() calls seen in the current accumulation cycle
+        self.accumulate_step = 0
+        self.accumulate_size = accumulate_size
+        self.grad_handler = grad_handler
+
+    def handle_gradient(self) -> None:
+        """
+        Count this call and run the wrapped handler only on the last step of the accumulation cycle.
+        """
+
+        self.accumulate_step += 1
+        if self.accumulate_step >= self.accumulate_size:
+            # cycle complete: start the next one and let the wrapped handler do its work
+            self.accumulate_step = 0
+            self.grad_handler.handle_gradient()