[legacy] move engine to legacy (#4560)

* [legacy] move engine to legacy * [example] fix seq parallel example * [example] fix seq parallel example * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [example] update seq parallel requirements
2025-09-10 21:40:02 +00:00 · 2023-09-04 11:33:40 +08:00
parent 89fe027787
commit 8accecd55b
39 changed files with 93 additions and 105 deletions
--- a/colossalai/legacy/engine/schedule/_base_schedule.py
+++ b/colossalai/legacy/engine/schedule/_base_schedule.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from abc import ABC, abstractmethod
+from typing import Callable, Iterable
+
+import torch
+
+from colossalai.logging import get_dist_logger
+from colossalai.utils import get_current_device
+
+
+class BaseSchedule(ABC):
+    """Abstract base for schedules that drive one training or evaluation step.
+
+    Subclasses implement :meth:`forward_backward_step`. This base class supplies
+    the shared helpers for loading a batch from an iterator, moving it to the
+    current device, and dispatching inputs/outputs to the engine and its
+    criterion according to their container type.
+
+    Args:
+        data_process_func (Callable, optional): The preprocessing function which receives a batch of data
+            and arranges them into data and label. It is only stored here; subclasses decide how to use it.
+    """
+
+    def __init__(self, data_process_func: Callable = None):
+        self.logger = get_dist_logger()
+        self.data_process_func = data_process_func
+
+    @staticmethod
+    def _move_tensor(element):
+        """Move a single non-CUDA tensor to the current device.
+
+        Non-tensor values and tensors that are already on a CUDA device are
+        returned unchanged. A tensor that is moved is also detached.
+        """
+        if torch.is_tensor(element) and not element.is_cuda:
+            return element.to(get_current_device()).detach()
+        return element
+
+    def _move_to_device(self, data):
+        """Move a batch to the current device.
+
+        Args:
+            data: A ``torch.Tensor``, a list/tuple whose elements are tensors or
+                dicts of tensors, or a dict of tensors.
+
+        Returns:
+            The batch with its tensors moved. A list or tuple input is always
+            returned as a ``list``.
+
+        Raises:
+            TypeError: If ``data`` is not a tensor, list, tuple, or dict.
+        """
+        if isinstance(data, torch.Tensor):
+            return data.to(get_current_device())
+        if isinstance(data, (list, tuple)):
+            moved = []
+            for element in data:
+                if isinstance(element, dict):
+                    # Only one level of dict nesting is handled inside a sequence.
+                    moved.append({k: self._move_tensor(v) for k, v in element.items()})
+                else:
+                    moved.append(self._move_tensor(element))
+            return moved
+        if isinstance(data, dict):
+            return {k: self._move_tensor(v) for k, v in data.items()}
+        raise TypeError(
+            f"Expected batch data to be of type torch.Tensor, list, tuple, or dict, but got {type(data)}")
+
+    def _get_batch_size(self, data):
+        """Return the size of dimension 0 of the first tensor found in ``data``.
+
+        For a list/tuple the first element is inspected (its first value when
+        that element is a dict); for a dict the first value is inspected.
+        Returns ``None`` when ``data`` is none of the supported container types.
+        """
+        if isinstance(data, torch.Tensor):
+            return data.size(0)
+        if isinstance(data, (list, tuple)):
+            first = data[0]
+            if isinstance(first, dict):
+                return next(iter(first.values())).size(0)
+            return first.size(0)
+        if isinstance(data, dict):
+            return next(iter(data.values())).size(0)
+        # Unsupported container: the batch size is left undefined.
+        return None
+
+    def load_batch(self, data_iter, to_gpu=True):
+        """Load the next batch from a data iterator and record its batch size.
+
+        The batch size is stored on ``self.batch_size``.
+
+        Args:
+            data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
+            to_gpu (bool, optional): Whether the data should be moved to the current device.
+
+        Returns:
+            The batch as produced by ``data_iter``, moved to the current device when ``to_gpu`` is True
+            (in which case list/tuple batches are returned as lists).
+
+        Raises:
+            RuntimeError: If ``data_iter`` is None.
+        """
+        if data_iter is None:
+            raise RuntimeError('Dataloader is not defined.')
+        batch_data = next(data_iter)
+
+        if to_gpu:
+            batch_data = self._move_to_device(batch_data)
+        self.batch_size = self._get_batch_size(batch_data)
+        return batch_data
+
+    def pre_processing(self, engine):
+        """To perform actions before running the schedule. No-op by default.
+        """
+        pass
+
+    @abstractmethod
+    def forward_backward_step(self,
+                              engine,
+                              data_iter: Iterable,
+                              forward_only: bool,
+                              return_loss: bool = True,
+                              return_output_label: bool = True):
+        """The process function over a batch of dataset for training or evaluation.
+
+        Args:
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
+            data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
+            forward_only (bool): If True, the process won't include backward.
+            return_loss (bool, optional): If False, the loss won't be returned.
+            return_output_label (bool, optional): If False, the output and label won't be returned.
+        """
+        pass
+
+    @staticmethod
+    def _call_engine(engine, inputs):
+        """Call ``engine`` with ``inputs`` unpacked according to their type.
+
+        A tensor is passed as the single positional argument, a list/tuple is
+        unpacked positionally, and a dict is unpacked as keyword arguments.
+
+        Raises:
+            TypeError: If ``inputs`` is not a tensor, list, tuple, or dict.
+        """
+        if isinstance(inputs, torch.Tensor):
+            return engine(inputs)
+        if isinstance(inputs, (list, tuple)):
+            return engine(*inputs)
+        if isinstance(inputs, dict):
+            return engine(**inputs)
+        # The exception must be raised; merely constructing it would silently return None.
+        raise TypeError(
+            f"Expected engine inputs to be of type torch.Tensor, list, tuple, or dict, but got {type(inputs)}")
+
+    @staticmethod
+    def _call_engine_criterion(engine, outputs, labels):
+        """Call ``engine.criterion`` with model outputs and labels unpacked by type.
+
+        A bare tensor (outputs or labels) is first wrapped in a one-element
+        tuple. Sequences are unpacked positionally and dicts as keyword
+        arguments; dict outputs require dict labels.
+
+        Raises:
+            AssertionError: If ``outputs`` is not a tensor, list, tuple, or dict.
+            ValueError: If ``outputs`` is a dict but ``labels`` is a list or tuple.
+            TypeError: If ``labels`` is of an unsupported type.
+        """
+        assert isinstance(outputs,
+                          (torch.Tensor, list, tuple,
+                           dict)), f'Expect output of model is (torch.Tensor, list, tuple, dict), got {type(outputs)}'
+        if isinstance(outputs, torch.Tensor):
+            outputs = (outputs,)
+        if isinstance(labels, torch.Tensor):
+            labels = (labels,)
+
+        if isinstance(outputs, (tuple, list)) and isinstance(labels, (tuple, list)):
+            return engine.criterion(*outputs, *labels)
+        elif isinstance(outputs, (tuple, list)) and isinstance(labels, dict):
+            return engine.criterion(*outputs, **labels)
+        elif isinstance(outputs, dict) and isinstance(labels, dict):
+            return engine.criterion(**outputs, **labels)
+        elif isinstance(outputs, dict) and isinstance(labels, (list, tuple)):
+            raise ValueError(f"Expected labels to be a dict when the model outputs are dict, but got {type(labels)}")
+        else:
+            # Adjacent literals are concatenated so the message contains no stray quotes or indentation.
+            raise TypeError("Expected model outputs and labels to be of type torch.Tensor "
+                            "(which is auto-converted to tuple), list, tuple, or dict, "
+                            f"but got {type(outputs)} (model outputs) and {type(labels)} (labels)")