[booster] refactor all dp fashion plugins (#3684)

* [booster] add dp plugin base

* [booster] inherit dp plugin base

* [booster] refactor unit tests
Hongxin Liu authored 2023-05-05 19:36:10 +08:00, committed by GitHub
parent b49020c1b1
commit d0915f54f4
8 changed files with 190 additions and 308 deletions
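The commit message above describes the shape of the refactor: the data-parallel boilerplate that every "dp fashion" plugin duplicated (the torch.distributed initialization check, the rank/world-size bookkeeping, and the deterministic dataloader helper) moves into a shared base class, DPPluginBase, which the concrete plugins then inherit. The new dp_plugin_base.py itself is not part of the excerpt below, so the following is only a rough sketch of that pattern, reconstructed from the code this diff removes; the prepare_dataloader name, the exact signature, and the base-class wiring are assumptions, not the file's verbatim contents.

# Illustrative sketch only -- not the actual dp_plugin_base.py from this commit.
# It gathers the pieces that each DP-style plugin used to carry on its own
# (see the removed prepare_train_dataloader in the diff below) into one base class.
import random

import numpy as np
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler


class DPPluginBase:    # in the repo this would presumably subclass the shared plugin interface

    def __init__(self) -> None:
        # Same environment check and rank bookkeeping that the diff removes from LowLevelZeroPlugin.
        assert dist.is_initialized(), \
            'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment'
        self.rank = dist.get_rank()
        self.world_size = dist.get_world_size()

    def prepare_dataloader(self, dataset, batch_size, shuffle=False, seed=1024,
                           drop_last=False, pin_memory=False, num_workers=0, **kwargs):
        # Shard the dataset across ranks and seed each worker deterministically,
        # mirroring the per-plugin helper that this commit deletes.
        sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle)

        def seed_worker(worker_id):
            # worker_id is intentionally unused, as in the removed implementation.
            np.random.seed(seed)
            torch.manual_seed(seed)
            random.seed(seed)

        return DataLoader(dataset,
                          batch_size=batch_size,
                          sampler=sampler,
                          worker_init_fn=seed_worker,
                          drop_last=drop_last,
                          pin_memory=pin_memory,
                          num_workers=num_workers,
                          **kwargs)

With the shared logic hoisted into the base class, a concrete plugin such as LowLevelZeroPlugin only needs to call super().__init__() in its constructor, which is exactly the one-line addition visible in the diff below.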

@@ -1,24 +1,20 @@
-import random
 import warnings
 from typing import Callable, List, Optional, Tuple, Union
 
-import numpy as np
 import torch
-import torch.distributed as dist
 import torch.nn as nn
 from torch import Tensor
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils._pytree import tree_map
 from torch.utils.data import DataLoader
-from torch.utils.data.distributed import DistributedSampler
 
 from colossalai.checkpoint_io import CheckpointIO
 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.utils import get_current_device
 from colossalai.zero import zero_model_wrapper, zero_optim_wrapper
 
-from .plugin_base import Plugin
+from .dp_plugin_base import DPPluginBase
 from .torch_ddp_plugin import TorchDDPCheckpointIO
 
 __all__ = ['LowLevelZeroPlugin']
@@ -88,7 +84,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
         raise NotImplementedError('LowLevelZero does not support clip_grad_by_value')
 
 
-class LowLevelZeroPlugin(Plugin):
+class LowLevelZeroPlugin(DPPluginBase):
     """
     Plugin for low level zero.
@@ -142,15 +138,10 @@ class LowLevelZeroPlugin(Plugin):
         cpu_offload: bool = False,
         verbose: bool = False,
     ) -> None:
-        assert dist.is_initialized(
-        ), 'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment'
+        super().__init__()
         assert stage in (1, 2), f'LowLevelZeroPlugin only supports stage 1/2 training'
         assert precision in ('fp16', 'fp32'), f'LowLevelZeroPlugin only supports fp16/fp32 training'
-        self.rank = dist.get_rank()
-        self.world_size = dist.get_world_size()
         self.stage = stage
         self.precision = precision
         self.zero_optim_config = dict(reduce_bucket_size=reduce_bucket_size_in_m * 1024 * 1024,
@@ -183,57 +174,6 @@ class LowLevelZeroPlugin(Plugin):
     def supported_devices(self) -> List[str]:
         return ['cuda']
 
-    def prepare_train_dataloader(self,
-                                 dataset,
-                                 batch_size,
-                                 shuffle=False,
-                                 seed=1024,
-                                 drop_last=False,
-                                 pin_memory=False,
-                                 num_workers=0,
-                                 **kwargs):
-        r"""
-        Prepare a dataloader for distributed training. The dataloader will be wrapped by
-        `torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`.
-
-        Note:
-            1. Evaluation datasets should not be passed to this function.
-
-        Args:
-            dataset (`torch.utils.data.Dataset`): The dataset to be loaded.
-            shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
-            seed (int, optional): Random worker seed for sampling, defaults to 1024.
-            add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
-            drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
-                is not divisible by the batch size. If False and the size of dataset is not divisible by
-                the batch size, then the last batch will be smaller, defaults to False.
-            pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
-            num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
-            kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
-                `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
-
-        Returns:
-            :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
-        """
-        _kwargs = kwargs.copy()
-        sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle)
-
-        # Deterministic dataloader
-        def seed_worker(worker_id):
-            worker_seed = seed
-            np.random.seed(worker_seed)
-            torch.manual_seed(worker_seed)
-            random.seed(worker_seed)
-
-        return DataLoader(dataset,
-                          batch_size=batch_size,
-                          sampler=sampler,
-                          worker_init_fn=seed_worker,
-                          drop_last=drop_last,
-                          pin_memory=pin_memory,
-                          num_workers=num_workers,
-                          **_kwargs)
-
     def configure(
         self,
         model: nn.Module,