mirror of https://github.com/hpcaitech/ColossalAI.git
add moe context, moe utilities and refactor gradient handler (#455)
colossalai/context/__init__.py
@@ -1,5 +1,6 @@
 from .config import Config, ConfigException
 from .parallel_context import ParallelContext
+from .moe_context import MoeContext
 from .parallel_mode import ParallelMode
 from .process_group_initializer import *
 from .random import *
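Re-exporting MoeContext from the package root lets downstream code obtain the singleton without a deep import, as in the short sketch below (the variable name is an assumption, not part of this commit):

from colossalai.context import MoeContext

moe_ctx = MoeContext.get_instance()    # lazily creates the process-wide singleton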
colossalai/context/moe_context.py (new file, 151 lines)
@@ -0,0 +1,151 @@
import torch
import torch.distributed as dist

from .parallel_mode import ParallelMode


def _check_sanity():
    from colossalai.core import global_context as gpc
    if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1:
        raise NotImplementedError("MoE is not compatible with tensor or "
                                  "pipeline parallel at present.")


class MoeInfo:
    """MoE parallelism information, storing parallel sizes and groups."""

    def __init__(self, ep_size: int, dp_size: int):
        _check_sanity()
        self.ep_size = ep_size
        self.dp_size = dp_size
        self.ep_group = None
        # Data parallel group for experts; since ep_group is different,
        # this dp_group may differ from get_group(ParallelMode.DATA).
        self.dp_group = None

        # Here we assume tensor parallel size = 1; otherwise MoE can't be used.
        # Since the TENSOR and DATA parallel groups have already been created,
        # we can use them directly.
        if ep_size == 1:
            from colossalai.core import global_context as gpc
            self.ep_group = gpc.get_group(ParallelMode.TENSOR)
            self.dp_group = gpc.get_group(ParallelMode.DATA)
            return

        if dp_size == 1:
            from colossalai.core import global_context as gpc
            self.ep_group = gpc.get_group(ParallelMode.DATA)
            self.dp_group = gpc.get_group(ParallelMode.TENSOR)
            return

        rank = dist.get_rank()
        # Create expert parallel groups
        for i in range(dp_size):
            ranks = [i * ep_size + j for j in range(ep_size)]
            group = dist.new_group(ranks)
            if rank in ranks:
                self.ep_group = group

        # Create data parallel groups
        for j in range(ep_size):
            ranks = [i * ep_size + j for i in range(dp_size)]
            group = dist.new_group(ranks)
            if rank in ranks:
                self.dp_group = group


class MoeContext:
    """MoE parallel context manager. This class manages the different
    parallel groups in the MoE context and the MoE auxiliary loss used in training.
    """
    __instance = None

    @staticmethod
    def get_instance():
        if MoeContext.__instance is None:
            MoeContext.__instance = MoeContext()
        return MoeContext.__instance

    def __init__(self):
        self.world_size = 1
        # Users may want to set the maximum expert parallel size smaller than the world size,
        # since very low bandwidth across nodes may constrain the performance of MoE.
        # Fixing a maximum expert parallel size naturally gives a minimum data parallel size.
        self.max_ep_size = 1
        self.min_dp_size = 1
        self.aux_loss = None
        self.use_kernel_optim = True

        self.has_setup = False
        self._info_dict = dict()

    @property
    def information(self):
        return self._info_dict

    @property
    def is_initialized(self):
        return self.has_setup

    def setup(self, seed: int, use_kernel_optim: bool = True):

        assert not self.is_initialized, "MoE distributed context shouldn't be set up again"
        _check_sanity()
        assert torch.cuda.is_available(), "MoE requires CUDA to be enabled first"

        self.world_size = dist.get_world_size()

        from colossalai.core import global_context as gpc
        self.max_ep_size = gpc.config.get('max_ep_size', self.world_size)
        assert self.world_size % self.max_ep_size == 0, \
            "Maximum expert parallel size must be a factor of the number of GPUs"
        self.min_dp_size = self.world_size // self.max_ep_size

        # Enabling kernel optimization may raise errors in some cases.
        # Users can turn kernel optimization off manually.
        self.use_kernel_optim = use_kernel_optim

        from .random import moe_set_seed
        moe_set_seed(seed)
        self.has_setup = True

    def get_info(self, num_experts: int):
        """Automatically deploys experts and returns parallel information about
        the distributed communication groups.
        """

        gt_flag = num_experts % self.max_ep_size == 0    # num_experts is a multiple of max_ep_size
        lt_flag = self.max_ep_size % num_experts == 0    # max_ep_size is a multiple of num_experts

        assert gt_flag or lt_flag, "Automatic expert placement does not support this situation right now."

        # If the number of experts is greater than the maximum expert parallel size,
        # each GPU holds multiple experts and every GPU holds different experts,
        # so its data parallel size is 1.
        # Otherwise, there is only one expert on each GPU and
        # the data parallel size has to be calculated.
        dp_size = 1 if gt_flag else self.max_ep_size // num_experts
        ep_size = self.max_ep_size // dp_size

        # Calculate the number of experts on each GPU
        num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size

        # Don't forget to multiply by the minimum data parallel size
        dp_size *= self.min_dp_size
        if ep_size not in self.information:
            self.information[ep_size] = MoeInfo(ep_size, dp_size)

        return num_local_experts, self.information[ep_size]

    def set_kernel_not_use(self):
        self.use_kernel_optim = False

    def reset_loss(self):
        self.aux_loss = 0

    def add_loss(self, loss):
        self.aux_loss += loss

    def get_loss(self):
        return self.aux_loss
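For reference, the following is a minimal sketch of the rank-to-group layout that MoeInfo builds when both ep_size > 1 and dp_size > 1, followed by a hypothetical driver that exercises the new MoeContext API. The world size, seed, and expert count are illustrative assumptions, not values from this commit; the driver further assumes torch.distributed and the ColossalAI global context have already been initialized by a launcher.

# Pure-Python illustration of the group arithmetic in MoeInfo.__init__,
# assuming world_size = ep_size * dp_size = 8 (no process groups are created here).
ep_size, dp_size = 4, 2

# Expert parallel groups: one group of ep_size consecutive ranks per data parallel replica.
ep_groups = [[i * ep_size + j for j in range(ep_size)] for i in range(dp_size)]
# -> [[0, 1, 2, 3], [4, 5, 6, 7]]

# Data parallel groups for experts: the ranks at the same offset across replicas.
dp_groups = [[i * ep_size + j for i in range(dp_size)] for j in range(ep_size)]
# -> [[0, 4], [1, 5], [2, 6], [3, 7]]
print(ep_groups, dp_groups)


def hypothetical_moe_setup(seed: int = 42, num_experts: int = 16):
    """Hypothetical usage of MoeContext; assumes a distributed run with CUDA available."""
    from colossalai.context import MoeContext
    moe_ctx = MoeContext.get_instance()    # process-wide singleton
    moe_ctx.setup(seed)                    # reads 'max_ep_size' from gpc.config and seeds the MoE RNG
    num_local_experts, info = moe_ctx.get_info(num_experts)
    # info.ep_group / info.dp_group are the torch.distributed groups for the experts.
    moe_ctx.reset_loss()
    # ... during the forward pass, MoE layers would call moe_ctx.add_loss(aux_loss) ...
    return num_local_experts, info.ep_size, info.dp_size

With the assumed numbers above (8 GPUs, max_ep_size left at its default, 16 experts), get_info would return num_local_experts = 2 with ep_size = 8 and dp_size = 1.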
colossalai/context/parallel_context.py
@@ -9,7 +9,6 @@ import torch
 import torch.distributed as dist
 from colossalai.constants import ALLOWED_MODES, INITIALIZER_MAPPING
 from colossalai.context.config import Config
-from colossalai.global_variables import moe_env
 from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.logging import get_dist_logger
 from colossalai.registry import DIST_GROUP_INITIALIZER
@@ -407,13 +406,6 @@ class ParallelContext:
             # add this config to initialize later
             pg_init.append(dict(type=INITIALIZER_MAPPING[tensor_parallel_mode.lower()], **tensor_parallel_cfg))

-        # initialization for moe environment
-        if parallel_config is not None and 'moe' in parallel_config:
-            param = parallel_config['moe']
-            assert 'size' in param, "Moe model parallel size should be given"
-            moe_env.setup(param['size'])
-            pg_init.append(dict(type=INITIALIZER_MAPPING['moe']))
-
         # run initialization of different process groups
         for initializer_cfg in pg_init:
             cfg = initializer_cfg.copy()
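As the removed lines above indicate, ParallelContext no longer bootstraps MoE from a 'moe' entry in the parallel config via moe_env; the new MoeContext instead reads an optional max_ep_size from the global config in its setup method. A minimal sketch of such a config entry follows; the value 4 is an illustrative assumption, not taken from this commit.

# Hypothetical fragment of a ColossalAI config file, exposed through gpc.config.
# 'max_ep_size' is the key queried in MoeContext.setup; it must divide the number
# of GPUs and defaults to the world size when omitted.
max_ep_size = 4    # on 8 GPUs this leaves a minimum data parallel size of 2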
colossalai/context/random (seed helpers)
@@ -147,15 +147,10 @@ def with_seed(func, parallel_mode: ParallelMode):
 def moe_set_seed(seed):
     if torch.cuda.is_available():
         from colossalai.core import global_context as gpc
-        moe_mp_rank = gpc.get_local_rank(ParallelMode.MOE_MODEL)
-        moe_mp_seed = seed + moe_mp_rank
-        add_seed(ParallelMode.MOE_MODEL, moe_mp_seed)
-
         global_rank = gpc.get_global_rank()
-        add_seed(ParallelMode.TENSOR, global_rank, True)
-        print(f"moe seed condition: {global_rank} with moe seed {moe_mp_seed}, ",
-              f"tensor seed {global_rank}",
-              flush=True)
+        diff_seed = seed + global_rank
+        add_seed(ParallelMode.TENSOR, diff_seed, True)
+        print(f"moe seed condition: {global_rank} with tensor seed {diff_seed}", flush=True)


 def reset_seeds():
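As the removed and added lines suggest, the refactored moe_set_seed drops the MOE_MODEL-specific seed and simply gives every rank a distinct TENSOR-mode seed derived from its global rank, presumably so that expert parameters created under that mode are initialized differently on each rank. A quick sketch of the arithmetic, using an illustrative base seed and world size (assumptions, not values from the commit):

# Per-rank seed arithmetic from the new moe_set_seed, computed offline.
seed, world_size = 1024, 4    # illustrative assumptions
tensor_seeds = {rank: seed + rank for rank in range(world_size)}
print(tensor_seeds)           # {0: 1024, 1: 1025, 2: 1026, 3: 1027}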