[zero] adapt zero for unsharded parameters (#561)

* support existing sharded and unsharded parameters in zero
* add unit test for moe-zero model init
* polish moe gradient handler
2025-09-18 16:00:49 +00:00 · 2022-03-31 18:34:11 +08:00
parent 13ed4b6441
commit e6d50ec107
11 changed files with 211 additions and 70 deletions
--- a/colossalai/nn/layer/moe/experts.py
+++ b/colossalai/nn/layer/moe/experts.py
@@ -35,7 +35,7 @@ class Experts(MoeExperts):
        expert_args: Args used to initialize experts, the args could be found in corresponding expert class
    """

-    @no_shard_zero_decrator
+    @no_shard_zero_decrator(is_replicated=False)
    def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args):
        super().__init__("all_to_all", num_experts)

--- a/colossalai/nn/layer/moe/layers.py
+++ b/colossalai/nn/layer/moe/layers.py
@@ -9,7 +9,7 @@ from colossalai.context.moe_context import MOE_CONTEXT
 from colossalai.utils import get_current_device
 from ._operation import COL_MOE_KERNEL_FLAG, AllToAll, AllGather, ReduceScatter, MoeDispatch, MoeCombine, moe_cumsum
 from .experts import MoeExperts, Experts
-from .utils import ForceFP32Parameter, UniformNoiseGenerator, NormalNoiseGenerator
+from .utils import ForceFP32Parameter, UniformNoiseGenerator, NormalNoiseGenerator, autocast_softmax
 from colossalai.zero.init_ctx import no_shard_zero_context, no_shard_zero_decrator
 from typing import Callable, Optional, Type
 from torch.distributed import ProcessGroup
@@ -66,7 +66,7 @@ class Top1Router(nn.Module):
        if self.noisy_func is not None and self.training:
            inputs = self.noisy_func(inputs)

-        logits = F.softmax(inputs, dim=-1)
+        logits = autocast_softmax(inputs, dim=-1)
        num_experts = logits.size(-1)
        capacity = self.get_capacity(logits.shape)

@@ -152,7 +152,7 @@ class Top2Router(nn.Module):
        if self.noisy_func is not None and self.training:
            inputs = self.noisy_func(inputs)

-        logits = F.softmax(inputs, dim=-1)    # logits: [s, e]
+        logits = autocast_softmax(inputs, dim=-1)    # logits: [s, e]
        num_experts = logits.size(-1)
        capacity = self.get_capacity(logits.shape)

@@ -241,7 +241,7 @@ class MoeLayer(nn.Module):
        experts (:class:`torch.nn.Module`): Instance of experts generated by Expert.
    """

-    @no_shard_zero_decrator
+    @no_shard_zero_decrator(is_replicated=True)
    def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts):
        super().__init__()
        self.d_model = dim_model
--- a/colossalai/nn/layer/moe/utils.py
+++ b/colossalai/nn/layer/moe/utils.py
@@ -1,4 +1,5 @@
 import torch
+import torch.nn.functional as F
 from colossalai.utils import get_current_device
 from colossalai.context.moe_context import MOE_CONTEXT
 from .experts import FFNExperts, TPExperts
@@ -51,6 +52,12 @@ class UniformNoiseGenerator:
        return inputs * noisy


+def autocast_softmax(logit: torch.Tensor, dim: int):    # softmax along `dim`, always evaluated on a float32 tensor
+    if logit.dtype != torch.float32:    # any non-float32 input is upcast before the softmax
+        logit = logit.float()
+    return F.softmax(logit, dim=dim)    # the input handed to F.softmax is float32 in every case
+
+
 def build_ffn_experts(num_experts: int, d_model: int, d_ff: int, activation=None, drop_rate: float = 0):
    mep_size = MOE_CONTEXT.max_ep_size
    if num_experts % mep_size == 0 or mep_size % num_experts == 0: