Mirror of https://github.com/hpcaitech/ColossalAI.git
[zero] adapt zero for unsharded parameters (#561)
* support existing sharded and unsharded parameters in zero
* add unit test for moe-zero model init
* polish moe gradient handler
@@ -35,7 +35,7 @@ class Experts(MoeExperts):
         expert_args: Args used to initialize experts, the args could be found in corresponding expert class
     """

-    @no_shard_zero_decrator
+    @no_shard_zero_decrator(is_replicated=False)
    def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args):
        super().__init__("all_to_all", num_experts)

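Note: with `is_replicated` passed at the call site, no_shard_zero_decrator is used as a decorator factory rather than a bare decorator, which is why the parentheses are now required. A minimal sketch of that pattern, assuming the flag is simply forwarded to the no_shard_zero_context imported in the next hunk; the real implementation in colossalai.zero.init_ctx may differ in detail:

# Sketch only: the decorator-factory shape implied by the new call syntax.
# The body is an assumption, not a copy of the ColossalAI implementation.
import functools

from colossalai.zero.init_ctx import no_shard_zero_context


def no_shard_zero_decrator(is_replicated: bool = True):

    def _wrapper(init_func):

        @functools.wraps(init_func)
        def _no_shard_init(*args, **kwargs):
            # Keep parameters created inside __init__ unsharded; the flag
            # records whether they are replicated across data-parallel ranks.
            with no_shard_zero_context(is_replicated):
                init_func(*args, **kwargs)

        return _no_shard_init

    return _wrapper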
@@ -9,7 +9,7 @@ from colossalai.context.moe_context import MOE_CONTEXT
 from colossalai.utils import get_current_device
 from ._operation import COL_MOE_KERNEL_FLAG, AllToAll, AllGather, ReduceScatter, MoeDispatch, MoeCombine, moe_cumsum
 from .experts import MoeExperts, Experts
-from .utils import ForceFP32Parameter, UniformNoiseGenerator, NormalNoiseGenerator
+from .utils import ForceFP32Parameter, UniformNoiseGenerator, NormalNoiseGenerator, autocast_softmax
 from colossalai.zero.init_ctx import no_shard_zero_context, no_shard_zero_decrator
 from typing import Callable, Optional, Type
 from torch.distributed import ProcessGroup
@@ -66,7 +66,7 @@ class Top1Router(nn.Module):
         if self.noisy_func is not None and self.training:
             inputs = self.noisy_func(inputs)

-        logits = F.softmax(inputs, dim=-1)
+        logits = autocast_softmax(inputs, dim=-1)
         num_experts = logits.size(-1)
         capacity = self.get_capacity(logits.shape)

@@ -152,7 +152,7 @@ class Top2Router(nn.Module):
         if self.noisy_func is not None and self.training:
             inputs = self.noisy_func(inputs)

-        logits = F.softmax(inputs, dim=-1)    # logits: [s, e]
+        logits = autocast_softmax(inputs, dim=-1)    # logits: [s, e]
         num_experts = logits.size(-1)
         capacity = self.get_capacity(logits.shape)

@@ -241,7 +241,7 @@ class MoeLayer(nn.Module):
         experts (:class:`torch.nn.Module`): Instance of experts generated by Expert.
     """

-    @no_shard_zero_decrator
+    @no_shard_zero_decrator(is_replicated=True)
    def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts):
        super().__init__()
        self.d_model = dim_model
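The two flags encode where the unsharded parameters are identical: MoeLayer's own parameters (e.g. the gate) are the same on every rank, hence is_replicated=True, while each expert-parallel rank in Experts holds a different slice of experts, hence is_replicated=False. A hedged sketch of what that distinction implies for gradient synchronization, with hypothetical function and group names rather than the actual MoE gradient handler this commit polishes:

# Hypothetical sketch, not the ColossalAI gradient handler: it only illustrates
# why replicated and non-replicated unsharded parameters are synced differently.
import torch.distributed as dist


def sync_moe_grads(replicated_params, expert_params, dp_group, moe_dp_group):
    # Replicated parameters (e.g. the MoeLayer gate) exist identically on every
    # rank, so their gradients are averaged over the whole data-parallel group.
    for p in replicated_params:
        if p.grad is not None:
            dist.all_reduce(p.grad, group=dp_group)
            p.grad /= dist.get_world_size(dp_group)

    # Expert parameters are partitioned across the expert-parallel group; only
    # the ranks holding the same experts (their own data-parallel subgroup)
    # average gradients with each other.
    for p in expert_params:
        if p.grad is not None:
            dist.all_reduce(p.grad, group=moe_dp_group)
            p.grad /= dist.get_world_size(moe_dp_group)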
@@ -1,4 +1,5 @@
 import torch
+import torch.nn.functional as F
 from colossalai.utils import get_current_device
 from colossalai.context.moe_context import MOE_CONTEXT
 from .experts import FFNExperts, TPExperts
@@ -51,6 +52,12 @@ class UniformNoiseGenerator:
         return inputs * noisy


+def autocast_softmax(logit: torch.Tensor, dim: int):
+    if logit.dtype != torch.float32:
+        logit = logit.float()
+    return F.softmax(logit, dim=dim)
+
+
 def build_ffn_experts(num_experts: int, d_model: int, d_ff: int, activation=None, drop_rate: float = 0):
     mep_size = MOE_CONTEXT.max_ep_size
     if num_experts % mep_size == 0 or mep_size % num_experts == 0:
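A quick usage check of the helper added above; the definition is copied verbatim from this hunk so the snippet runs standalone, and the example input is illustrative:

import torch
import torch.nn.functional as F


def autocast_softmax(logit: torch.Tensor, dim: int):
    if logit.dtype != torch.float32:
        logit = logit.float()
    return F.softmax(logit, dim=dim)


gate_logits = torch.randn(16, 4).half()   # simulated fp16 router logits
probs = autocast_softmax(gate_logits, dim=-1)

print(probs.dtype)        # torch.float32: the softmax is computed in fp32
print(probs.sum(dim=-1))  # each row sums to 1 (up to rounding)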