[zero] adapt zero for unsharded parameters (#561)

* support existing sharded and unsharded parameters in zero

* add unit test for moe-zero model init

* polish moe gradient handler
This commit is contained in:
HELSON
2022-03-31 18:34:11 +08:00
committed by GitHub
parent 13ed4b6441
commit e6d50ec107
11 changed files with 211 additions and 70 deletions

View File

@@ -35,7 +35,7 @@ class Experts(MoeExperts):
expert_args: Args used to initialize experts, the args could be found in corresponding expert class
"""
@no_shard_zero_decrator
@no_shard_zero_decrator(is_replicated=False)
def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args):
super().__init__("all_to_all", num_experts)

View File

@@ -9,7 +9,7 @@ from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.utils import get_current_device
from ._operation import COL_MOE_KERNEL_FLAG, AllToAll, AllGather, ReduceScatter, MoeDispatch, MoeCombine, moe_cumsum
from .experts import MoeExperts, Experts
from .utils import ForceFP32Parameter, UniformNoiseGenerator, NormalNoiseGenerator
from .utils import ForceFP32Parameter, UniformNoiseGenerator, NormalNoiseGenerator, autocast_softmax
from colossalai.zero.init_ctx import no_shard_zero_context, no_shard_zero_decrator
from typing import Callable, Optional, Type
from torch.distributed import ProcessGroup
@@ -66,7 +66,7 @@ class Top1Router(nn.Module):
if self.noisy_func is not None and self.training:
inputs = self.noisy_func(inputs)
logits = F.softmax(inputs, dim=-1)
logits = autocast_softmax(inputs, dim=-1)
num_experts = logits.size(-1)
capacity = self.get_capacity(logits.shape)
@@ -152,7 +152,7 @@ class Top2Router(nn.Module):
if self.noisy_func is not None and self.training:
inputs = self.noisy_func(inputs)
logits = F.softmax(inputs, dim=-1) # logits: [s, e]
logits = autocast_softmax(inputs, dim=-1) # logits: [s, e]
num_experts = logits.size(-1)
capacity = self.get_capacity(logits.shape)
@@ -241,7 +241,7 @@ class MoeLayer(nn.Module):
experts (:class:`torch.nn.Module`): Instance of experts generated by Expert.
"""
@no_shard_zero_decrator
@no_shard_zero_decrator(is_replicated=True)
def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts):
super().__init__()
self.d_model = dim_model

View File

@@ -1,4 +1,5 @@
import torch
import torch.nn.functional as F
from colossalai.utils import get_current_device
from colossalai.context.moe_context import MOE_CONTEXT
from .experts import FFNExperts, TPExperts
@@ -51,6 +52,12 @@ class UniformNoiseGenerator:
return inputs * noisy
def autocast_softmax(logit: torch.Tensor, dim: int):
    """Compute softmax along ``dim`` in single precision.

    Args:
        logit: Input tensor. Any dtype other than ``torch.float32`` is
            converted to ``torch.float32`` before the softmax is applied.
        dim: Dimension along which the softmax is computed.

    Returns:
        The result of ``F.softmax`` applied to the float32 input.
    """
    # ``Tensor.float()`` returns the tensor unchanged when it is already
    # float32, so a separate dtype comparison is unnecessary.
    fp32_logit = logit.float()
    return F.softmax(fp32_logit, dim=dim)
def build_ffn_experts(num_experts: int, d_model: int, d_ff: int, activation=None, drop_rate: float = 0):
mep_size = MOE_CONTEXT.max_ep_size
if num_experts % mep_size == 0 or mep_size % num_experts == 0: