[hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)

* [example] pass use_fp8_comm flag to all plugins * [example] add mixtral benchmark * [moe] refine assertion and check * [moe] fix mixtral & add more tests * [moe] consider checking dp * sp group and moe_dp_group * [mixtral] remove gate tp & add more tests * [deepseek] fix tp & sp for deepseek * [mixtral] minor fix * [deepseek] add deepseek benchmark
2025-09-16 06:30:41 +00:00 · 2024-09-10 17:30:53 +08:00
parent 8fd25d6e09
commit c54c4fcd15
21 changed files with 907 additions and 99 deletions
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -64,13 +64,18 @@ class MoeHybridParallelZeroOptimizer(HybridParallelZeroOptimizer):
        forced_dtype: Optional[torch.dtype] = None,
        overlap_allgather: bool = False,
    ):
-        pg_param_list = {
-            dp_process_group: list(filter(lambda p: not is_moe_tensor(p), model.parameters())),
-            moe_dp_group: list(filter(is_moe_tensor, model.parameters())),
-        }
+        if dp_process_group is moe_dp_group:
+            pg_param_list = {
+                dp_process_group: list(model.parameters()),
+            }
+        else:
+            pg_param_list = {
+                dp_process_group: list(filter(lambda p: not is_moe_tensor(p), model.parameters())),
+                moe_dp_group: list(filter(is_moe_tensor, model.parameters())),
+            }

-        if len(pg_param_list[dp_process_group]) == 0 or len(pg_param_list[moe_dp_group]) == 0:
-            raise ValueError("No parameters found in dp_process_group or moe_dp_group")
+        if len(pg_param_list[moe_dp_group]) == 0:
+            raise ValueError("No parameters found in moe_dp_group, please consider using HybridParallelPlugin instead")

        super().__init__(
            model=model,
@@ -407,6 +412,13 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
                and self.enable_sequence_parallelism
                and self.sequence_parallelism_mode == "all_to_all"
            )
+
+            # sync gradients across DP * SP ranks
+            if self.enable_sequence_parallelism and self.sequence_parallelism_mode == "all_to_all":
+                dp_group = self.pg_mesh.create_group_along_axis([self.moe_dp_axis, self.ep_axis, self.sp_axis])
+            else:
+                dp_group = self.dp_group
+
            if use_ddp:
                self.logger.warning(
                    f"Will have to check all params are used in pytorch DDP since not all experts are always activated",
@@ -414,17 +426,11 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
                )
                self.ddp_config["find_unused_parameters"] = True

-                if dist.get_process_group_ranks(self.dp_group) != dist.get_process_group_ranks(self.moe_dp_group):
+                if dist.get_process_group_ranks(dp_group) != dist.get_process_group_ranks(self.moe_dp_group):
                    raise ValueError(
-                        f"if pytorch ddp is used, dp_group and moe_dp_group are expected to be the same since DDP can only reduce grad across a single group, but found dp_group {dist.get_process_group_ranks(self.dp_group)} and moe_dp_group {dist.get_process_group_ranks(self.moe_dp_group)}, you might want to use HybridParallelPlugin (i.e. set ep_size = 1) or set zero_stage > 0"
+                        f"if pytorch DDP is used, dp_group and moe_dp_group are expected to be the same since DDP can only reduce grad across a single group, but found dp_group {dist.get_process_group_ranks(dp_group)} and moe_dp_group {dist.get_process_group_ranks(self.moe_dp_group)}, you might want to modify your config to bypass DDP \nhint: check the above ddp condition to by pass this"
                    )

-            # sync gradients across DP * SP ranks
-            if self.enable_sequence_parallelism and self.sequence_parallelism_mode == "all_to_all":
-                dp_group = self.pg_mesh.create_group_along_axis([self.moe_dp_axis, self.ep_axis, self.sp_axis])
-            else:
-                dp_group = self.dp_group
-
            model = HybridParallelModule(
                module=model,
                precision=self.precision,
@@ -466,6 +472,7 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
                        tp_process_group=self.tp_group,
                    )
            else:
+                is_zero = True
                if self.dp_size <= 1:
                    self.logger.warning(
                        "Use Zero Optimizer when data parallel size is 1 may introduce unnecessary overhead. "
--- a/colossalai/moe/_operation.py
+++ b/colossalai/moe/_operation.py
@@ -308,7 +308,7 @@ class EPGradScalerIn(torch.autograd.Function):
        assert len(grad_outputs) == 1
        grad = grad_outputs[0]
        if ctx.ep_size != 1:
-            grad = grad * ctx.ep_size
+            grad.mul_(ctx.ep_size)
        return grad, None


@@ -328,7 +328,7 @@ class EPGradScalerOut(torch.autograd.Function):
        assert len(grad_outputs) == 1
        grad = grad_outputs[0]
        if ctx.ep_size != 1:
-            grad = grad / ctx.ep_size
+            grad.div_(ctx.ep_size)
        return grad, None


@@ -449,7 +449,4 @@ def all_to_all_uneven(
    overlap: bool = False,
    fp8_communication: bool = False,
 ):
-    assert (
-        inputs.requires_grad
-    ), "Input must require grad to assure that backward is executed, otherwise it might hang the program."
    return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communication)
--- a/colossalai/shardformer/modeling/deepseek.py
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple, Union

 import torch
 import torch.distributed as dist
-import torch.nn as nn
+import torch.functional as F
 from torch.distributed import ProcessGroup
 from torch.nn import CrossEntropyLoss
 from transformers.cache_utils import Cache, DynamicCache
@@ -28,11 +28,13 @@ from colossalai.quantization.fp8 import all_reduce_fp8
 from colossalai.shardformer.layer._operation import (
    all_to_all_comm,
    gather_forward_split_backward,
+    linear_with_async_comm,
    split_forward_gather_backward,
 )
-from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row, ParallelModule
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none
+from colossalai.tensor.d_tensor.api import shard_rowwise, sharded_tensor_to_existing_param
 from colossalai.tensor.moe_tensor.api import set_moe_tensor_ep_group


@@ -58,7 +60,7 @@ class AddAuxiliaryLoss(torch.autograd.Function):
        return grad_output, grad_loss


-class EPDeepseekMoE(nn.Module):
+class EPDeepseekMoE(ParallelModule):
    def __init__(self):
        raise RuntimeError(f"Please use `from_native_module` to create an instance of {self.__class__.__name__}")

@@ -214,6 +216,79 @@ class EPDeepseekMoE(nn.Module):
        return output_hidden_states


+class DeepseekMoEGate_Col(ParallelModule):
+    def parallel_linear(self, hidden_states):
+        assert (
+            hidden_states.shape[-1] == self.weight.shape[-1]
+        ), "Invalid shapes in Linear1D_Col forward: input={}, weight={}. Expected last dim of input {}.".format(
+            hidden_states.shape, self.weight.shape, self.weight.shape[-1]
+        )
+
+        output = linear_with_async_comm(
+            hidden_states, self.weight, None, self.process_group, True, fp8_communication=self.fp8_communication
+        )
+
+        # All-gather across the partitions.
+        output = gather_forward_split_backward(
+            output, dim=-1, process_group=self.process_group, fp8_communication=self.fp8_communication
+        )
+        return output
+
+    def forward(self, hidden_states):
+        bsz, seq_len, h = hidden_states.shape
+        ### compute gating score
+        hidden_states = hidden_states.view(-1, h)
+        logits = self.parallel_linear(hidden_states)
+        if self.scoring_func == "softmax":
+            scores = logits.softmax(dim=-1)
+        else:
+            raise NotImplementedError(f"insupportable scoring function for MoE gating: {self.scoring_func}")
+
+        ### select top-k experts
+        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
+
+        ### norm gate to sum 1
+        if self.top_k > 1 and self.norm_topk_prob:
+            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
+            topk_weight = topk_weight / denominator
+
+        ### expert-level computation auxiliary loss
+        if self.training and self.alpha > 0.0:
+            scores_for_aux = scores
+            aux_topk = self.top_k
+            # always compute aux loss based on the naive greedy topk method
+            topk_idx_for_aux_loss = topk_idx.view(bsz, -1)
+            if self.seq_aux:
+                scores_for_seq_aux = scores_for_aux.view(bsz, seq_len, -1)
+                ce = torch.zeros(bsz, self.n_routed_experts, device=hidden_states.device)
+                ce.scatter_add_(
+                    1, topk_idx_for_aux_loss, torch.ones(bsz, seq_len * aux_topk, device=hidden_states.device)
+                ).div_(seq_len * aux_topk / self.n_routed_experts)
+                aux_loss = (ce * scores_for_seq_aux.mean(dim=1)).sum(dim=1).mean() * self.alpha
+            else:
+                mask_ce = F.one_hot(topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts)
+                ce = mask_ce.float().mean(0)
+                Pi = scores_for_aux.mean(0)
+                fi = ce * self.n_routed_experts
+                aux_loss = (Pi * fi).sum() * self.alpha
+        else:
+            aux_loss = None
+
+        return topk_idx, topk_weight, aux_loss
+
+    @staticmethod
+    def from_native_module(
+        module, process_group: ProcessGroup, config, gather_output, fp8_communication
+    ) -> "DeepseekMoEGate_Col":
+        LazyInitContext.materialize(module)
+        module.process_group = process_group
+        module.fp8_communication = fp8_communication
+        sharded_weight = shard_rowwise(module.weight.data, process_group)
+        sharded_tensor_to_existing_param(sharded_weight, module.weight)
+        module.__class__ = DeepseekMoEGate_Col
+        return module
+
+
 class DeepseekPipelineForwards:
    """
    This class serves as a micro library for forward function substitution of Llama models
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -36,7 +36,7 @@ from colossalai.shardformer.layer._operation import (
    gather_forward_split_backward,
    split_forward_gather_backward,
 )
-from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row, ParallelModule
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none
 from colossalai.tensor.moe_tensor.api import set_moe_tensor_ep_group
@@ -49,7 +49,7 @@ if is_flash_attn_2_available():
    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)


-class EPMixtralSparseMoeBlock(MixtralSparseMoeBlock):
+class EPMixtralSparseMoeBlock(ParallelModule):
    def __init__(self, *args, **kwargs):
        raise RuntimeError(f"Please use `from_native_module` to create an instance of {self.__class__.__name__}")

--- a/colossalai/shardformer/policies/deepseek.py
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -10,6 +10,7 @@ from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
 from colossalai.shardformer.layer.embedding import PaddingEmbedding, VocabParallelEmbedding1D
 from colossalai.shardformer.layer.linear import Linear1D_Row
 from colossalai.shardformer.modeling.deepseek import (
+    DeepseekMoEGate_Col,
    DeepseekPipelineForwards,
    EPDeepseekMoE,
    get_deepseek_flash_attention_forward,
@@ -56,16 +57,24 @@ class DeepseekPolicy(Policy):
        sp_size = self.shard_config.sequence_parallel_size or None
        sp_group = self.shard_config.sequence_parallel_process_group or None
        sp_partial_derived = sp_mode in ["split_gather", "ring"]
+        tp_size = self.shard_config.tensor_parallel_size
+
+        # modified for both SP and TP
+        num_q_heads = self.model.config.num_attention_heads
+        num_kv_heads = getattr(self.model.config, "num_key_value_heads", None)
        if sp_mode == "all_to_all":
+            num_q_heads //= sp_size
            decoder_attribute_replacement = {
-                "num_heads": self.model.config.num_attention_heads // sp_size,
+                "num_heads": num_q_heads,
            }
            if getattr(self.model.config, "num_key_value_heads", False):
-                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
+                num_kv_heads //= sp_size
+                decoder_attribute_replacement["num_key_value_heads"] = num_kv_heads

            policy[attn_cls] = ModulePolicyDescription(
                attribute_replacement=decoder_attribute_replacement,
            )
+
        if self.shard_config.enable_sequence_parallelism:
            if self.pipeline_stage_manager is not None:
                # NOTE: we are replacing model forward for both sequence parallelism and pipeline parallelism
@@ -97,6 +106,7 @@ class DeepseekPolicy(Policy):
        else:
            if self.tie_weight:
                embedding_cls = PaddingEmbedding
+
        if self.shard_config.enable_tensor_parallelism:
            # tensor parallelism for non-moe params
            assert (
@@ -107,10 +117,15 @@ class DeepseekPolicy(Policy):
            ), f"The number of key_value heads must be divisible by tensor parallel size."
            decoder_attribute_replacement = {
                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
-                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
-                "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
-                // self.shard_config.tensor_parallel_size,
            }
+            num_q_heads //= tp_size
+            decoder_attribute_replacement = {
+                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": num_q_heads,
+            }
+            if num_kv_heads:
+                num_kv_heads //= tp_size
+                decoder_attribute_replacement["self_attn.num_key_value_heads"] = num_kv_heads

            policy["DeepseekDecoderLayer"] = ModulePolicyDescription(
                attribute_replacement=decoder_attribute_replacement,
@@ -135,8 +150,19 @@ class DeepseekPolicy(Policy):
                        target_module=Linear1D_Row,
                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.gate",
+                        target_module=DeepseekMoEGate_Col,
+                        kwargs={
+                            "gather_output": True,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                            "config": self.model.config,
+                        },
+                        ignore_if_not_exist=True,
+                    ),
                ],
            )
+
        if embedding_cls is not None:
            self.append_or_create_submodule_replacement(
                description=SubModuleReplacementDescription(
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -51,12 +51,20 @@ class MixtralPolicy(Policy):
        sp_size = self.shard_config.sequence_parallel_size or None
        sp_group = self.shard_config.sequence_parallel_process_group or None
        sp_partial_derived = sp_mode in ["split_gather", "ring"]
+        tp_size = self.shard_config.tensor_parallel_size
+
+        # modified for both SP and TP
+        num_q_heads = self.model.config.num_attention_heads
+        num_kv_heads = getattr(self.model.config, "num_key_value_heads", None)
+
        if sp_mode == "all_to_all":
+            num_q_heads //= sp_size
            decoder_attribute_replacement = {
-                "num_heads": self.model.config.num_attention_heads // sp_size,
+                "num_heads": num_q_heads,
            }
            if getattr(self.model.config, "num_key_value_heads", False):
-                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
+                num_kv_heads //= sp_size
+                decoder_attribute_replacement["num_key_value_heads"] = num_kv_heads

            policy[attn_cls] = ModulePolicyDescription(
                attribute_replacement=decoder_attribute_replacement,
@@ -101,12 +109,14 @@ class MixtralPolicy(Policy):
            assert (
                self.model.config.num_key_value_heads % self.shard_config.tensor_parallel_size == 0
            ), f"The number of key_value heads must be divisible by tensor parallel size."
+            num_q_heads //= tp_size
            decoder_attribute_replacement = {
                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
-                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
-                "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
-                // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": num_q_heads,
            }
+            if num_kv_heads:
+                num_kv_heads //= tp_size
+                decoder_attribute_replacement["self_attn.num_key_value_heads"] = num_kv_heads

            policy[MixtralDecoderLayer] = ModulePolicyDescription(
                attribute_replacement=decoder_attribute_replacement,
@@ -131,7 +141,7 @@ class MixtralPolicy(Policy):
                        target_module=Linear1D_Row,
                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
                    ),
-                    SubModuleReplacementDescription(  # or replicate?
+                    SubModuleReplacementDescription(
                        suffix="block_sparse_moe.gate",
                        target_module=Linear1D_Col,
                        kwargs={"gather_output": True, "fp8_communication": self.shard_config.fp8_communication},