[fp8] fix missing fp8_comm flag in mixtral (#6057)

2025-09-12 20:54:35 +00:00 · 2024-09-13 14:30:05 +08:00
parent a35a078f08
commit 696fced0d7
2 changed files with 7 additions and 1 deletion
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -31,6 +31,7 @@ from colossalai.moe._operation import (
    all_to_all_uneven,
 )
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.quantization.fp8 import all_reduce_fp8
 from colossalai.shardformer.layer._operation import (
    all_to_all_comm,
    gather_forward_split_backward,
@@ -142,7 +143,11 @@ class EPMixtralSparseMoeBlock(ParallelModule):
            for i in range(1, self.ep_size):
                activate_experts += output_split_sizes[i * self.num_experts_per_ep : (i + 1) * self.num_experts_per_ep]
            activate_experts = (activate_experts > 0).float()
-        dist.all_reduce(activate_experts, group=self.moe_dp_group)
+
+        if self.fp8_communication:
+            all_reduce_fp8(activate_experts, group=self.moe_dp_group)
+        else:
+            dist.all_reduce(activate_experts, group=self.moe_dp_group)

        input_split_list = input_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
        output_split_list = output_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()