[hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)

* [example] pass use_fp8_comm flag to all plugins

* [example] add mixtral benchmark

* [moe] refine assertion and check

* [moe] fix mixtral & add more tests

* [moe] consider checking dp * sp group and moe_dp_group (see the group-size sketch after this list)

* [mixtral] remove gate tp & add more tests

* [deepseek] fix tp & sp for deepseek

* [mixtral] minor fix

* [deepseek] add deepseek benchmark
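
As a rough illustration of the dp * sp vs. moe_dp group check mentioned in the list above, the sketch below verifies that the dense data/sequence-parallel ranks and the MoE data/expert-parallel ranks partition the world identically. All names and the exact relation (moe_dp = dp * sp / ep) are assumptions made for illustration, not the plugin's actual attributes.

```python
# Hypothetical sanity check relating the dense dp/sp groups to the MoE
# moe_dp/ep groups; names and the relation are assumptions, not
# ColossalAI's actual attributes.
def check_moe_groups(world_size: int, tp_size: int, pp_size: int,
                     sp_size: int, ep_size: int) -> None:
    assert world_size % (tp_size * pp_size * sp_size) == 0, "invalid dense parallel sizes"
    dp_size = world_size // (tp_size * pp_size * sp_size)

    # Assumed relation: the dp x sp rank pool is re-partitioned into
    # moe_dp x ep groups for the expert layers, so ep must divide dp * sp.
    assert (dp_size * sp_size) % ep_size == 0, (
        f"ep_size={ep_size} must divide dp_size*sp_size={dp_size * sp_size}"
    )
    moe_dp_size = dp_size * sp_size // ep_size

    # Both factorizations must cover the same number of ranks.
    assert dp_size * sp_size == moe_dp_size * ep_size
    print(f"dp={dp_size}, sp={sp_size}, moe_dp={moe_dp_size}, ep={ep_size}")


# Example: 16 GPUs with tp=2, pp=1, sp=2 -> dp=4; ep=4 gives moe_dp=2.
check_moe_groups(world_size=16, tp_size=2, pp_size=1, sp_size=2, ep_size=4)
```

If the two products disagreed, the MoE all-to-all and the dense gradient reductions would be built over mismatched rank sets, which is presumably the kind of misconfiguration the added checks guard against.
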
Author: botbw
Date: 2024-09-10 17:30:53 +08:00
Committed by: GitHub
Parent: 8fd25d6e09
Commit: c54c4fcd15
21 changed files with 907 additions and 99 deletions

@@ -308,7 +308,7 @@ class EPGradScalerIn(torch.autograd.Function):
         assert len(grad_outputs) == 1
         grad = grad_outputs[0]
         if ctx.ep_size != 1:
-            grad = grad * ctx.ep_size
+            grad.mul_(ctx.ep_size)
         return grad, None
@@ -328,7 +328,7 @@ class EPGradScalerOut(torch.autograd.Function):
         assert len(grad_outputs) == 1
         grad = grad_outputs[0]
         if ctx.ep_size != 1:
-            grad = grad / ctx.ep_size
+            grad.div_(ctx.ep_size)
         return grad, None
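
Both hunks above replace an out-of-place multiply/divide with in-place `mul_` / `div_` on the incoming gradient, avoiding an extra gradient-sized allocation while keeping the same scaling: the "in" scaler multiplies the gradient by the expert-parallel size and the "out" scaler divides by it. A minimal, self-contained sketch of this pattern follows; it is not the actual ColossalAI classes, and the forward parts are assumptions since only the backwards appear in the diff.

```python
import torch
from torch import Tensor


class GradScaleIn(torch.autograd.Function):
    # Identity in forward; multiplies the gradient by ep_size in backward.
    # The forward here is assumed -- only the backward appears in the diff.
    @staticmethod
    def forward(ctx, inputs: Tensor, ep_size: int) -> Tensor:
        ctx.ep_size = ep_size
        return inputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        assert len(grad_outputs) == 1
        grad = grad_outputs[0]
        if ctx.ep_size != 1:
            grad.mul_(ctx.ep_size)  # in-place, as in the fix above
        return grad, None


class GradScaleOut(torch.autograd.Function):
    # Identity in forward; divides the gradient by ep_size in backward.
    @staticmethod
    def forward(ctx, inputs: Tensor, ep_size: int) -> Tensor:
        ctx.ep_size = ep_size
        return inputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        assert len(grad_outputs) == 1
        grad = grad_outputs[0]
        if ctx.ep_size != 1:
            grad.div_(ctx.ep_size)  # in-place, as in the fix above
        return grad, None


if __name__ == "__main__":
    ep_size = 4
    x = torch.randn(2, 8, requires_grad=True)
    y = GradScaleOut.apply(GradScaleIn.apply(x, ep_size), ep_size)
    # The two scalings cancel, so x.grad equals the supplied gradient.
    y.backward(torch.ones_like(y))
    assert torch.allclose(x.grad, torch.ones_like(x))
```
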
@@ -449,7 +449,4 @@ def all_to_all_uneven(
     overlap: bool = False,
     fp8_communication: bool = False,
 ):
-    assert (
-        inputs.requires_grad
-    ), "Input must require grad to assure that backward is executed, otherwise it might hang the program."
     return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communication)
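
With the assertion removed, `all_to_all_uneven` no longer insists that its input requires grad, so the same dispatch path can also be used when gradients are not needed (e.g. evaluation). Below is a hedged usage sketch; the module path in the import is an assumption, process-group setup is assumed to happen elsewhere, and only the parameter names visible in the hunk above are relied on.

```python
import torch
import torch.distributed as dist
from typing import List

from colossalai.moe._operation import all_to_all_uneven  # assumed module path


def dispatch_tokens(hidden_states: torch.Tensor,
                    input_split_sizes: List[int],
                    output_split_sizes: List[int],
                    ep_group: dist.ProcessGroup) -> torch.Tensor:
    # Each rank sends input_split_sizes[i] rows to rank i and receives
    # output_split_sizes[i] rows from rank i; splits may differ per rank,
    # which is why the even all_to_all collective is not enough here.
    return all_to_all_uneven(
        hidden_states,
        input_split_sizes,
        output_split_sizes,
        group=ep_group,
        fp8_communication=False,
    )
```
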