[hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)

* [example] pass use_fp8_comm flag to all plugins (see the plugin sketch after this list)

* [example] add mixtral benchmark

* [moe] refine assertion and check

* [moe] fix mixtral & add more tests

* [moe] consider checking dp * sp group and moe_dp_group

* [mixtral] remove gate tp & add more tests

* [deepseek] fix tp & sp for deepseek

* [mixtral] minor fix

* [deepseek] add deepseek benchmark
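
As a rough illustration of the first two items, the benchmark builds the model through a MoE-aware hybrid-parallel plugin and forwards the fp8-communication switch to it. The sketch below is not the benchmark script from this PR; the parameter names (`fp8_communication`, `ep_size`, `zero_stage`) are assumptions based on the commit description and the ColossalAI booster API.

# Minimal sketch, assuming MoeHybridParallelPlugin exposes an fp8 switch;
# the parameter names below are assumptions, not necessarily the exact API.
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import MoeHybridParallelPlugin

colossalai.launch_from_torch()

plugin = MoeHybridParallelPlugin(
    tp_size=1,               # tensor parallel size
    pp_size=1,               # pipeline parallel size
    ep_size=2,               # expert parallel size for the MoE experts
    zero_stage=1,
    fp8_communication=True,  # assumed plugin-side name of the use_fp8_comm flag
)
booster = Booster(plugin=plugin)
# model, optimizer, *_ = booster.boost(model, optimizer)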
Author: botbw
Date: 2024-09-10 17:30:53 +08:00
Committed by: GitHub
Parent: 8fd25d6e09
Commit: c54c4fcd15
21 changed files with 907 additions and 99 deletions

@@ -36,7 +36,7 @@ from colossalai.shardformer.layer._operation import (
     gather_forward_split_backward,
     split_forward_gather_backward,
 )
-from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row, ParallelModule
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none
 from colossalai.tensor.moe_tensor.api import set_moe_tensor_ep_group
@@ -49,7 +49,7 @@ if is_flash_attn_2_available():
     _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
 
-class EPMixtralSparseMoeBlock(MixtralSparseMoeBlock):
+class EPMixtralSparseMoeBlock(ParallelModule):
     def __init__(self, *args, **kwargs):
         raise RuntimeError(f"Please use `from_native_module` to create an instance of {self.__class__.__name__}")
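
With `ParallelModule` as the base class, the EP block is never constructed directly; instead, an existing dense `MixtralSparseMoeBlock` is converted in place via `from_native_module`. A minimal sketch of that conversion pattern follows; the import path and the keyword argument of `from_native_module` are assumptions for illustration and may differ from the actual signature in this PR.

# Sketch of the conversion pattern enabled above; the import path and the
# keyword arguments of from_native_module are assumptions for illustration.
import torch.distributed as dist
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock

from colossalai.shardformer.modeling.mixtral import EPMixtralSparseMoeBlock

def to_ep_block(block: MixtralSparseMoeBlock, ep_group: dist.ProcessGroup):
    # Calling EPMixtralSparseMoeBlock(...) directly raises RuntimeError (see the
    # __init__ guard above); the classmethod rewrites the dense block for expert
    # parallelism and returns it as a ParallelModule.
    return EPMixtralSparseMoeBlock.from_native_module(block, ep_group=ep_group)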