Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-28 04:55:25 +00:00)
[hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
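The "dp * sp group and moe_dp_group" check refers to how the MoE data-parallel group must tile the combined data- and sequence-parallel ranks once expert parallelism is carved out. A minimal sketch of that bookkeeping, assuming the usual decomposition world = tp * pp * (dp * sp) with moe_dp = dp * sp // ep; the function name and exact formula are illustrative assumptions, not the plugin's API:

    def moe_dp_degree(world_size: int, tp: int, pp: int, ep: int) -> int:
        """Illustrative MoE group-size check (assumed layout, not ColossalAI's API).

        Assumes ranks decompose as world = tp * pp * (dp * sp), with expert
        parallelism (ep) carved out of the combined dp * sp ranks, leaving
        moe_dp = dp * sp // ep replicas of each expert.
        """
        assert world_size % (tp * pp) == 0, "world size must be divisible by tp * pp"
        dp_sp = world_size // (tp * pp)  # combined dp * sp ranks
        assert dp_sp % ep == 0, "dp * sp must be divisible by ep"
        return dp_sp // ep

    # e.g. 16 GPUs with tp=2, pp=1, ep=4: dp * sp = 8, so moe_dp = 2
    print(moe_dp_degree(16, tp=2, pp=1, ep=4))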
@@ -51,12 +51,20 @@ class MixtralPolicy(Policy):
         sp_size = self.shard_config.sequence_parallel_size or None
         sp_group = self.shard_config.sequence_parallel_process_group or None
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
+        tp_size = self.shard_config.tensor_parallel_size
+
+        # modified for both SP and TP
+        num_q_heads = self.model.config.num_attention_heads
+        num_kv_heads = getattr(self.model.config, "num_key_value_heads", None)

         if sp_mode == "all_to_all":
+            num_q_heads //= sp_size
             decoder_attribute_replacement = {
-                "num_heads": self.model.config.num_attention_heads // sp_size,
+                "num_heads": num_q_heads,
             }
             if getattr(self.model.config, "num_key_value_heads", False):
-                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
+                num_kv_heads //= sp_size
+                decoder_attribute_replacement["num_key_value_heads"] = num_kv_heads

             policy[attn_cls] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
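In the all_to_all branch, sequence parallelism redistributes attention heads across the SP group, so the per-rank counts shrink by sp_size; the refactor tracks this in the shared num_q_heads / num_kv_heads counters instead of recomputing from the config. A quick check of the arithmetic, using Mixtral-8x7B's head counts (32 query heads, 8 KV heads) as example inputs:

    num_q_heads, num_kv_heads, sp_size = 32, 8, 4   # Mixtral-8x7B heads, SP group of 4
    num_q_heads //= sp_size                         # 8 query heads per rank
    num_kv_heads //= sp_size                        # 2 KV heads per rank
    print(num_q_heads, num_kv_heads)                # -> 8 2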
@@ -101,12 +109,14 @@ class MixtralPolicy(Policy):
             assert (
                 self.model.config.num_key_value_heads % self.shard_config.tensor_parallel_size == 0
             ), f"The number of key_value heads must be divisible by tensor parallel size."
+            num_q_heads //= tp_size
             decoder_attribute_replacement = {
                 "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
-                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
-                "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
-                // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": num_q_heads,
             }
+            if num_kv_heads:
+                num_kv_heads //= tp_size
+                decoder_attribute_replacement["self_attn.num_key_value_heads"] = num_kv_heads

             policy[MixtralDecoderLayer] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
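Taken together, the two hunks make the head counters compose: when all_to_all SP and TP are both active, num_q_heads is divided by sp_size first and tp_size second, and the KV-head replacement is now skipped for configs without grouped-query attention instead of failing on a missing attribute. A self-contained sketch of that bookkeeping (the function and its guards are illustrative, not the policy's API):

    def shard_head_counts(num_q_heads, num_kv_heads=None, sp_size=None, tp_size=1):
        """Illustrative mirror of the policy's head bookkeeping.

        Each active parallel mode divides the running counters once, so SP
        and TP compose: a rank ends up with heads // (sp_size * tp_size).
        """
        if sp_size:  # all_to_all SP splits heads across the SP group
            num_q_heads //= sp_size
            if num_kv_heads:
                num_kv_heads //= sp_size
        if tp_size > 1:  # TP splits the remaining heads across the TP group
            assert num_q_heads % tp_size == 0, "q heads must divide by tp size"
            num_q_heads //= tp_size
            if num_kv_heads:
                assert num_kv_heads % tp_size == 0, "kv heads must divide by tp size"
                num_kv_heads //= tp_size
        return num_q_heads, num_kv_heads

    # Mixtral-8x7B (32 q heads, 8 kv heads), sp=2, tp=2 -> 8 q, 2 kv per rank
    print(shard_head_counts(32, 8, sp_size=2, tp_size=2))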
@@ -131,7 +141,7 @@ class MixtralPolicy(Policy):
                     target_module=Linear1D_Row,
                     kwargs={"fp8_communication": self.shard_config.fp8_communication},
                 ),
-                SubModuleReplacementDescription(  # or replicate?
+                SubModuleReplacementDescription(
                     suffix="block_sparse_moe.gate",
                     target_module=Linear1D_Col,
                     kwargs={"gather_output": True, "fp8_communication": self.shard_config.fp8_communication},
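The final hunk only removes the lingering "# or replicate?" question: the router gate stays sharded as Linear1D_Col with gather_output=True, so the expert logits are gathered back and every rank routes against the full set of experts. A single-process sketch of why that gather matters, simulating column-parallel shards with plain tensors (names and shapes are illustrative):

    import torch

    torch.manual_seed(0)
    hidden, num_experts, tp = 16, 8, 2
    x = torch.randn(4, hidden)            # a small batch of token states
    w = torch.randn(hidden, num_experts)  # full router gate weight

    # Column parallelism: each "rank" holds num_experts // tp output columns.
    shards = w.chunk(tp, dim=1)
    partials = [x @ s for s in shards]    # per-rank partial logits

    # gather_output=True: concatenate so every rank sees all expert logits.
    logits = torch.cat(partials, dim=1)
    assert torch.allclose(logits, x @ w)

    # Top-k routing needs the full logit row; a lone shard could only pick
    # experts from its local columns.
    print(logits.topk(2, dim=1).indices)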