Mirror of https://github.com/hpcaitech/ColossalAI.git
[fix] fix mixtral policy;
parent f4d023ca6e
commit 292a504bea
@@ -268,9 +268,11 @@ class MixtralPolicy(Policy):
                 held_layers.append(module.embed_tokens)
             for start_idx, end_idx in stage_indices:
                 held_layers.extend(module.layers[start_idx:end_idx])
-            if stage_manager.use_zbv and stage_manager.is_first_stage(ignore_chunk=True):
-                held_layers.append(module.norm)
-            elif stage_manager.is_last_stage(ignore_chunk=True):
+            if (stage_manager.use_zbv and stage_manager.is_first_stage(ignore_chunk=True)) or (
+                stage_manager.is_last_stage(ignore_chunk=True)
+            ):
+                # for zbv, when is_first_stage (last fwd), we append norm
+                # for interleaved, when is_last_stage (last fwd), we also append norm
                 held_layers.append(module.norm)
         else:
             layers_per_stage = stage_manager.distribute_layers(len(module.layers))
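For context, the patched condition keeps module.norm on whichever stage runs the last forward pass: the first stage under the ZBV (zero-bubble) schedule, the last stage otherwise. Below is a minimal sketch of that predicate, using a hypothetical StageManagerStub in place of ColossalAI's real PipelineStageManager; only the attributes this condition touches are modeled.

# A minimal sketch of the merged predicate above. StageManagerStub is a
# hypothetical stand-in for ColossalAI's PipelineStageManager, reduced to
# the three pieces of state this condition reads.
from dataclasses import dataclass


@dataclass
class StageManagerStub:
    use_zbv: bool      # zero-bubble (ZBV) schedule enabled
    first_stage: bool  # this rank holds the first pipeline stage
    last_stage: bool   # this rank holds the last pipeline stage

    def is_first_stage(self, ignore_chunk: bool = False) -> bool:
        return self.first_stage

    def is_last_stage(self, ignore_chunk: bool = False) -> bool:
        return self.last_stage


def holds_norm(sm: StageManagerStub) -> bool:
    # Under ZBV the first stage runs the final forward chunk, so it holds
    # module.norm; under interleaved schedules the last stage does.
    return (sm.use_zbv and sm.is_first_stage(ignore_chunk=True)) or sm.is_last_stage(
        ignore_chunk=True
    )


# ZBV schedule: norm lives on the first stage (which runs the final forward).
assert holds_norm(StageManagerStub(use_zbv=True, first_stage=True, last_stage=False))
# Non-ZBV schedule: norm lives on the last stage, as usual.
assert holds_norm(StageManagerStub(use_zbv=False, first_stage=False, last_stage=True))
# A middle stage without ZBV never holds norm.
assert not holds_norm(StageManagerStub(use_zbv=False, first_stage=True, last_stage=False))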