[hotfix] Add layer norm gradients all-reduce for sequence parallel (#4926)

* [hotfix] Add layer norm gradients all-reduce for sequence parallel. (#4915) * Add layer norm gradients all-reduce for sequence parallel. * skip pipeline inference test * [hotfix] fixing policies of sequence parallel (#4922) * Add layer norm gradients all-reduce for sequence parallel. * fix parameter passing when calling get_autopolicy --------- Co-authored-by: littsk <1214689160@qq.com> * Hotfix/add grad all reduce for sequence parallel (#4927) * Add layer norm gradients all-reduce for sequence parallel. * fix parameter passing when calling get_autopolicy * fix bug using wrong variables --------- Co-authored-by: littsk <1214689160@qq.com> * fix policy initialization * fix bloom and chatglm policies * polish code of handling layernorm * fix moe module * polish code of class initializing --------- Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
2025-09-26 12:14:02 +00:00 · 2023-11-03 13:32:43 +08:00
parent d99b2c961a
commit 1a3315e336
30 changed files with 1120 additions and 552 deletions
--- a/colossalai/shardformer/shard/sharder.py
+++ b/colossalai/shardformer/shard/sharder.py
@@ -27,8 +27,8 @@ class ModelSharder(object):

    def __init__(self, model: nn.Module, policy: Policy, shard_config: ShardConfig = None) -> None:
        self.model = model
-        self.policy = get_autopolicy(self.model, shard_config.inference_only) if policy is None else policy
        self.shard_config = shard_config
+        self.policy = get_autopolicy(self.model, shard_config) if policy is None else policy

    def shard(self) -> List[Dict[int, Tensor]]:
        r"""
@@ -196,7 +196,7 @@ class ModelSharder(object):

            try:
                replace_layer = target_module.from_native_module(
-                    native_sub_module, self.shard_config.tensor_parallel_process_group, **kwargs
+                    native_sub_module, process_group=self.shard_config.tensor_parallel_process_group, **kwargs
                )
            except Exception as e:
                raise RuntimeError(