Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-26 12:14:02 +00:00
[hotfix] Add layer norm gradients all-reduce for sequence parallel (#4926)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel. (#4915)
* Add layer norm gradients all-reduce for sequence parallel.
* skip pipeline inference test
* [hotfix] fixing policies of sequence parallel (#4922)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy

---------

Co-authored-by: littsk <1214689160@qq.com>

* Hotfix/add grad all reduce for sequence parallel (#4927)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
* fix bug using wrong variables

---------

Co-authored-by: littsk <1214689160@qq.com>

* fix policy initialization
* fix bloom and chatglm policies
* polish code of handling layernorm
* fix moe module
* polish code of class initializing

---------

Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
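For context, here is a minimal sketch of what the headline change amounts to, assuming a standard PyTorch distributed setup. The helper name and the sp_group handle are illustrative and are not taken from this commit. When sequence parallelism splits activations along the sequence dimension, LayerNorm weights and biases stay replicated on every rank, so their gradients have to be summed across the sequence-parallel (tensor-parallel) process group after the backward pass.

# A minimal sketch, not the code in this commit: sum replicated LayerNorm
# parameter gradients over the sequence-parallel process group after backward().
import torch.distributed as dist
import torch.nn as nn

def allreduce_layernorm_grads(model: nn.Module, sp_group: dist.ProcessGroup) -> None:
    # The helper name and the sp_group argument are illustrative; the commit
    # integrates this behaviour elsewhere in the shardformer code.
    for module in model.modules():
        if isinstance(module, nn.LayerNorm):
            for param in module.parameters():
                if param.grad is not None:
                    dist.all_reduce(param.grad, op=dist.ReduceOp.SUM, group=sp_group)

The hunks shown below are only the policy-resolution part of the fix; the gradient all-reduce itself lives elsewhere in the commit.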
@@ -27,8 +27,8 @@ class ModelSharder(object):
     def __init__(self, model: nn.Module, policy: Policy, shard_config: ShardConfig = None) -> None:
         self.model = model
-        self.policy = get_autopolicy(self.model, shard_config.inference_only) if policy is None else policy
         self.shard_config = shard_config
+        self.policy = get_autopolicy(self.model, shard_config) if policy is None else policy
 
     def shard(self) -> List[Dict[int, Tensor]]:
         r"""
@@ -196,7 +196,7 @@ class ModelSharder(object):
 
         try:
             replace_layer = target_module.from_native_module(
-                native_sub_module, self.shard_config.tensor_parallel_process_group, **kwargs
+                native_sub_module, process_group=self.shard_config.tensor_parallel_process_group, **kwargs
             )
         except Exception as e:
             raise RuntimeError(
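Two things change in the hunks above: get_autopolicy now receives the whole shard_config instead of only shard_config.inference_only (and the policy is resolved after self.shard_config is assigned), and the process group is forwarded to from_native_module by keyword. Below is a self-contained toy sketch of why keyword passing is the more robust call style; the class and parameter names are hypothetical and are not ColossalAI code.

import torch.nn as nn

class ToyParallelModule(nn.Module):
    # Hypothetical stand-in for a shardformer target module.
    @classmethod
    def from_native_module(cls, module: nn.Module, process_group=None, **kwargs) -> "ToyParallelModule":
        # Binding process_group by keyword keeps the call site explicit and
        # immune to any later reordering of positional parameters.
        print(f"process_group={process_group}, extra options={kwargs}")
        return cls()

# Positional style (the old call): relies on process_group staying in slot two.
ToyParallelModule.from_native_module(nn.Linear(4, 4), None, gather_output=True)
# Keyword style (the new call): explicit and order-independent.
ToyParallelModule.from_native_module(nn.Linear(4, 4), process_group=None, gather_output=True)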