[zero] improve adaptability for non-sharded parameters (#708)

* adapt post-grad hooks for non-sharded parameters
* adapt the optimizer for non-sharded parameters
* offload gradients for non-replicated parameters (see the hook sketch below)
HELSON
2022-04-11 13:38:51 +08:00
committed by GitHub
parent ab8c6b4a0e
commit a9b8300d54
9 changed files with 114 additions and 111 deletions
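The three bullets in the commit message describe how the ZeRO gradient hooks and optimizer now branch on two per-parameter flags: whether the parameter is sharded and whether it is replicated. The snippet below is a minimal, hypothetical sketch of that branching, not the actual ColossalAI implementation; only `colo_attr.param_is_sharded` and `colo_attr.is_replicated` come from this commit's diff, and every other name (the hook function, the reduction step, the offload target) is an assumption.

```python
import torch
import torch.distributed as dist


def post_backward_hook(param: torch.nn.Parameter) -> None:
    # Minimal sketch of the branching described in the commit message.
    # Only `colo_attr.param_is_sharded` and `colo_attr.is_replicated` appear
    # in this commit's diff; the rest of this hook is an assumption and is
    # not the real ColossalAI implementation.
    if param.grad is None:
        return

    if param.colo_attr.param_is_sharded:
        # Sharded (replicated) parameters: average the gradient across the
        # data-parallel ranks before the shard strategy consumes it.
        dist.all_reduce(param.grad.data)
        param.grad.data /= dist.get_world_size()
    elif not param.colo_attr.is_replicated:
        # Non-sharded, non-replicated parameters (e.g. MoE expert weights in
        # the test below): keep the full gradient on the parameter but
        # offload it to CPU to free device memory.
        param.grad.data = param.grad.data.to("cpu", non_blocking=True)
    # Non-sharded but replicated parameters fall through unchanged here.
```

Given that the changed test builds a `MoeModel`, the non-sharded, non-replicated branch most likely covers expert parameters, whose gradients are not reduced across the data-parallel group and can therefore be offloaded directly.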


@@ -36,7 +36,7 @@ def run_model_test(enable_autocast, shard_strategy_class):
     # check whether parameters are identical in ddp
     for name, p in zero_model.named_parameters():
-        if not p.colo_attr.param_is_sharded and p.is_replicated:
+        if not p.colo_attr.param_is_sharded and p.colo_attr.is_replicated:
             assert_equal_in_group(p.colo_attr.sharded_data_tensor.payload)
     model = MoeModel().half()
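The hunk above moves the replication check onto `colo_attr`, so only parameters that are both unsharded and flagged as replicated are expected to stay bit-identical across the data-parallel ranks. As a rough illustration of that check (the real ColossalAI test utility may be implemented differently), a helper like `assert_equal_in_group` could gather the payload from every rank and compare the copies:

```python
import torch
import torch.distributed as dist


def assert_equal_in_group(tensor: torch.Tensor, group=None) -> None:
    # Sketch of what a helper like `assert_equal_in_group` could do; the
    # actual ColossalAI test utility may differ. It gathers the tensor from
    # every rank in the (data-parallel) group and checks the copies match.
    world_size = dist.get_world_size(group)
    gathered = [torch.empty_like(tensor) for _ in range(world_size)]
    dist.all_gather(gathered, tensor, group=group)
    for other in gathered[1:]:
        assert torch.equal(gathered[0], other), "replicated parameter diverged across ranks"
```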