[zero] adapt zero for unsharded parameters (#561)

* support existing sharded and unsharded parameters in zero

* add unit test for moe-zero model init

* polish moe gradient handler
This commit is contained in:
HELSON
2022-03-31 18:34:11 +08:00
committed by GitHub
parent 13ed4b6441
commit e6d50ec107
11 changed files with 211 additions and 70 deletions

View File

@@ -39,7 +39,7 @@ def colo_model_mem_usage(model: torch.nn.Module) -> Tuple[int, int]:
if t.device.type == 'cpu':
_cpu_mem_usage += t.numel() * t.element_size()
elif t.device.type == 'cuda':
- _cuda_mem_usages += t.numel() * t.element_size()
+ _cuda_mem_usage += t.numel() * t.element_size()
return _cuda_mem_usage, _cpu_mem_usage
cuda_mem_usage = 0