[feature] support no master weights option for low level zero plugin (#4816)

* [feature] support no master weights for low level zero plugin

* [feature] support no master weights for low level zero plugin, remove data copy when no master weights

* remove data copy and typecasting when no master weights

* do not load weights to CPU when no master weights are used

* fix grad: use fp16 grad when no master weights

* only skip updating the working param when no master weights are used

* fix: only skip updating the working param when no master weights are used

* fix: passing params in dict format in hybrid plugin

* fix: remove extra params (tp_process_group) in hybrid_parallel_plugin
Author: Zhongkai Zhao
Date: 2023-10-13 15:57:45 +08:00
Committed by: GitHub
Commit: a0684e7bd6 (parent 77a9328304)
3 changed files with 42 additions and 28 deletions
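
For context, a minimal usage sketch of the option this PR adds. It assumes LowLevelZeroPlugin exposes a master_weights flag (the feature introduced here) and follows the standard Booster workflow; exact argument names and defaults may differ from the shipped API.

    import torch
    import colossalai
    from colossalai.booster import Booster
    from colossalai.booster.plugin import LowLevelZeroPlugin

    # Initialize the distributed environment (torchrun-style launch).
    colossalai.launch_from_torch(config={})

    model = torch.nn.Linear(1024, 1024).cuda()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

    # master_weights=False (assumed flag added by this PR) keeps optimizer states
    # in the working fp16 precision, skipping the fp32 master copy and the
    # copy-back into the working params after each step.
    plugin = LowLevelZeroPlugin(stage=1, precision="fp16", master_weights=False)
    booster = Booster(plugin=plugin)
    model, optimizer, *_ = booster.boost(model, optimizer)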

@@ -464,23 +464,23 @@ class HybridParallelZeroOptimizer(LowLevelZeroOptimizer):
         if use_pipeline:
             init_pipeline_optimizer(optimizer, model)
         super().__init__(
-            optimizer,
-            initial_scale,
-            min_scale,
-            growth_factor,
-            backoff_factor,
-            growth_interval,
-            hysteresis,
-            max_scale,
-            clip_grad_norm,
-            verbose,
-            reduce_bucket_size,
-            communication_dtype,
-            overlap_communication,
-            partition_grad,
-            cpu_offload,
-            dp_process_group,
-            forced_dtype,
+            optimizer=optimizer,
+            initial_scale=initial_scale,
+            min_scale=min_scale,
+            growth_factor=growth_factor,
+            backoff_factor=backoff_factor,
+            growth_interval=growth_interval,
+            hysteresis=hysteresis,
+            max_scale=max_scale,
+            clip_grad_norm=clip_grad_norm,
+            verbose=verbose,
+            reduce_bucket_size=reduce_bucket_size,
+            communication_dtype=communication_dtype,
+            overlap_communication=overlap_communication,
+            partition_grad=partition_grad,
+            cpu_offload=cpu_offload,
+            dp_process_group=dp_process_group,
+            forced_dtype=forced_dtype,
         )
 
     def _compute_grad_norm(self, gradients: List[Tensor], norm_type: int = 2) -> float:
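
To illustrate the update path the commit messages describe, a simplified, hypothetical sketch (not the actual ColossalAI source) of how a single parameter update can skip the fp32 master copy, the extra typecast, and the copy-back when master weights are disabled:

    from typing import Optional

    import torch

    def sgd_step(working_param: torch.Tensor,
                 grad: torch.Tensor,
                 master_param: Optional[torch.Tensor],
                 lr: float = 1e-3) -> None:
        """Plain SGD update for one parameter shard (illustrative only)."""
        if master_param is not None:
            # Master-weights path: accumulate the update in fp32, then cast the
            # result back into the low-precision working param (one extra copy
            # and typecast per step).
            master_param.data.add_(grad.float(), alpha=-lr)
            working_param.data.copy_(master_param.data.to(working_param.dtype))
        else:
            # No-master-weights path: update the fp16/bf16 working param in
            # place with the fp16 grad; no master copy, no typecast, no
            # copy-back.
            working_param.data.add_(grad, alpha=-lr)

In the no-master-weights branch the working param is the only copy of the weights, which is exactly why the commits above remove the data copy/typecast and use the fp16 grad directly.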