mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-25 11:44:03 +00:00)
[nfc] fix typo colossalai/zero (#3923)
@@ -46,7 +46,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
     """A context to initialize model.

     1. Convert the model to fp16.
-    2. The paramaters of the module are adapted to type ShardedParameter.
+    2. The parameters of the module are adapted to type ShardedParameter.
     3. Shard the param and grad according to flags.

     Args:
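The docstring touched by this hunk describes the three steps ZeroInitContext performs when a model is built inside it. A minimal usage sketch follows, assuming the legacy colossalai.zero API from around the time of this commit; the import paths and TensorShardStrategy are recalled from memory, not confirmed by the diff, so verify them against your ColossalAI version.

import torch
import torch.nn as nn

# Assumed import paths for the legacy ZeRO API.
from colossalai.zero.init_ctx import ZeroInitContext
from colossalai.zero.shard_utils import TensorShardStrategy

# Parameters created inside the context are converted to fp16, wrapped as
# ShardedParameter, and sharded across ranks according to the flags.
with ZeroInitContext(target_device=torch.device('cuda'),
                     shard_strategy=TensorShardStrategy(),
                     shard_param=True):
    model = nn.Linear(1024, 1024)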
@@ -69,7 +69,7 @@ class ShardedModelV2(nn.Module):
             If it's 'auto', they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
             Note that 'auto' policy can only work well when no other processes use CUDA during your training.
             Defaults to 'cuda'.
-        gradient_predivide_factor (Optional[float], optional): Gradient is divived by this value before reduce-scatter. Defaults to 1.0.
+        gradient_predivide_factor (Optional[float], optional): Gradient is divided by this value before reduce-scatter. Defaults to 1.0.
         reuse_fp16_shard (bool, optional): Whether to reuse fp16 shard for param and grad.
             Enabling this can reduce GPU memory usage, but you have to make sure you disable it when using gradient accumulation.
             In this mode, grad will be fp16. Make sure your optimizer supports mixed precision (fp32 param and fp16 grad).
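The arguments documented in this hunk belong to the ShardedModelV2 constructor. A hedged construction sketch, continuing from the previous sketch's model; the import paths and the positional shard-strategy argument are assumptions, while the keyword names come straight from the docstring above.

from colossalai.zero.sharded_model import ShardedModelV2      # assumed import path
from colossalai.zero.shard_utils import TensorShardStrategy   # assumed import path

# 'model' is the module built under ZeroInitContext in the previous sketch.
sharded_model = ShardedModelV2(model,
                               TensorShardStrategy(),
                               tensor_placement_policy='auto',  # shards move between CPU and CUDA dynamically
                               gradient_predivide_factor=1.0,   # grad is divided by this before reduce-scatter
                               reuse_fp16_shard=False)          # keep False when using gradient accumulation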
@@ -205,7 +205,7 @@ class ShardedModelV2(nn.Module):
                 exit(0)
         """
         if self._use_memory_tracer:
-            self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
+            self.logger.error(f'dump memory tracer collected information to a {filename}', ranks=[0])
             if gpc.get_global_rank() == 0:
                 with open(filename, 'w+') as f:
                     f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
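This hunk sits inside the routine that writes the memory tracer's statistics to a file; the docstring fragment ending in exit(0) suggests it is meant to be called once for profiling and then abandoned. A hedged call sketch follows: the dump_memory_stats name and the use_memory_tracer constructor flag are assumptions inferred from the surrounding code, not shown in the diff.

# Assumes sharded_model was built with the memory tracer enabled and that one
# forward/backward pass has been run so the tracer has something to report.
data = torch.randn(8, 1024, dtype=torch.half, device='cuda')
loss = sharded_model(data).sum()
loss.backward()
sharded_model.dump_memory_stats(filename='memory_trace.log')   # method name assumed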
@@ -385,7 +385,7 @@ class ShardedModelV2(nn.Module):
                 # make parameters point to gradient

                 assert param.colo_attr.saved_grad.is_null(
-                ), 'Gradien accumulation is not supported when reuse_fp16_shard=True'
+                ), 'Gradient accumulation is not supported when reuse_fp16_shard=True'

                 param.colo_attr.grad_payload_reset(grad.data)
                 # release the memory of param
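The assert in this hunk enforces the restriction documented earlier: with reuse_fp16_shard the gradient is written into the storage that held the fp16 parameter shard, so a gradient saved from an earlier micro-batch would be overwritten rather than accumulated. A purely hypothetical illustration of that aliasing, not ColossalAI code:

import torch

# If the grad buffer aliases the fp16 param shard, each new gradient clobbers
# whatever was stored there before, so accumulation cannot work.
fp16_shard = torch.zeros(4, dtype=torch.float16)
grad = fp16_shard                                    # param storage reused as grad buffer
grad.copy_(torch.ones(4, dtype=torch.float16))       # gradient of micro-batch 1
grad.copy_(2 * torch.ones(4, dtype=torch.float16))   # micro-batch 2 replaces it; nothing accumulated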