Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-25 11:44:03 +00:00)
[doc] Fix typo under colossalai and doc (#3618)

* Fixed several spelling errors under colossalai
* Fix the spelling errors in the colossalai and docs directories
* Cautiously changed the spelling errors under the example folder
* Update runtime_preparation_pass.py: revert autograft to autograd
* Update search_chunk.py: utile to until
* Update check_installation.py: change misteach to mismatch in line 91
* Update 1D_tensor_parallel.md: revert to perceptron
* Update 2D_tensor_parallel.md: revert to perceptron in line 73
* Update 2p5D_tensor_parallel.md: revert to perceptron in line 71
* Update 3D_tensor_parallel.md: revert to perceptron in line 80
* Update README.md: revert to resnet in line 42
* Update reorder_graph.py: revert to indice in line 7
* Update p2p.py: revert to megatron in line 94
* Update initialize.py: revert to torchrun in line 198
* Update routers.py: change to detailed in line 63
* Update routers.py: change to detailed in line 146
* Update README.md: revert random number in line 402
@@ -8,7 +8,7 @@ from . import BaseOpHook
 @OPHOOKS.register_module
 class ShardGradMemTracerHook(BaseOpHook):
     """
-    A hook to process sharded param before and afther FWD and BWD operator executing.
+    A hook to process sharded param before and after FWD and BWD operator executing.
     """

     def __init__(self):
@@ -8,7 +8,7 @@ from . import BaseOpHook
 @OPHOOKS.register_module
 class ShardParamHook(BaseOpHook):
     """
-    A hook to process sharded param before and afther FWD and BWD operator executing.
+    A hook to process sharded param before and after FWD and BWD operator executing.
     """

     def __init__(self):
@@ -53,7 +53,7 @@ class StatefulTensorMgr(object):
         self._evict_time = 0

     def adjust_layout(self) -> None:
-        """ Adjust the layout of statefuil tensor according to the information provided
+        """ Adjust the layout of stateful tensor according to the information provided
         by mem_stats_collector, which should belongs to a Sharded Model.
         """
         # find stateful tensor in state COMPUTE
@@ -97,7 +97,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
            """We use this function to substitute fan-in and fan-out calculation in torch.nn.init.
            This can help us get correct fan-in and fan-out for sharded tensor.
            """
-            assert isinstance(tensor, nn.Parameter), "Sharded tensor initilization is only allowed for paramters"
+            assert isinstance(tensor, nn.Parameter), "Sharded tensor initialization is only allowed for parameters"

            # get correct shape of input tensor
            if not hasattr(tensor, 'colo_attr') or not tensor.colo_attr.param_is_sharded:
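The docstring in this hunk explains why fan-in and fan-out must come from the parameter's original shape rather than its sharded payload. A minimal sketch of that idea, using PyTorch's private _calculate_fan_in_and_fan_out helper and a hypothetical helper function that is not ColossalAI's actual implementation:

import torch
import torch.nn as nn

def fan_from_original_shape(original_shape: torch.Size):
    # A sharded parameter is usually stored flattened, so torch.nn.init would see
    # the wrong dimensions; compute fan-in/fan-out from the pre-shard shape instead.
    dummy = torch.empty(original_shape)
    return nn.init._calculate_fan_in_and_fan_out(dummy)

shard = nn.Parameter(torch.empty(1024))                  # stand-in for one flattened shard of a (32, 64) weight
fan_in, fan_out = fan_from_original_shape(torch.Size((32, 64)))
print(fan_in, fan_out)                                   # 64 32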
@@ -14,7 +14,7 @@ class BucketTensorShardStrategy(TensorShardStrategy):
     """Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together,
     which will fully utilize network bandwidth.
     It is especially useful when sub-module contains bias,
-    since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usaully small).
+    since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usually small).
     """

     def gather(self, tensor_list: List[ShardedTensor], process_group: Optional[dist.ProcessGroup] = None):
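The docstring describes bucketing small tensors (such as biases) so that one collective moves them all at once. A rough sketch of that idea with plain torch.distributed, assuming an initialized process group and equally sized shards on every rank; this is not the actual BucketTensorShardStrategy code:

import torch
import torch.distributed as dist

def gather_bucket(shards, group=None):
    # Pack many small shards into one flat buffer so a single all_gather
    # uses the network far better than one collective per tiny tensor.
    flat = torch.cat([s.reshape(-1) for s in shards])
    world_size = dist.get_world_size(group)
    gathered = [torch.empty_like(flat) for _ in range(world_size)]
    dist.all_gather(gathered, flat, group=group)    # one collective for the whole bucket
    # The caller then splits each gathered buffer back into per-tensor pieces.
    return gathered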
@@ -192,7 +192,7 @@ class ShardedModelV2(nn.Module):

     def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
         """
-        dummy memory tracer collected infomation to a file.
+        dummy memory tracer collected information to a file.
         try:
             # forward: model(inputs)
             # backward: optimizer.backward()
@@ -201,7 +201,7 @@ class ShardedModelV2(nn.Module):
             exit(0)
         """
         if self._use_memory_tracer:
-            self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0])
+            self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
             if gpc.get_global_rank() == 0:
                 with open(filename, 'w+') as f:
                     f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
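The docstring above sketches how dump_memory_stats is meant to be used. Filled out, the pattern might look like the following hypothetical snippet, where model, inputs and optimizer are placeholders for a ShardedModelV2, a batch, and the optimizer wrapping it:

try:
    # forward and backward as usual
    loss = model(inputs).sum()
    optimizer.backward(loss)
    optimizer.step()
except RuntimeError:
    # on failure (e.g. CUDA OOM), dump what the memory tracer collected and stop
    model.dump_memory_stats(filename='dump_mem_stats.log')
    exit(0)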
@@ -293,7 +293,7 @@ class ShardedModelV2(nn.Module):
             if not p.requires_grad:
                 continue
             # Leave the gradient accumulation state (_require_backward_grad_sync) as-is if not synchronizing this pass.
-            # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient allreducing between process group.
+            # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient all reducing between process group.
             # If _require_backward_grad_sync is True,
             # p.grad remains the accumulated unsharded gradient from prior no-sync passes.
             # We also allows to interleave no-sync pass with sync passes, if desired.
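The NOTE above distinguishes no-sync passes (gradients only accumulate locally) from sync passes (gradients are all-reduced across the data-parallel group). A generic illustration of that gating in plain PyTorch, not ShardedModelV2's actual reduction path:

import torch
import torch.distributed as dist

def maybe_reduce_grads(params, group=None, require_sync=True):
    for p in params:
        if not p.requires_grad or p.grad is None:
            continue
        if not require_sync:
            # no-sync pass: keep accumulating into p.grad locally, skip communication
            continue
        # sync pass: average the accumulated gradient across the process group
        dist.all_reduce(p.grad, op=dist.ReduceOp.SUM, group=group)
        p.grad.div_(dist.get_world_size(group))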
@@ -385,7 +385,7 @@ class ShardedModelV2(nn.Module):
                 param.colo_attr.grad_payload_reset(grad.data)
                 # release the memory of param
                 # we set a false None for parameter's payload
-                # so we can get paramter's device and dtype later in optimizer
+                # so we can get parameter's device and dtype later in optimizer
                 param.colo_attr.data_payload_reset(torch.empty(0, device=grad.device, dtype=grad.dtype))

             if param.colo_attr.is_replicated:
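The comments above describe freeing a parameter's storage by swapping in a zero-element tensor while keeping device and dtype visible to the optimizer. A standalone sketch of that trick (hypothetical helper; the real code goes through colo_attr.data_payload_reset):

import torch

def release_param_memory(param: torch.nn.Parameter) -> None:
    # Replace the payload with an empty tensor: the old storage can be freed,
    # but param.device and param.dtype can still be queried later.
    param.data = torch.empty(0, device=param.device, dtype=param.dtype)

p = torch.nn.Parameter(torch.empty(1024, 1024, dtype=torch.float16))
release_param_memory(p)
print(p.numel(), p.dtype, p.device)    # 0 torch.float16 cpu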
@@ -67,8 +67,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
         growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
         hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
         max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
-        dp_process_group (Optional[ProcessGroup], optional): data paralle process group. Defaults to None.
-        mp_process_group (Optional[ProcessGroup], optional): model paralle process group. Defaults to None.
+        dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None.
+        mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None.

         .. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
             https://arxiv.org/abs/2108.05818
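The docstring lists the DynamicGradScaler knobs (growth_interval, hysteresis, max_scale). A tiny, purely illustrative loss-scale updater showing what such knobs typically control; it is not ColossalAI's DynamicGradScaler:

class TinyDynamicScaler:
    def __init__(self, init_scale=2.0**15, growth_factor=2.0, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, max_scale=2.0**32):
        self.scale = init_scale
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval    # overflow-free steps before the scale grows
        self.hysteresis = hysteresis              # consecutive overflows tolerated before backoff
        self.max_scale = max_scale
        self._good_steps = 0
        self._bad_steps = 0

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._bad_steps += 1
            if self._bad_steps >= self.hysteresis:
                self.scale = max(self.scale * self.backoff_factor, 1.0)
                self._bad_steps = 0
        else:
            self._bad_steps = 0
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)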
@@ -274,7 +274,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
                 assert hasattr(p, 'colo_attr'), 'The parameter must be wrapped with ShardedParam'
                 shard_flag = not p.colo_attr.sharded_data_tensor.is_sharded and p.colo_attr.is_replicated
                 if shard_flag:
-                    # we always shard replicated paramters
+                    # we always shard replicated parameters
                     self.shard_strategy.shard([p.colo_attr.sharded_data_tensor], self.dp_process_group)
                 self.master_params[p] = StatefulTensor(cast_tensor_to_fp32(p.colo_attr.data_payload.to(self.device)))
                 if shard_flag:
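The hunk above builds an fp32 master copy of each (possibly sharded) fp16 parameter. In generic mixed-precision terms, the bookkeeping looks roughly like this; it is a sketch, not ShardedOptimizerV2's internals:

import torch

def build_master_params(fp16_params):
    # The optimizer steps on the fp32 copies; the fp16 payloads are what the model computes with.
    return {p: p.detach().clone().float() for p in fp16_params}

w = torch.nn.Parameter(torch.empty(4, 4, dtype=torch.float16))
masters = build_master_params([w])
print(masters[w].dtype)    # torch.float32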
@@ -312,7 +312,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
                 # If reuse_fp16_shard, grad fp16 which wasn't be offloaded may be evicted to CPU
                 if not p.colo_attr.offload_grad:
                     colo_model_data_tensor_move_inline(p.colo_attr.saved_grad, torch.cuda.current_device())
-                # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful infomation
+                # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful information
                 # If we change p.grad directly
                 # it may raise error because of different shape/dtype/device of p.data and p.grad
                 # We just set p.data = p.colo_attr.saved_grad.payload here
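The FIXME above notes that assigning p.grad directly can fail when p.data is an empty placeholder with a different shape, which is why the code re-points p.data at the saved gradient payload first. A small stand-alone repro of that constraint with made-up tensors, not the optimizer's code:

import torch

p = torch.nn.Parameter(torch.empty(0))       # stand-in for a parameter whose payload was released
grad_payload = torch.randn(8)

try:
    p.grad = grad_payload                     # rejected: grad shape must match p.data
except RuntimeError:
    p.data = grad_payload                     # re-point the data payload first...
    p.grad = torch.zeros_like(grad_payload)   # ...then the grad assignment is accepted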
@@ -333,7 +333,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):

     def _copy_master_model_to_model_fp16(self):
         # Copy master param data (fp32) to payload of colo_attr (fp16)
-        # TODO() improve efficiency by gathering tensors into a chunk and transfering
+        # TODO() improve efficiency by gathering tensors into a chunk and transferring
         # a chunk.
         for group in self.optim.param_groups:
             for p in group['params']:
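_copy_master_model_to_model_fp16 goes the other way: fp32 master values are copied back into the fp16 payloads after the optimizer step. A generic sketch of that direction, reusing the hypothetical masters mapping from the earlier snippet:

import torch

def copy_master_to_fp16(masters):
    # masters: dict mapping each fp16 parameter to its fp32 master copy
    with torch.no_grad():
        for fp16_p, fp32_master in masters.items():
            # the source's TODO suggests batching these copies into chunks for efficiency
            fp16_p.copy_(fp32_master.half())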
@@ -350,7 +350,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):

                 p.data = self.master_params[p].payload

-                # we need to allocate new memory for keep_not_shard paramters
+                # we need to allocate new memory for keep_not_shard parameters
                 # in order to use copy, otherwise, the sizes of tensor is not compatible
                 if p.colo_attr.data_payload.numel() != p.data.numel():
                     p.colo_attr.data_payload_reset(
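The comment above explains that a keep-not-shard parameter may need a freshly allocated buffer before copy_ is legal, because the existing payload can hold a different number of elements. A minimal illustration with plain tensors (hypothetical helper; the real code calls colo_attr.data_payload_reset):

import torch

def copy_into_matching_buffer(dst: torch.Tensor, src: torch.Tensor) -> torch.Tensor:
    if dst.numel() != src.numel():
        # sizes are incompatible for copy_, so allocate a buffer shaped like the source
        dst = torch.empty_like(src, dtype=dst.dtype)
    dst.copy_(src.reshape(dst.shape))
    return dst

payload = torch.empty(0, dtype=torch.float16)    # released placeholder payload
master = torch.randn(16)                         # fp32 master values
payload = copy_into_matching_buffer(payload, master)
print(payload.shape, payload.dtype)              # torch.Size([16]) torch.float16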