[doc] Fix typo under colossalai and doc (#3618)

* Fixed several spelling errors under colossalai

* Fix spelling errors in the colossalai and docs directories

* Carefully changed the spelling errors under the example folder

* Update runtime_preparation_pass.py

revert autograft to autograd

* Update search_chunk.py

utile to until

* Update check_installation.py

change misteach to mismatch in line 91

* Update 1D_tensor_parallel.md

revert to perceptron

* Update 2D_tensor_parallel.md

revert to perceptron in line 73

* Update 2p5D_tensor_parallel.md

revert to perceptron in line 71

* Update 3D_tensor_parallel.md

revert to perceptron in line 80

* Update README.md

revert to resnet in line 42

* Update reorder_graph.py

revert to indice in line 7

* Update p2p.py

revert to megatron in line 94

* Update initialize.py

revert to torchrun in line 198

* Update routers.py

change to detailed in line 63

* Update routers.py

change to detailed in line 146

* Update README.md

revert to random number in line 402
Author: digger-yu
Date: 2023-04-26 11:38:43 +08:00
Committed by: GitHub
Parent: e1b0a78afa
Commit: b9a8dff7e5
72 changed files with 158 additions and 158 deletions

@@ -8,7 +8,7 @@ from . import BaseOpHook
@OPHOOKS.register_module
class ShardGradMemTracerHook(BaseOpHook):
"""
-A hook to process sharded param before and afther FWD and BWD operator executing.
+A hook to process sharded param before and after FWD and BWD operator executing.
"""
def __init__(self):

@@ -8,7 +8,7 @@ from . import BaseOpHook
@OPHOOKS.register_module
class ShardParamHook(BaseOpHook):
"""
-A hook to process sharded param before and afther FWD and BWD operator executing.
+A hook to process sharded param before and after FWD and BWD operator executing.
"""
def __init__(self):

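For context, the "before and after FWD and BWD" behaviour these docstrings describe maps onto ordinary PyTorch module hooks. The sketch below is a minimal stand-in, not ColossalAI's BaseOpHook interface: it only logs when a layer is visited around the forward pass and after the backward pass, which is where a param/memory op hook would gather or shard parameters and take memory samples.

import torch
import torch.nn as nn

def attach_tracer(module: nn.Module) -> None:
    # these hooks fire at the same points where an op hook would gather/shard params or sample memory
    module.register_forward_pre_hook(lambda m, inp: print("pre-FWD ", type(m).__name__))
    module.register_forward_hook(lambda m, inp, out: print("post-FWD", type(m).__name__))
    module.register_full_backward_hook(lambda m, gin, gout: print("post-BWD", type(m).__name__))

layer = nn.Linear(4, 4)
attach_tracer(layer)
layer(torch.randn(2, 4)).sum().backward()
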
@@ -53,7 +53,7 @@ class StatefulTensorMgr(object):
self._evict_time = 0
def adjust_layout(self) -> None:
""" Adjust the layout of statefuil tensor according to the information provided
""" Adjust the layout of stateful tensor according to the information provided
by mem_stats_collector, which should belongs to a Sharded Model.
"""
# find stateful tensor in state COMPUTE

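As a rough illustration of what "adjust the layout of stateful tensor according to the information provided by mem_stats_collector" can mean in practice, here is a hypothetical sketch; the TensorState/ToyStatefulTensor names and the simple eviction policy are assumptions for illustration, not the StatefulTensorMgr implementation. Tensors that are idle (not in the COMPUTE state) are offloaded to CPU until the CUDA footprint fits a budget.

from enum import Enum, auto
import torch

class TensorState(Enum):
    COMPUTE = auto()    # needed by the operator being executed right now
    HOLD = auto()       # resident but idle

class ToyStatefulTensor:
    def __init__(self, payload: torch.Tensor, state: TensorState):
        self.payload, self.state = payload, state

def adjust_layout(tensors, cuda_budget_bytes: int) -> None:
    # evict idle (HOLD) tensors to CPU until the CUDA footprint fits the budget
    cuda_bytes = sum(t.payload.numel() * t.payload.element_size()
                     for t in tensors if t.payload.is_cuda)
    for t in tensors:
        if cuda_bytes <= cuda_budget_bytes:
            break
        if t.state is TensorState.HOLD and t.payload.is_cuda:
            cuda_bytes -= t.payload.numel() * t.payload.element_size()
            t.payload = t.payload.cpu()
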
@@ -97,7 +97,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
"""We use this function to substitute fan-in and fan-out calculation in torch.nn.init.
This can help us get correct fan-in and fan-out for sharded tensor.
"""
-assert isinstance(tensor, nn.Parameter), "Sharded tensor initilization is only allowed for paramters"
+assert isinstance(tensor, nn.Parameter), "Sharded tensor initialization is only allowed for parameters"
# get correct shape of input tensor
if not hasattr(tensor, 'colo_attr') or not tensor.colo_attr.param_is_sharded:

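The substituted fan-in/fan-out calculation mentioned here matters because a sharded parameter may physically be a flat slice whose shape says nothing about the original weight. A minimal sketch of the idea; the full_shape attribute is a hypothetical stand-in for whatever the real code stores alongside the shard.

import torch

def calc_fan_in_and_fan_out(full_shape):
    # same rule torch.nn.init uses, but driven by the unsharded shape
    assert len(full_shape) >= 2, "fan in/out is undefined for tensors with fewer than 2 dims"
    receptive_field = 1
    for s in full_shape[2:]:          # conv kernels: multiply the spatial dims
        receptive_field *= s
    fan_in = full_shape[1] * receptive_field
    fan_out = full_shape[0] * receptive_field
    return fan_in, fan_out

# a sharded parameter may physically be a flat 1D slice ...
shard = torch.nn.Parameter(torch.empty(2048))
shard.full_shape = (64, 64)           # ... but logically belongs to a 64x64 weight
print(calc_fan_in_and_fan_out(shard.full_shape))   # (64, 64), not derived from the flat shard
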
@@ -14,7 +14,7 @@ class BucketTensorShardStrategy(TensorShardStrategy):
"""Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together,
which will fully utilize network bandwidth.
It is especially useful when sub-module contains bias,
-since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usaully small).
+since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usually small).
"""
def gather(self, tensor_list: List[ShardedTensor], process_group: Optional[dist.ProcessGroup] = None):

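The bandwidth argument in this docstring is the usual flatten-then-collective trick: rather than issuing one all_gather per (possibly tiny) bias tensor, the shards owned by a sub-module are packed into a single flat buffer and gathered with one collective call. A simplified sketch using plain torch.distributed; this illustrates the idea rather than the BucketTensorShardStrategy source, and it assumes the default process group has already been initialized.

import torch
import torch.distributed as dist

def bucketed_gather(local_shards, group=None):
    # pack many small shards into one buffer so a single collective moves them all
    world_size = dist.get_world_size(group)
    flat = torch.cat([s.reshape(-1) for s in local_shards])
    gathered = [torch.empty_like(flat) for _ in range(world_size)]
    dist.all_gather(gathered, flat, group=group)   # one call instead of len(local_shards) calls
    return gathered   # gathered[r] holds rank r's packed shards; unpack by offset as needed
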
@@ -192,7 +192,7 @@ class ShardedModelV2(nn.Module):
def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
"""
-dummy memory tracer collected infomation to a file.
+dummy memory tracer collected information to a file.
try:
# forward: model(inputs)
# backward: optimizer.backward()
@@ -201,7 +201,7 @@ class ShardedModelV2(nn.Module):
exit(0)
"""
if self._use_memory_tracer:
-self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0])
+self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
if gpc.get_global_rank() == 0:
with open(filename, 'w+') as f:
f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
@@ -293,7 +293,7 @@ class ShardedModelV2(nn.Module):
if not p.requires_grad:
continue
# Leave the gradient accumulation state (_require_backward_grad_sync) as-is if not synchronizing this pass.
-# NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient allreducing between process group.
+# NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient all reducing between process group.
# If _require_backward_grad_sync is True,
# p.grad remains the accumulated unsharded gradient from prior no-sync passes.
# We also allows to interleave no-sync pass with sync passes, if desired.
@@ -385,7 +385,7 @@ class ShardedModelV2(nn.Module):
param.colo_attr.grad_payload_reset(grad.data)
# release the memory of param
# we set a false None for parameter's payload
-# so we can get paramter's device and dtype later in optimizer
+# so we can get parameter's device and dtype later in optimizer
param.colo_attr.data_payload_reset(torch.empty(0, device=grad.device, dtype=grad.dtype))
if param.colo_attr.is_replicated:

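The "(no-sync)/sync pass" comment above describes the standard gradient-accumulation pattern: local gradients accumulate across micro-batches and the data-parallel allreduce only runs on the final (sync) pass. ShardedModelV2 implements its own variant of this; the sketch below shows the same idiom with vanilla DistributedDataParallel and its no_sync() context manager, purely for illustration.

import contextlib

def train_step(ddp_model, optimizer, micro_batches, loss_fn):
    # skip the gradient allreduce on every micro-batch except the last one
    for i, (x, y) in enumerate(micro_batches):
        is_sync_pass = (i == len(micro_batches) - 1)
        ctx = contextlib.nullcontext() if is_sync_pass else ddp_model.no_sync()
        with ctx:
            loss = loss_fn(ddp_model(x), y)
            loss.backward()            # grads accumulate locally on no-sync passes
    optimizer.step()                   # runs once, after the sync pass reduced gradients
    optimizer.zero_grad()
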
@@ -67,8 +67,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
-dp_process_group (Optional[ProcessGroup], optional): data paralle process group. Defaults to None.
-mp_process_group (Optional[ProcessGroup], optional): model paralle process group. Defaults to None.
+dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None.
+mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None.
.. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
https://arxiv.org/abs/2108.05818
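
The growth_interval, hysteresis and max_scale arguments documented in this hunk follow the usual dynamic loss-scaling recipe: grow the scale after a run of overflow-free steps, shrink it only once consecutive overflows exceed the hysteresis, and never exceed max_scale. A self-contained sketch of that logic, as a conceptual illustration rather than the DynamicGradScaler source:

class SimpleDynamicScaler:
    # conceptual illustration of growth_interval / hysteresis / max_scale
    def __init__(self, scale=2.0**16, growth_factor=2.0, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, max_scale=2.0**32):
        self.scale = scale
        self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
        self.growth_interval, self.hysteresis, self.max_scale = growth_interval, hysteresis, max_scale
        self._good_steps = 0        # consecutive overflow-free steps
        self._overflows = 0         # consecutive overflowing steps

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:   # tolerate a few overflows before backing off
                self.scale *= self.backoff_factor
                self._overflows = 0
        else:
            self._overflows = 0
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
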
@@ -274,7 +274,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
assert hasattr(p, 'colo_attr'), 'The parameter must be wrapped with ShardedParam'
shard_flag = not p.colo_attr.sharded_data_tensor.is_sharded and p.colo_attr.is_replicated
if shard_flag:
-# we always shard replicated paramters
+# we always shard replicated parameters
self.shard_strategy.shard([p.colo_attr.sharded_data_tensor], self.dp_process_group)
self.master_params[p] = StatefulTensor(cast_tensor_to_fp32(p.colo_attr.data_payload.to(self.device)))
if shard_flag:
@@ -312,7 +312,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
# If reuse_fp16_shard, grad fp16 which wasn't be offloaded may be evicted to CPU
if not p.colo_attr.offload_grad:
colo_model_data_tensor_move_inline(p.colo_attr.saved_grad, torch.cuda.current_device())
-# FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful infomation
+# FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful information
# If we change p.grad directly
# it may raise error because of different shape/dtype/device of p.data and p.grad
# We just set p.data = p.colo_attr.saved_grad.payload here
@@ -333,7 +333,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
def _copy_master_model_to_model_fp16(self):
# Copy master param data (fp32) to payload of colo_attr (fp16)
-# TODO() improve efficiency by gathering tensors into a chunk and transfering
+# TODO() improve efficiency by gathering tensors into a chunk and transferring
# a chunk.
for group in self.optim.param_groups:
for p in group['params']:
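
The TODO about "gathering tensors into a chunk and transferring a chunk" refers to the common mixed-precision bookkeeping step where fp32 master weights are cast back into the model's fp16 storage after the optimizer step; doing it through one packed buffer replaces many small copies with a single transfer. A generic sketch under that assumption; the names are illustrative, not ShardedOptimizerV2 internals.

import torch

def copy_master_to_model_fp16(master_params, model_params):
    # pack the fp32 masters into one chunk, cast once, then scatter back into the fp16 params
    flat = torch.cat([m.detach().reshape(-1) for m in master_params]).to(torch.float16)
    offset = 0
    for p16 in model_params:
        n = p16.numel()
        p16.data.copy_(flat[offset:offset + n].view_as(p16))
        offset += n

# usage: fp32 masters and the fp16 parameters they shadow, in matching order
masters = [torch.randn(4, 4), torch.randn(8)]
fp16_params = [torch.empty(4, 4, dtype=torch.float16), torch.empty(8, dtype=torch.float16)]
copy_master_to_model_fp16(masters, fp16_params)
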
@@ -350,7 +350,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
p.data = self.master_params[p].payload
-# we need to allocate new memory for keep_not_shard paramters
+# we need to allocate new memory for keep_not_shard parameters
# in order to use copy, otherwise, the sizes of tensor is not compatible
if p.colo_attr.data_payload.numel() != p.data.numel():
p.colo_attr.data_payload_reset(