[doc] Fix typos under colossalai and doc (#3618)

* Fixed several spelling errors under colossalai

* Fix spelling errors in the colossalai and docs directories

* Carefully changed the spelling errors under the example folder

* Update runtime_preparation_pass.py

revert autograft to autograd

* Update search_chunk.py

change utile to until

* Update check_installation.py

change misteach to mismatch in line 91

* Update 1D_tensor_parallel.md

revert to perceptron

* Update 2D_tensor_parallel.md

revert to perceptron in line 73

* Update 2p5D_tensor_parallel.md

revert to perceptron in line 71

* Update 3D_tensor_parallel.md

revert to perceptron in line 80

* Update README.md

revert to resnet in line 42

* Update reorder_graph.py

revert to indice in line 7

* Update p2p.py

revert to megatron in line 94

* Update initialize.py

revert to torchrun in line 198

* Update routers.py

change to detailed in line 63

* Update routers.py

change to detailed in line 146

* Update README.md

revert to random number in line 402
Author: digger-yu
Date: 2023-04-26 11:38:43 +08:00
Committed by: GitHub
Parent: e1b0a78afa
Commit: b9a8dff7e5
72 changed files with 158 additions and 158 deletions

View File

@@ -74,7 +74,7 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
"""
Args:
device (torch.device): the device where parameters initialized are resident. Defaults to torch.device('cpu').
- dtype (torch.dtype): the dtype of parameters initialized. Defults to torch.float.
+ dtype (torch.dtype): the dtype of parameters initialized. Defaults to torch.float.
default_pg (ProcessGroup): the default process group for all initialized parameters.
default_dist_spec: the default distributed specifications.
"""
@@ -164,7 +164,7 @@ def post_process_colo_init_ctx(model: torch.nn.Module,
model (torch.nn.module): the model
device (torch.device, optional): device type of the model params. Defaults to torch.device('cpu').
dtype (torch.dtype, optional): dtype of the model params. Defaults to torch.float.
- default_pg (Optional[ProcessGroup], optional): default process group. Defaults to None. Inidicates a DP-only process group.
+ default_pg (Optional[ProcessGroup], optional): default process group. Defaults to None. Indicates a DP-only process group.
default_dist_spec (Any, optional): default dist spec of params. Defaults to None.
Raises:

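A minimal usage sketch of the arguments documented in this hunk. The import path is an assumption and has moved between ColossalAI releases; treat it as illustrative rather than the canonical API.

# Hedged sketch: parameters created inside ColoInitContext become ColoParameters
# resident on `device` with the given `dtype`. The import path is an assumption.
import torch
import torch.nn as nn
from colossalai.utils.model.colo_init_context import ColoInitContext, post_process_colo_init_ctx

with ColoInitContext(device=torch.device('cpu'), dtype=torch.float):
    model = nn.Sequential(nn.Linear(1024, 1024), nn.GELU(), nn.Linear(1024, 1024))

# default_pg=None indicates a DP-only process group, per the docstring above.
post_process_colo_init_ctx(model, device=torch.device('cpu'), dtype=torch.float, default_pg=None)
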
View File

@@ -42,7 +42,7 @@ class ZeroDDP(ColoDDP):
Args:
module (torch.nn.Module): Module to apply ZeRO-DP.
- gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous momery space.
+ gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous memory space.
For more details, see the API reference of ``GeminiManager``.
pin_memory (bool): Chunks on CPU Memory use pin-memory.
force_outputs_fp32 (bool): If set to True, outputs will be fp32. Otherwise, outputs will be fp16.
@@ -684,7 +684,7 @@ class GeminiDDP(ZeroDDP):
memstats: Optional[MemStats] = None,
verbose: bool = False) -> None:
"""
- A torch.Module warpper using ZeRO-DP and Genimi.
+ A torch.Module wrapper using ZeRO-DP and Gemini.
ZeRO is for parallel. Gemini is for memory management.
WARNING: The class will modify the module inline!
@@ -706,7 +706,7 @@ class GeminiDDP(ZeroDDP):
Users can provide this argument to speed up searching.
If users do not know this argument before training, it is ok. We will use a default value 1024.
min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte.
- If the aggregate size of parameters is still samller than the minimum chunk size,
+ If the aggregate size of parameters is still smaller than the minimum chunk size,
all parameters will be compacted into one small chunk.
memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
"""

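A hedged sketch of wrapping a module with GeminiDDP using the arguments described above (pin_memory, force_outputs_fp32, min_chunk_size_mb). The import location and the full constructor signature are assumptions and differ across releases.

# Hedged sketch; import location and exact signature are assumptions.
import torch
import torch.nn as nn
from colossalai.nn.parallel import GeminiDDP  # assumed location

module = nn.Linear(1024, 1024)
model = GeminiDDP(module,
                  device=torch.cuda.current_device(),  # assumed parameter: where chunks live
                  pin_memory=True,                      # CPU-side chunks use pinned memory
                  force_outputs_fp32=False,             # keep fp16 outputs
                  min_chunk_size_mb=32)                 # compact tiny params into one chunk below this size
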
View File

@@ -8,7 +8,7 @@ from . import BaseOpHook
@OPHOOKS.register_module
class ShardGradMemTracerHook(BaseOpHook):
"""
- A hook to process sharded param before and afther FWD and BWD operator executing.
+ A hook to process sharded param before and after FWD and BWD operator executing.
"""
def __init__(self):

View File

@@ -8,7 +8,7 @@ from . import BaseOpHook
@OPHOOKS.register_module
class ShardParamHook(BaseOpHook):
"""
- A hook to process sharded param before and afther FWD and BWD operator executing.
+ A hook to process sharded param before and after FWD and BWD operator executing.
"""
def __init__(self):

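Both hooks above run bookkeeping before and after the forward and backward execution of a sharded parameter's module. As a rough analogue in plain PyTorch (this is not the ColossalAI BaseOpHook interface, just the same before/after idea):

# Rough analogue with torch module hooks, illustrating before/after FWD and BWD callbacks.
import torch
import torch.nn as nn

module = nn.Linear(8, 8)
module.register_forward_pre_hook(lambda mod, inp: print('before FWD'))
module.register_forward_hook(lambda mod, inp, out: print('after FWD'))
module.register_full_backward_hook(lambda mod, gin, gout: print('after BWD'))

module(torch.randn(2, 8)).sum().backward()
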
View File

@@ -53,7 +53,7 @@ class StatefulTensorMgr(object):
self._evict_time = 0
def adjust_layout(self) -> None:
""" Adjust the layout of statefuil tensor according to the information provided
""" Adjust the layout of stateful tensor according to the information provided
by mem_stats_collector, which should belongs to a Sharded Model.
"""
# find stateful tensor in state COMPUTE

View File

@@ -97,7 +97,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
"""We use this function to substitute fan-in and fan-out calculation in torch.nn.init.
This can help us get correct fan-in and fan-out for sharded tensor.
"""
- assert isinstance(tensor, nn.Parameter), "Sharded tensor initilization is only allowed for paramters"
+ assert isinstance(tensor, nn.Parameter), "Sharded tensor initialization is only allowed for parameters"
# get correct shape of input tensor
if not hasattr(tensor, 'colo_attr') or not tensor.colo_attr.param_is_sharded:

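An illustration of why the fan-in/fan-out substitution above is needed: a sharded payload no longer has the 2-D shape torch.nn.init relies on. This is an explanatory sketch, not the ZeroInitContext code.

# Explanatory sketch only: fan-in/fan-out come from the full 2-D shape, which a
# flattened, sharded payload no longer carries.
import torch

full_weight = torch.empty(1024, 512)
fan_in, fan_out = torch.nn.init._calculate_fan_in_and_fan_out(full_weight)  # 512, 1024

world_size = 4
shard = full_weight.flatten().chunk(world_size)[0]  # 1-D shard held by one rank
# Calling _calculate_fan_in_and_fan_out(shard) would raise, since a 1-D payload has no
# meaningful fan-in/fan-out; the substituted function falls back to the original shape.
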
View File

@@ -14,7 +14,7 @@ class BucketTensorShardStrategy(TensorShardStrategy):
"""Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together,
which will fully utilize network bandwidth.
It is especially useful when sub-module contains bias,
- since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usaully small).
+ since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usually small).
"""
def gather(self, tensor_list: List[ShardedTensor], process_group: Optional[dist.ProcessGroup] = None):

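A minimal sketch of the bucketing idea described above: flatten a sub-module's shards into one buffer and gather them with a single collective, instead of one all-gather per small tensor. The function name and structure are illustrative only, not the class's implementation.

# Hedged sketch of bucketed gathering; not the actual BucketTensorShardStrategy code.
from typing import List, Optional
import torch
import torch.distributed as dist

def gather_bucketed(shards: List[torch.Tensor],
                    process_group: Optional[dist.ProcessGroup] = None) -> List[torch.Tensor]:
    flat = torch.cat([s.flatten() for s in shards])        # one contiguous bucket
    world_size = dist.get_world_size(process_group)
    out = [torch.empty_like(flat) for _ in range(world_size)]
    dist.all_gather(out, flat, group=process_group)        # one collective uses bandwidth better
    return out
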
View File

@@ -192,7 +192,7 @@ class ShardedModelV2(nn.Module):
def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
"""
- dummy memory tracer collected infomation to a file.
+ dummy memory tracer collected information to a file.
try:
# forward: model(inputs)
# backward: optimizer.backward()
@@ -201,7 +201,7 @@ class ShardedModelV2(nn.Module):
exit(0)
"""
if self._use_memory_tracer:
- self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0])
+ self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
if gpc.get_global_rank() == 0:
with open(filename, 'w+') as f:
f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
@@ -293,7 +293,7 @@ class ShardedModelV2(nn.Module):
if not p.requires_grad:
continue
# Leave the gradient accumulation state (_require_backward_grad_sync) as-is if not synchronizing this pass.
- # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient allreducing between process group.
+ # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient all reducing between process group.
# If _require_backward_grad_sync is True,
# p.grad remains the accumulated unsharded gradient from prior no-sync passes.
# We also allows to interleave no-sync pass with sync passes, if desired.
@@ -385,7 +385,7 @@ class ShardedModelV2(nn.Module):
param.colo_attr.grad_payload_reset(grad.data)
# release the memory of param
# we set a false None for parameter's payload
- # so we can get paramter's device and dtype later in optimizer
+ # so we can get parameter's device and dtype later in optimizer
param.colo_attr.data_payload_reset(torch.empty(0, device=grad.device, dtype=grad.dtype))
if param.colo_attr.is_replicated:

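The NOTE above separates no-sync passes (local gradient accumulation, no all-reduce) from sync passes. A rough analogue with torch's DistributedDataParallel.no_sync, rather than ShardedModelV2's internal _require_backward_grad_sync flag:

# Rough analogue: skip the gradient all-reduce on accumulation steps, sync on the last one.
import contextlib

def accumulate_then_step(ddp_model, micro_batches, loss_fn, optimizer):
    for i, batch in enumerate(micro_batches):
        last = (i == len(micro_batches) - 1)
        ctx = contextlib.nullcontext() if last else ddp_model.no_sync()
        with ctx:
            loss_fn(ddp_model(batch)).backward()  # grads accumulate locally on no-sync passes
    optimizer.step()                              # all-reduce happened only on the final backward
    optimizer.zero_grad()
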
View File

@@ -67,8 +67,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
- dp_process_group (Optional[ProcessGroup], optional): data paralle process group. Defaults to None.
- mp_process_group (Optional[ProcessGroup], optional): model paralle process group. Defaults to None.
+ dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None.
+ mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None.
.. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
https://arxiv.org/abs/2108.05818
@@ -274,7 +274,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
assert hasattr(p, 'colo_attr'), 'The parameter must be wrapped with ShardedParam'
shard_flag = not p.colo_attr.sharded_data_tensor.is_sharded and p.colo_attr.is_replicated
if shard_flag:
- # we always shard replicated paramters
+ # we always shard replicated parameters
self.shard_strategy.shard([p.colo_attr.sharded_data_tensor], self.dp_process_group)
self.master_params[p] = StatefulTensor(cast_tensor_to_fp32(p.colo_attr.data_payload.to(self.device)))
if shard_flag:
@@ -312,7 +312,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
# If reuse_fp16_shard, grad fp16 which wasn't be offloaded may be evicted to CPU
if not p.colo_attr.offload_grad:
colo_model_data_tensor_move_inline(p.colo_attr.saved_grad, torch.cuda.current_device())
- # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful infomation
+ # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful information
# If we change p.grad directly
# it may raise error because of different shape/dtype/device of p.data and p.grad
# We just set p.data = p.colo_attr.saved_grad.payload here
@@ -333,7 +333,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
def _copy_master_model_to_model_fp16(self):
# Copy master param data (fp32) to payload of colo_attr (fp16)
- # TODO() improve efficiency by gathering tensors into a chunk and transfering
+ # TODO() improve efficiency by gathering tensors into a chunk and transferring
# a chunk.
for group in self.optim.param_groups:
for p in group['params']:
@@ -350,7 +350,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
p.data = self.master_params[p].payload
- # we need to allocate new memory for keep_not_shard paramters
+ # we need to allocate new memory for keep_not_shard parameters
# in order to use copy, otherwise, the sizes of tensor is not compatible
if p.colo_attr.data_payload.numel() != p.data.numel():
p.colo_attr.data_payload_reset(

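A hedged sketch of constructing ShardedOptimizerV2 with the DynamicGradScaler and process-group arguments documented above. The import path and the positional argument order are assumptions and may not match every release.

# Hedged sketch; import path and positional order are assumptions.
import torch
from colossalai.zero.sharded_optim import ShardedOptimizerV2  # assumed location

def build_sharded_optimizer(sharded_model: torch.nn.Module) -> "ShardedOptimizerV2":
    # `sharded_model` is assumed to already be wrapped by ShardedModelV2.
    optim = torch.optim.Adam(sharded_model.parameters(), lr=1e-3)
    return ShardedOptimizerV2(sharded_model, optim,
                              growth_interval=1000,   # DynamicGradScaler defaults quoted above
                              hysteresis=2,
                              max_scale=2**32,
                              dp_process_group=None,  # defaults per the docstring
                              mp_process_group=None)
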
View File

@@ -26,7 +26,7 @@ def zero_model_wrapper(model: nn.Module,
zero_stage (int, optional): The stage of ZeRO DDP. You can find more information in ZeRO's paper.
https://arxiv.org/abs/1910.02054
gemini_config (dict, optional): The configuration dictionary of `GeminiDDP`. `GeminiDDP` is enabled
- when the stage is set to 3. You can set the arguemnts of `GeminiDDP` in the gemini_config.
+ when the stage is set to 3. You can set the arguments of `GeminiDDP` in the gemini_config.
Here is an example where we set the device of the model, the placement policy of Gemini, and the
size of hidden dimension to help Gemini find out a unified chunk size.
@@ -78,7 +78,7 @@ def zero_optim_wrapper(model: nn.Module,
max_norm (float, optional): max_norm used for `clip_grad_norm`. You should notice that you shall not do
clip_grad_norm by yourself when using ZeRO DDP. The ZeRO optimizer will take care of clip_grad_norm.
norm_type (float, optional): norm_type used for `clip_grad_norm`.
- optim_config (dict, optinoal): The configuration used for the ZeRO optimizer.
+ optim_config (dict, optional): The configuration used for the ZeRO optimizer.
Example:
>>> zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
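
A hedged sketch combining the two wrappers for ZeRO stage 2, reusing the optim_config keys from the docstring example above. The import location is an assumption.

# Hedged sketch; import location is an assumption, config keys come from the docstring example.
import torch
import torch.nn as nn
from colossalai.zero import zero_model_wrapper, zero_optim_wrapper  # assumed location

model = nn.Linear(1024, 1024)
model = zero_model_wrapper(model, zero_stage=2)

optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
optimizer = zero_optim_wrapper(model, optimizer, max_norm=1.0, optim_config=optim_config)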