Mirror of https://github.com/hpcaitech/ColossalAI.git
[doc] Fix typo under colossalai and doc(#3618)
* Fixed several spelling errors under colossalai
* Fix the spelling error in colossalai and docs directory
* Cautious Changed the spelling error under the example folder
* Update runtime_preparation_pass.py: revert autograft to autograd
* Update search_chunk.py: utile to until
* Update check_installation.py: change misteach to mismatch in line 91
* Update 1D_tensor_parallel.md: revert to perceptron
* Update 2D_tensor_parallel.md: revert to perceptron in line 73
* Update 2p5D_tensor_parallel.md: revert to perceptron in line 71
* Update 3D_tensor_parallel.md: revert to perceptron in line 80
* Update README.md: revert to resnet in line 42
* Update reorder_graph.py: revert to indice in line 7
* Update p2p.py: revert to megatron in line 94
* Update initialize.py: revert to torchrun in line 198
* Update routers.py: change to detailed in line 63
* Update routers.py: change to detailed in line 146
* Update README.md: revert random number in line 402
@@ -74,7 +74,7 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
     """
     Args:
         device (torch.device): the device where parameters initialized are resident. Defaults to torch.device('cpu').
-        dtype (torch.dtype): the dtype of parameters initialized. Defults to torch.float.
+        dtype (torch.dtype): the dtype of parameters initialized. Defaults to torch.float.
         default_pg (ProcessGroup): the default process group for all initialized parameters.
         default_dist_spec: the default distributed specifications.
     """
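The docstring above documents ColoInitContext, the lazy-init context that places newly created parameters on a chosen device and dtype. A minimal usage sketch (not part of this commit; the import path is an assumption based on this generation of the library and may differ in your installed version):

    import torch
    from colossalai.utils.model.colo_init_context import ColoInitContext

    # Assumes the distributed environment has already been initialized
    # (e.g. via colossalai.launch). Parameters created inside the context
    # are initialized on the requested device and dtype (defaults per the
    # docstring: torch.device('cpu'), torch.float).
    with ColoInitContext(device=torch.device('cpu'), dtype=torch.float):
        model = torch.nn.Sequential(
            torch.nn.Linear(1024, 4096),
            torch.nn.GELU(),
            torch.nn.Linear(4096, 1024),
        )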
@@ -164,7 +164,7 @@ def post_process_colo_init_ctx(model: torch.nn.Module,
         model (torch.nn.module): the model
         device (torch.device, optional): device type of the model params. Defaults to torch.device('cpu').
         dtype (torch.dtype, optional): dtype of the model params. Defaults to torch.float.
-        default_pg (Optional[ProcessGroup], optional): default process group. Defaults to None. Inidicates a DP-only process group.
+        default_pg (Optional[ProcessGroup], optional): default process group. Defaults to None. Indicates a DP-only process group.
         default_dist_spec (Any, optional): default dist spec of params. Defaults to None.

     Raises:
@@ -42,7 +42,7 @@ class ZeroDDP(ColoDDP):

     Args:
         module (torch.nn.Module): Module to apply ZeRO-DP.
-        gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous momery space.
+        gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous memory space.
             For more details, see the API reference of ``GeminiManager``.
         pin_memory (bool): Chunks on CPU Memory use pin-memory.
         force_outputs_fp32 (bool): If set to True, outputs will be fp32. Otherwise, outputs will be fp16.
@@ -684,7 +684,7 @@ class GeminiDDP(ZeroDDP):
                  memstats: Optional[MemStats] = None,
                  verbose: bool = False) -> None:
         """
-        A torch.Module warpper using ZeRO-DP and Genimi.
+        A torch.Module wrapper using ZeRO-DP and Gemini.
         ZeRO is for parallel. Gemini is for memory management.
         WARNING: The class will modify the module inline!

@@ -706,7 +706,7 @@ class GeminiDDP(ZeroDDP):
                 Users can provide this argument to speed up searching.
                 If users do not know this argument before training, it is ok. We will use a default value 1024.
             min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte.
-                If the aggregate size of parameters is still samller than the minimum chunk size,
+                If the aggregate size of parameters is still smaller than the minimum chunk size,
                 all parameters will be compacted into one small chunk.
             memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
         """
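The two GeminiDDP hunks above document the chunk-size search and the memstats/verbose arguments. A rough construction sketch; only min_chunk_size_mb, memstats and verbose appear in these hunks, so the remaining keyword names (device, placement_policy, hidden_dim) and the import path are assumptions about the surrounding constructor in this release line:

    import torch
    from colossalai.nn.parallel import GeminiDDP  # import path may differ across releases

    # Wrap a module in place: Gemini manages chunked parameters across CUDA/CPU
    # memory while ZeRO-DP shards them over data-parallel ranks.
    model = torch.nn.Linear(1024, 1024)
    model = GeminiDDP(model,
                      device=torch.device('cuda'),   # assumed keyword
                      placement_policy='auto',       # assumed keyword
                      hidden_dim=1024,               # hint for the unified chunk-size search
                      min_chunk_size_mb=32,
                      verbose=False)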
@@ -8,7 +8,7 @@ from . import BaseOpHook
 @OPHOOKS.register_module
 class ShardGradMemTracerHook(BaseOpHook):
     """
-    A hook to process sharded param before and afther FWD and BWD operator executing.
+    A hook to process sharded param before and after FWD and BWD operator executing.
     """

     def __init__(self):
@@ -8,7 +8,7 @@ from . import BaseOpHook
 @OPHOOKS.register_module
 class ShardParamHook(BaseOpHook):
     """
-    A hook to process sharded param before and afther FWD and BWD operator executing.
+    A hook to process sharded param before and after FWD and BWD operator executing.
     """

     def __init__(self):
@@ -53,7 +53,7 @@ class StatefulTensorMgr(object):
         self._evict_time = 0

     def adjust_layout(self) -> None:
-        """ Adjust the layout of statefuil tensor according to the information provided
+        """ Adjust the layout of stateful tensor according to the information provided
         by mem_stats_collector, which should belongs to a Sharded Model.
         """
         # find stateful tensor in state COMPUTE
@@ -97,7 +97,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
             """We use this function to substitute fan-in and fan-out calculation in torch.nn.init.
             This can help us get correct fan-in and fan-out for sharded tensor.
             """
-            assert isinstance(tensor, nn.Parameter), "Sharded tensor initilization is only allowed for paramters"
+            assert isinstance(tensor, nn.Parameter), "Sharded tensor initialization is only allowed for parameters"

             # get correct shape of input tensor
             if not hasattr(tensor, 'colo_attr') or not tensor.colo_attr.param_is_sharded:
@@ -14,7 +14,7 @@ class BucketTensorShardStrategy(TensorShardStrategy):
     """Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together,
     which will fully utilize network bandwidth.
     It is especially useful when sub-module contains bias,
-    since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usaully small).
+    since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usually small).
     """

     def gather(self, tensor_list: List[ShardedTensor], process_group: Optional[dist.ProcessGroup] = None):
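The ZeroInitContext and BucketTensorShardStrategy hunks above belong to the older sharded-model path. A sketch of how these pieces were typically combined with ShardedModelV2; the import paths and keyword names are recalled from that API generation (they may sit under a legacy namespace at the time of this commit), and a launched distributed environment is assumed:

    import torch
    from colossalai.zero.init_ctx import ZeroInitContext
    from colossalai.zero.shard_utils import BucketTensorShardStrategy
    from colossalai.zero.sharded_model import ShardedModelV2

    # Gathers a whole sub-module's tensors in one pass to use bandwidth well,
    # which is the point made in the BucketTensorShardStrategy docstring.
    shard_strategy = BucketTensorShardStrategy()

    # Parameters are sharded as they are created inside the context.
    with ZeroInitContext(target_device=torch.device('cuda'),
                         shard_strategy=shard_strategy,
                         shard_param=True):
        model = torch.nn.Linear(1024, 1024)

    sharded_model = ShardedModelV2(model, shard_strategy)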
@@ -192,7 +192,7 @@ class ShardedModelV2(nn.Module):

     def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
         """
-        dummy memory tracer collected infomation to a file.
+        dummy memory tracer collected information to a file.
         try:
             # forward: model(inputs)
             # backward: optimizer.backward()
@@ -201,7 +201,7 @@ class ShardedModelV2(nn.Module):
                 exit(0)
         """
         if self._use_memory_tracer:
-            self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0])
+            self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
             if gpc.get_global_rank() == 0:
                 with open(filename, 'w+') as f:
                     f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
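The two dump_memory_stats hunks quote fragments of the docstring's own usage example (try / forward / backward / exit). Restated below as a self-contained helper for readability; the surrounding names are illustrative, not from the commit:

    def step_or_dump(sharded_model, optimizer, inputs):
        """Run one training step; on failure, dump memory-tracer statistics."""
        try:
            loss = sharded_model(inputs).sum()   # forward: model(inputs)
            optimizer.backward(loss)             # backward: optimizer.backward()
        except RuntimeError:
            # Writes the collected information to 'dump_mem_stats.log' by default.
            sharded_model.dump_memory_stats()
            exit(0)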
@@ -293,7 +293,7 @@ class ShardedModelV2(nn.Module):
             if not p.requires_grad:
                 continue
             # Leave the gradient accumulation state (_require_backward_grad_sync) as-is if not synchronizing this pass.
-            # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient allreducing between process group.
+            # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient all reducing between process group.
             # If _require_backward_grad_sync is True,
             # p.grad remains the accumulated unsharded gradient from prior no-sync passes.
             # We also allows to interleave no-sync pass with sync passes, if desired.
@@ -385,7 +385,7 @@ class ShardedModelV2(nn.Module):
             param.colo_attr.grad_payload_reset(grad.data)
             # release the memory of param
             # we set a false None for parameter's payload
-            # so we can get paramter's device and dtype later in optimizer
+            # so we can get parameter's device and dtype later in optimizer
             param.colo_attr.data_payload_reset(torch.empty(0, device=grad.device, dtype=grad.dtype))

         if param.colo_attr.is_replicated:
@@ -67,8 +67,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
         growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
         hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
         max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
-        dp_process_group (Optional[ProcessGroup], optional): data paralle process group. Defaults to None.
-        mp_process_group (Optional[ProcessGroup], optional): model paralle process group. Defaults to None.
+        dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None.
+        mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None.

     .. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
         https://arxiv.org/abs/2108.05818
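This hunk documents ShardedOptimizerV2's DynamicGradScaler and process-group arguments. A construction sketch; the positional order (sharded model, then the wrapped optimizer) and the import path are assumptions about this API generation, while the keyword names come from the docstring above:

    import torch
    from colossalai.zero.sharded_optim import ShardedOptimizerV2  # path may differ across releases

    def build_sharded_optimizer(sharded_model, dp_group=None, mp_group=None):
        base_optim = torch.optim.Adam(sharded_model.parameters(), lr=1e-3)
        return ShardedOptimizerV2(sharded_model,
                                  base_optim,
                                  growth_interval=1000,   # DynamicGradScaler settings from the docstring
                                  hysteresis=2,
                                  max_scale=2**32,
                                  dp_process_group=dp_group,
                                  mp_process_group=mp_group)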
@@ -274,7 +274,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
             assert hasattr(p, 'colo_attr'), 'The parameter must be wrapped with ShardedParam'
             shard_flag = not p.colo_attr.sharded_data_tensor.is_sharded and p.colo_attr.is_replicated
             if shard_flag:
-                # we always shard replicated paramters
+                # we always shard replicated parameters
                 self.shard_strategy.shard([p.colo_attr.sharded_data_tensor], self.dp_process_group)
             self.master_params[p] = StatefulTensor(cast_tensor_to_fp32(p.colo_attr.data_payload.to(self.device)))
             if shard_flag:
@@ -312,7 +312,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
                 # If reuse_fp16_shard, grad fp16 which wasn't be offloaded may be evicted to CPU
                 if not p.colo_attr.offload_grad:
                     colo_model_data_tensor_move_inline(p.colo_attr.saved_grad, torch.cuda.current_device())
-                # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful infomation
+                # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful information
                 # If we change p.grad directly
                 # it may raise error because of different shape/dtype/device of p.data and p.grad
                 # We just set p.data = p.colo_attr.saved_grad.payload here
@@ -333,7 +333,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):

     def _copy_master_model_to_model_fp16(self):
         # Copy master param data (fp32) to payload of colo_attr (fp16)
-        # TODO() improve efficiency by gathering tensors into a chunk and transfering
+        # TODO() improve efficiency by gathering tensors into a chunk and transferring
         # a chunk.
         for group in self.optim.param_groups:
             for p in group['params']:
@@ -350,7 +350,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):

                     p.data = self.master_params[p].payload

-                    # we need to allocate new memory for keep_not_shard paramters
+                    # we need to allocate new memory for keep_not_shard parameters
                     # in order to use copy, otherwise, the sizes of tensor is not compatible
                     if p.colo_attr.data_payload.numel() != p.data.numel():
                         p.colo_attr.data_payload_reset(
@@ -26,7 +26,7 @@ def zero_model_wrapper(model: nn.Module,
         zero_stage (int, optional): The stage of ZeRO DDP. You can find more information in ZeRO's paper.
             https://arxiv.org/abs/1910.02054
         gemini_config (dict, optional): The configuration dictionary of `GeminiDDP`. `GeminiDDP` is enabled
-            when the stage is set to 3. You can set the arguemnts of `GeminiDDP` in the gemini_config.
+            when the stage is set to 3. You can set the arguments of `GeminiDDP` in the gemini_config.
             Here is an example where we set the device of the model, the placement policy of Gemini, and the
             size of hidden dimension to help Gemini find out a unified chunk size.

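The zero_model_wrapper docstring above points to an example that sets the model device, the Gemini placement policy, and the hidden dimension. A hedged reconstruction of that configuration; the exact keys accepted by gemini_config follow GeminiDDP's constructor and may vary between releases:

    import torch
    from colossalai.zero import zero_model_wrapper  # import path is an assumption

    model = torch.nn.Linear(1024, 1024)

    # gemini_config is forwarded to GeminiDDP when zero_stage == 3.
    gemini_config = dict(
        device=torch.device('cuda'),   # where Gemini keeps the working chunks
        placement_policy='auto',       # Gemini placement policy
        hidden_dim=1024,               # hint for picking a unified chunk size
    )
    model = zero_model_wrapper(model, zero_stage=3, gemini_config=gemini_config)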
@@ -78,7 +78,7 @@ def zero_optim_wrapper(model: nn.Module,
         max_norm (float, optional): max_norm used for `clip_grad_norm`. You should notice that you shall not do
             clip_grad_norm by yourself when using ZeRO DDP. The ZeRO optimizer will take care of clip_grad_norm.
         norm_type (float, optional): norm_type used for `clip_grad_norm`.
-        optim_config (dict, optinoal): The configuration used for the ZeRO optimizer.
+        optim_config (dict, optional): The configuration used for the ZeRO optimizer.
             Example:

                 >>> zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
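The zero_optim_wrapper hunk ends just as the docstring's own example begins. A short sketch of how that ZeRO-2 style optim_config might be passed; the call signature shown here (model, optimizer, optim_config=...) and the import path are assumptions about this API generation:

    import torch
    from colossalai.zero import zero_optim_wrapper  # import path is an assumption

    model = torch.nn.Linear(1024, 1024)              # normally the module returned by zero_model_wrapper
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # ZeRO-2 style settings from the docstring example, forwarded via optim_config.
    zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
    optimizer = zero_optim_wrapper(model, optimizer, optim_config=zero2_config)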