Mirror of https://github.com/hpcaitech/ColossalAI.git
[doc] Fix typo under colossalai and doc(#3618)
* Fixed several spelling errors under colossalai
* Fix the spelling error in colossalai and docs directory
* Cautious Changed the spelling error under the example folder
* Update runtime_preparation_pass.py: revert autograft to autograd
* Update search_chunk.py: utile to until
* Update check_installation.py: change misteach to mismatch in line 91
* Update 1D_tensor_parallel.md: revert to perceptron
* Update 2D_tensor_parallel.md: revert to perceptron in line 73
* Update 2p5D_tensor_parallel.md: revert to perceptron in line 71
* Update 3D_tensor_parallel.md: revert to perceptron in line 80
* Update README.md: revert to resnet in line 42
* Update reorder_graph.py: revert to indice in line 7
* Update p2p.py: revert to megatron in line 94
* Update initialize.py: revert to torchrun in line 198
* Update routers.py: change to detailed in line 63
* Update routers.py: change to detailed in line 146
* Update README.md: revert random number in line 402
@@ -74,7 +74,7 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
     """
     Args:
         device (torch.device): the device where parameters initialized are resident. Defaults to torch.device('cpu').
-        dtype (torch.dtype): the dtype of parameters initialized. Defults to torch.float.
+        dtype (torch.dtype): the dtype of parameters initialized. Defaults to torch.float.
         default_pg (ProcessGroup): the default process group for all initialized parameters.
         default_dist_spec: the default distributed specifications.
     """
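The docstring above documents ColoInitContext, the lazy-init context that places newly created parameters on a chosen device and dtype. A minimal usage sketch (not part of this commit; the import path is an assumption based on this generation of the library and may differ in your installed version):

    import torch
    from colossalai.utils.model.colo_init_context import ColoInitContext

    # Assumes the distributed environment has already been initialized
    # (e.g. via colossalai.launch). Parameters created inside the context
    # are initialized on the requested device and dtype (defaults per the
    # docstring: torch.device('cpu'), torch.float).
    with ColoInitContext(device=torch.device('cpu'), dtype=torch.float):
        model = torch.nn.Sequential(
            torch.nn.Linear(1024, 4096),
            torch.nn.GELU(),
            torch.nn.Linear(4096, 1024),
        )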
@@ -164,7 +164,7 @@ def post_process_colo_init_ctx(model: torch.nn.Module,
         model (torch.nn.module): the model
         device (torch.device, optional): device type of the model params. Defaults to torch.device('cpu').
         dtype (torch.dtype, optional): dtype of the model params. Defaults to torch.float.
-        default_pg (Optional[ProcessGroup], optional): default process group. Defaults to None. Inidicates a DP-only process group.
+        default_pg (Optional[ProcessGroup], optional): default process group. Defaults to None. Indicates a DP-only process group.
         default_dist_spec (Any, optional): default dist spec of params. Defaults to None.

     Raises:
@@ -42,7 +42,7 @@ class ZeroDDP(ColoDDP):

     Args:
         module (torch.nn.Module): Module to apply ZeRO-DP.
-        gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous momery space.
+        gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous memory space.
             For more details, see the API reference of ``GeminiManager``.
         pin_memory (bool): Chunks on CPU Memory use pin-memory.
         force_outputs_fp32 (bool): If set to True, outputs will be fp32. Otherwise, outputs will be fp16.
@@ -684,7 +684,7 @@ class GeminiDDP(ZeroDDP):
                  memstats: Optional[MemStats] = None,
                  verbose: bool = False) -> None:
         """
-        A torch.Module warpper using ZeRO-DP and Genimi.
+        A torch.Module wrapper using ZeRO-DP and Gemini.
         ZeRO is for parallel. Gemini is for memory management.
         WARNING: The class will modify the module inline!

@@ -706,7 +706,7 @@ class GeminiDDP(ZeroDDP):
                 Users can provide this argument to speed up searching.
                 If users do not know this argument before training, it is ok. We will use a default value 1024.
             min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte.
-                If the aggregate size of parameters is still samller than the minimum chunk size,
+                If the aggregate size of parameters is still smaller than the minimum chunk size,
                 all parameters will be compacted into one small chunk.
             memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
         """
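The two GeminiDDP hunks above document the chunk-size search and the memstats/verbose arguments. A rough construction sketch; only min_chunk_size_mb, memstats and verbose appear in these hunks, so the remaining keyword names (device, placement_policy, hidden_dim) and the import path are assumptions about the surrounding constructor in this release line:

    import torch
    from colossalai.nn.parallel import GeminiDDP  # import path may differ across releases

    # Wrap a module in place: Gemini manages chunked parameters across CUDA/CPU
    # memory while ZeRO-DP shards them over data-parallel ranks.
    model = torch.nn.Linear(1024, 1024)
    model = GeminiDDP(model,
                      device=torch.device('cuda'),   # assumed keyword
                      placement_policy='auto',       # assumed keyword
                      hidden_dim=1024,               # hint for the unified chunk-size search
                      min_chunk_size_mb=32,
                      verbose=False)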
@@ -8,7 +8,7 @@ from . import BaseOpHook
 @OPHOOKS.register_module
 class ShardGradMemTracerHook(BaseOpHook):
     """
-    A hook to process sharded param before and afther FWD and BWD operator executing.
+    A hook to process sharded param before and after FWD and BWD operator executing.
     """

     def __init__(self):
@@ -8,7 +8,7 @@ from . import BaseOpHook
 @OPHOOKS.register_module
 class ShardParamHook(BaseOpHook):
     """
-    A hook to process sharded param before and afther FWD and BWD operator executing.
+    A hook to process sharded param before and after FWD and BWD operator executing.
     """

     def __init__(self):
@@ -53,7 +53,7 @@ class StatefulTensorMgr(object):
         self._evict_time = 0

     def adjust_layout(self) -> None:
-        """ Adjust the layout of statefuil tensor according to the information provided
+        """ Adjust the layout of stateful tensor according to the information provided
         by mem_stats_collector, which should belongs to a Sharded Model.
         """
         # find stateful tensor in state COMPUTE
@@ -97,7 +97,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
             """We use this function to substitute fan-in and fan-out calculation in torch.nn.init.
             This can help us get correct fan-in and fan-out for sharded tensor.
             """
-            assert isinstance(tensor, nn.Parameter), "Sharded tensor initilization is only allowed for paramters"
+            assert isinstance(tensor, nn.Parameter), "Sharded tensor initialization is only allowed for parameters"

             # get correct shape of input tensor
             if not hasattr(tensor, 'colo_attr') or not tensor.colo_attr.param_is_sharded:
@@ -14,7 +14,7 @@ class BucketTensorShardStrategy(TensorShardStrategy):
     """Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together,
     which will fully utilize network bandwidth.
     It is especially useful when sub-module contains bias,
-    since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usaully small).
+    since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usually small).
     """

     def gather(self, tensor_list: List[ShardedTensor], process_group: Optional[dist.ProcessGroup] = None):
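The ZeroInitContext and BucketTensorShardStrategy hunks above belong to the older sharded-model path. A sketch of how these pieces were typically combined with ShardedModelV2; the import paths and keyword names are recalled from that API generation (they may sit under a legacy namespace at the time of this commit), and a launched distributed environment is assumed:

    import torch
    from colossalai.zero.init_ctx import ZeroInitContext
    from colossalai.zero.shard_utils import BucketTensorShardStrategy
    from colossalai.zero.sharded_model import ShardedModelV2

    # Gathers a whole sub-module's tensors in one pass to use bandwidth well,
    # which is the point made in the BucketTensorShardStrategy docstring.
    shard_strategy = BucketTensorShardStrategy()

    # Parameters are sharded as they are created inside the context.
    with ZeroInitContext(target_device=torch.device('cuda'),
                         shard_strategy=shard_strategy,
                         shard_param=True):
        model = torch.nn.Linear(1024, 1024)

    sharded_model = ShardedModelV2(model, shard_strategy)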
@@ -192,7 +192,7 @@ class ShardedModelV2(nn.Module):

     def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
         """
-        dummy memory tracer collected infomation to a file.
+        dummy memory tracer collected information to a file.
         try:
             # forward: model(inputs)
             # backward: optimizer.backward()
@@ -201,7 +201,7 @@ class ShardedModelV2(nn.Module):
                 exit(0)
         """
         if self._use_memory_tracer:
-            self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0])
+            self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
             if gpc.get_global_rank() == 0:
                 with open(filename, 'w+') as f:
                     f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
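The two dump_memory_stats hunks quote fragments of the docstring's own usage example (try / forward / backward / exit). Restated below as a self-contained helper for readability; the surrounding names are illustrative, not from the commit:

    def step_or_dump(sharded_model, optimizer, inputs):
        """Run one training step; on failure, dump memory-tracer statistics."""
        try:
            loss = sharded_model(inputs).sum()   # forward: model(inputs)
            optimizer.backward(loss)             # backward: optimizer.backward()
        except RuntimeError:
            # Writes the collected information to 'dump_mem_stats.log' by default.
            sharded_model.dump_memory_stats()
            exit(0)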
@@ -293,7 +293,7 @@ class ShardedModelV2(nn.Module):
             if not p.requires_grad:
                 continue
             # Leave the gradient accumulation state (_require_backward_grad_sync) as-is if not synchronizing this pass.
-            # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient allreducing between process group.
+            # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient all reducing between process group.
             # If _require_backward_grad_sync is True,
             # p.grad remains the accumulated unsharded gradient from prior no-sync passes.
             # We also allows to interleave no-sync pass with sync passes, if desired.
@@ -385,7 +385,7 @@ class ShardedModelV2(nn.Module):
             param.colo_attr.grad_payload_reset(grad.data)
             # release the memory of param
             # we set a false None for parameter's payload
-            # so we can get paramter's device and dtype later in optimizer
+            # so we can get parameter's device and dtype later in optimizer
             param.colo_attr.data_payload_reset(torch.empty(0, device=grad.device, dtype=grad.dtype))

         if param.colo_attr.is_replicated:
@@ -67,8 +67,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
         growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
         hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
         max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
-        dp_process_group (Optional[ProcessGroup], optional): data paralle process group. Defaults to None.
-        mp_process_group (Optional[ProcessGroup], optional): model paralle process group. Defaults to None.
+        dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None.
+        mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None.

     .. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
         https://arxiv.org/abs/2108.05818
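This hunk documents ShardedOptimizerV2's DynamicGradScaler and process-group arguments. A construction sketch; the positional order (sharded model, then the wrapped optimizer) and the import path are assumptions about this API generation, while the keyword names come from the docstring above:

    import torch
    from colossalai.zero.sharded_optim import ShardedOptimizerV2  # path may differ across releases

    def build_sharded_optimizer(sharded_model, dp_group=None, mp_group=None):
        base_optim = torch.optim.Adam(sharded_model.parameters(), lr=1e-3)
        return ShardedOptimizerV2(sharded_model,
                                  base_optim,
                                  growth_interval=1000,   # DynamicGradScaler settings from the docstring
                                  hysteresis=2,
                                  max_scale=2**32,
                                  dp_process_group=dp_group,
                                  mp_process_group=mp_group)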
@@ -274,7 +274,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
             assert hasattr(p, 'colo_attr'), 'The parameter must be wrapped with ShardedParam'
             shard_flag = not p.colo_attr.sharded_data_tensor.is_sharded and p.colo_attr.is_replicated
             if shard_flag:
-                # we always shard replicated paramters
+                # we always shard replicated parameters
                 self.shard_strategy.shard([p.colo_attr.sharded_data_tensor], self.dp_process_group)
             self.master_params[p] = StatefulTensor(cast_tensor_to_fp32(p.colo_attr.data_payload.to(self.device)))
             if shard_flag:
@@ -312,7 +312,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
                 # If reuse_fp16_shard, grad fp16 which wasn't be offloaded may be evicted to CPU
                 if not p.colo_attr.offload_grad:
                     colo_model_data_tensor_move_inline(p.colo_attr.saved_grad, torch.cuda.current_device())
-                # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful infomation
+                # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful information
                 # If we change p.grad directly
                 # it may raise error because of different shape/dtype/device of p.data and p.grad
                 # We just set p.data = p.colo_attr.saved_grad.payload here
@@ -333,7 +333,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):

     def _copy_master_model_to_model_fp16(self):
         # Copy master param data (fp32) to payload of colo_attr (fp16)
-        # TODO() improve efficiency by gathering tensors into a chunk and transfering
+        # TODO() improve efficiency by gathering tensors into a chunk and transferring
         # a chunk.
         for group in self.optim.param_groups:
             for p in group['params']:
@@ -350,7 +350,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):

                     p.data = self.master_params[p].payload

-                    # we need to allocate new memory for keep_not_shard paramters
+                    # we need to allocate new memory for keep_not_shard parameters
                     # in order to use copy, otherwise, the sizes of tensor is not compatible
                     if p.colo_attr.data_payload.numel() != p.data.numel():
                         p.colo_attr.data_payload_reset(
@@ -26,7 +26,7 @@ def zero_model_wrapper(model: nn.Module,
         zero_stage (int, optional): The stage of ZeRO DDP. You can find more information in ZeRO's paper.
             https://arxiv.org/abs/1910.02054
         gemini_config (dict, optional): The configuration dictionary of `GeminiDDP`. `GeminiDDP` is enabled
-            when the stage is set to 3. You can set the arguemnts of `GeminiDDP` in the gemini_config.
+            when the stage is set to 3. You can set the arguments of `GeminiDDP` in the gemini_config.
             Here is an example where we set the device of the model, the placement policy of Gemini, and the
             size of hidden dimension to help Gemini find out a unified chunk size.

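The zero_model_wrapper docstring above points to an example that sets the model device, the Gemini placement policy, and the hidden dimension. A hedged reconstruction of that configuration; the exact keys accepted by gemini_config follow GeminiDDP's constructor and may vary between releases:

    import torch
    from colossalai.zero import zero_model_wrapper  # import path is an assumption

    model = torch.nn.Linear(1024, 1024)

    # gemini_config is forwarded to GeminiDDP when zero_stage == 3.
    gemini_config = dict(
        device=torch.device('cuda'),   # where Gemini keeps the working chunks
        placement_policy='auto',       # Gemini placement policy
        hidden_dim=1024,               # hint for picking a unified chunk size
    )
    model = zero_model_wrapper(model, zero_stage=3, gemini_config=gemini_config)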
@@ -78,7 +78,7 @@ def zero_optim_wrapper(model: nn.Module,
         max_norm (float, optional): max_norm used for `clip_grad_norm`. You should notice that you shall not do
             clip_grad_norm by yourself when using ZeRO DDP. The ZeRO optimizer will take care of clip_grad_norm.
         norm_type (float, optional): norm_type used for `clip_grad_norm`.
-        optim_config (dict, optinoal): The configuration used for the ZeRO optimizer.
+        optim_config (dict, optional): The configuration used for the ZeRO optimizer.
             Example:

                 >>> zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
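The zero_optim_wrapper hunk ends just as the docstring's own example begins. A short sketch of how that ZeRO-2 style optim_config might be passed; the call signature shown here (model, optimizer, optim_config=...) and the import path are assumptions about this API generation:

    import torch
    from colossalai.zero import zero_optim_wrapper  # import path is an assumption

    model = torch.nn.Linear(1024, 1024)              # normally the module returned by zero_model_wrapper
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # ZeRO-2 style settings from the docstring example, forwarded via optim_config.
    zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
    optimizer = zero_optim_wrapper(model, optimizer, optim_config=zero2_config)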