[doc] Fix typo under colossalai and doc (#3618)

* Fixed several spelling errors under colossalai

* Fix spelling errors in the colossalai and docs directories

* Carefully changed the spelling errors under the example folder

* Update runtime_preparation_pass.py

revert autograft to autograd

* Update search_chunk.py

utile to until

* Update check_installation.py

change misteach to mismatch in line 91

* Update 1D_tensor_parallel.md

revert to perceptron

* Update 2D_tensor_parallel.md

revert to perceptron in line 73

* Update 2p5D_tensor_parallel.md

revert to perceptron in line 71

* Update 3D_tensor_parallel.md

revert to perceptron in line 80

* Update README.md

revert to resnet in line 42

* Update reorder_graph.py

revert to indice in line 7

* Update p2p.py

revert to megatron in line 94

* Update initialize.py

revert to torchrun in line 198

* Update routers.py

change to detailed in line 63

* Update routers.py

change to detailed in line 146

* Update README.md

revert to random number in line 402
Author: digger-yu
Date: 2023-04-26 11:38:43 +08:00
Committed by: GitHub
Parent: e1b0a78afa
Commit: b9a8dff7e5
72 changed files with 158 additions and 158 deletions

@@ -8,7 +8,7 @@ from . import BaseOpHook
@OPHOOKS.register_module
class ShardGradMemTracerHook(BaseOpHook):
"""
-A hook to process sharded param before and afther FWD and BWD operator executing.
+A hook to process sharded param before and after FWD and BWD operator executing.
"""
def __init__(self):

@@ -8,7 +8,7 @@ from . import BaseOpHook
@OPHOOKS.register_module
class ShardParamHook(BaseOpHook):
"""
-A hook to process sharded param before and afther FWD and BWD operator executing.
+A hook to process sharded param before and after FWD and BWD operator executing.
"""
def __init__(self):

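For context, the "before and after FWD and BWD" behaviour these docstrings describe maps onto ordinary PyTorch module hooks. The sketch below is a minimal stand-in, not ColossalAI's BaseOpHook interface: it only logs when a layer is visited around the forward pass and after the backward pass, which is where a param/memory op hook would gather or shard parameters and take memory samples.

import torch
import torch.nn as nn

def attach_tracer(module: nn.Module) -> None:
    # these hooks fire at the same points where an op hook would gather/shard params or sample memory
    module.register_forward_pre_hook(lambda m, inp: print("pre-FWD ", type(m).__name__))
    module.register_forward_hook(lambda m, inp, out: print("post-FWD", type(m).__name__))
    module.register_full_backward_hook(lambda m, gin, gout: print("post-BWD", type(m).__name__))

layer = nn.Linear(4, 4)
attach_tracer(layer)
layer(torch.randn(2, 4)).sum().backward()
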
@@ -53,7 +53,7 @@ class StatefulTensorMgr(object):
self._evict_time = 0
def adjust_layout(self) -> None:
""" Adjust the layout of statefuil tensor according to the information provided
""" Adjust the layout of stateful tensor according to the information provided
by mem_stats_collector, which should belongs to a Sharded Model.
"""
# find stateful tensor in state COMPUTE

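As a rough illustration of what "adjust the layout of stateful tensor according to the information provided by mem_stats_collector" can mean in practice, here is a hypothetical sketch; the TensorState/ToyStatefulTensor names and the simple eviction policy are assumptions for illustration, not the StatefulTensorMgr implementation. Tensors that are idle (not in the COMPUTE state) are offloaded to CPU until the CUDA footprint fits a budget.

from enum import Enum, auto
import torch

class TensorState(Enum):
    COMPUTE = auto()    # needed by the operator being executed right now
    HOLD = auto()       # resident but idle

class ToyStatefulTensor:
    def __init__(self, payload: torch.Tensor, state: TensorState):
        self.payload, self.state = payload, state

def adjust_layout(tensors, cuda_budget_bytes: int) -> None:
    # evict idle (HOLD) tensors to CPU until the CUDA footprint fits the budget
    cuda_bytes = sum(t.payload.numel() * t.payload.element_size()
                     for t in tensors if t.payload.is_cuda)
    for t in tensors:
        if cuda_bytes <= cuda_budget_bytes:
            break
        if t.state is TensorState.HOLD and t.payload.is_cuda:
            cuda_bytes -= t.payload.numel() * t.payload.element_size()
            t.payload = t.payload.cpu()
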
@@ -97,7 +97,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
"""We use this function to substitute fan-in and fan-out calculation in torch.nn.init.
This can help us get correct fan-in and fan-out for sharded tensor.
"""
-assert isinstance(tensor, nn.Parameter), "Sharded tensor initilization is only allowed for paramters"
+assert isinstance(tensor, nn.Parameter), "Sharded tensor initialization is only allowed for parameters"
# get correct shape of input tensor
if not hasattr(tensor, 'colo_attr') or not tensor.colo_attr.param_is_sharded:

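The substituted fan-in/fan-out calculation mentioned here matters because a sharded parameter may physically be a flat slice whose shape says nothing about the original weight. A minimal sketch of the idea; the full_shape attribute is a hypothetical stand-in for whatever the real code stores alongside the shard.

import torch

def calc_fan_in_and_fan_out(full_shape):
    # same rule torch.nn.init uses, but driven by the unsharded shape
    assert len(full_shape) >= 2, "fan in/out is undefined for tensors with fewer than 2 dims"
    receptive_field = 1
    for s in full_shape[2:]:          # conv kernels: multiply the spatial dims
        receptive_field *= s
    fan_in = full_shape[1] * receptive_field
    fan_out = full_shape[0] * receptive_field
    return fan_in, fan_out

# a sharded parameter may physically be a flat 1D slice ...
shard = torch.nn.Parameter(torch.empty(2048))
shard.full_shape = (64, 64)           # ... but logically belongs to a 64x64 weight
print(calc_fan_in_and_fan_out(shard.full_shape))   # (64, 64), not derived from the flat shard
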
@@ -14,7 +14,7 @@ class BucketTensorShardStrategy(TensorShardStrategy):
"""Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together,
which will fully utilize network bandwidth.
It is especially useful when sub-module contains bias,
-since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usaully small).
+since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usually small).
"""
def gather(self, tensor_list: List[ShardedTensor], process_group: Optional[dist.ProcessGroup] = None):

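The bandwidth argument in this docstring is the usual flatten-then-collective trick: rather than issuing one all_gather per (possibly tiny) bias tensor, the shards owned by a sub-module are packed into a single flat buffer and gathered with one collective call. A simplified sketch using plain torch.distributed; this illustrates the idea rather than the BucketTensorShardStrategy source, and it assumes the default process group has already been initialized.

import torch
import torch.distributed as dist

def bucketed_gather(local_shards, group=None):
    # pack many small shards into one buffer so a single collective moves them all
    world_size = dist.get_world_size(group)
    flat = torch.cat([s.reshape(-1) for s in local_shards])
    gathered = [torch.empty_like(flat) for _ in range(world_size)]
    dist.all_gather(gathered, flat, group=group)   # one call instead of len(local_shards) calls
    return gathered   # gathered[r] holds rank r's packed shards; unpack by offset as needed
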
@@ -192,7 +192,7 @@ class ShardedModelV2(nn.Module):
def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
"""
-dummy memory tracer collected infomation to a file.
+dummy memory tracer collected information to a file.
try:
# forward: model(inputs)
# backward: optimizer.backward()
@@ -201,7 +201,7 @@ class ShardedModelV2(nn.Module):
exit(0)
"""
if self._use_memory_tracer:
-self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0])
+self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
if gpc.get_global_rank() == 0:
with open(filename, 'w+') as f:
f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
@@ -293,7 +293,7 @@ class ShardedModelV2(nn.Module):
if not p.requires_grad:
continue
# Leave the gradient accumulation state (_require_backward_grad_sync) as-is if not synchronizing this pass.
-# NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient allreducing between process group.
+# NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient all reducing between process group.
# If _require_backward_grad_sync is True,
# p.grad remains the accumulated unsharded gradient from prior no-sync passes.
# We also allows to interleave no-sync pass with sync passes, if desired.
@@ -385,7 +385,7 @@ class ShardedModelV2(nn.Module):
param.colo_attr.grad_payload_reset(grad.data)
# release the memory of param
# we set a false None for parameter's payload
-# so we can get paramter's device and dtype later in optimizer
+# so we can get parameter's device and dtype later in optimizer
param.colo_attr.data_payload_reset(torch.empty(0, device=grad.device, dtype=grad.dtype))
if param.colo_attr.is_replicated:

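The "(no-sync)/sync pass" comment above describes the standard gradient-accumulation pattern: local gradients accumulate across micro-batches and the data-parallel allreduce only runs on the final (sync) pass. ShardedModelV2 implements its own variant of this; the sketch below shows the same idiom with vanilla DistributedDataParallel and its no_sync() context manager, purely for illustration.

import contextlib

def train_step(ddp_model, optimizer, micro_batches, loss_fn):
    # skip the gradient allreduce on every micro-batch except the last one
    for i, (x, y) in enumerate(micro_batches):
        is_sync_pass = (i == len(micro_batches) - 1)
        ctx = contextlib.nullcontext() if is_sync_pass else ddp_model.no_sync()
        with ctx:
            loss = loss_fn(ddp_model(x), y)
            loss.backward()            # grads accumulate locally on no-sync passes
    optimizer.step()                   # runs once, after the sync pass reduced gradients
    optimizer.zero_grad()
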
@@ -67,8 +67,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
-dp_process_group (Optional[ProcessGroup], optional): data paralle process group. Defaults to None.
-mp_process_group (Optional[ProcessGroup], optional): model paralle process group. Defaults to None.
+dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None.
+mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None.
.. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
https://arxiv.org/abs/2108.05818
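
The growth_interval, hysteresis and max_scale arguments documented in this hunk follow the usual dynamic loss-scaling recipe: grow the scale after a run of overflow-free steps, shrink it only once consecutive overflows exceed the hysteresis, and never exceed max_scale. A self-contained sketch of that logic, as a conceptual illustration rather than the DynamicGradScaler source:

class SimpleDynamicScaler:
    # conceptual illustration of growth_interval / hysteresis / max_scale
    def __init__(self, scale=2.0**16, growth_factor=2.0, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, max_scale=2.0**32):
        self.scale = scale
        self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
        self.growth_interval, self.hysteresis, self.max_scale = growth_interval, hysteresis, max_scale
        self._good_steps = 0        # consecutive overflow-free steps
        self._overflows = 0         # consecutive overflowing steps

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:   # tolerate a few overflows before backing off
                self.scale *= self.backoff_factor
                self._overflows = 0
        else:
            self._overflows = 0
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
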
@@ -274,7 +274,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
assert hasattr(p, 'colo_attr'), 'The parameter must be wrapped with ShardedParam'
shard_flag = not p.colo_attr.sharded_data_tensor.is_sharded and p.colo_attr.is_replicated
if shard_flag:
-# we always shard replicated paramters
+# we always shard replicated parameters
self.shard_strategy.shard([p.colo_attr.sharded_data_tensor], self.dp_process_group)
self.master_params[p] = StatefulTensor(cast_tensor_to_fp32(p.colo_attr.data_payload.to(self.device)))
if shard_flag:
@@ -312,7 +312,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
# If reuse_fp16_shard, grad fp16 which wasn't be offloaded may be evicted to CPU
if not p.colo_attr.offload_grad:
colo_model_data_tensor_move_inline(p.colo_attr.saved_grad, torch.cuda.current_device())
-# FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful infomation
+# FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful information
# If we change p.grad directly
# it may raise error because of different shape/dtype/device of p.data and p.grad
# We just set p.data = p.colo_attr.saved_grad.payload here
@@ -333,7 +333,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
def _copy_master_model_to_model_fp16(self):
# Copy master param data (fp32) to payload of colo_attr (fp16)
-# TODO() improve efficiency by gathering tensors into a chunk and transfering
+# TODO() improve efficiency by gathering tensors into a chunk and transferring
# a chunk.
for group in self.optim.param_groups:
for p in group['params']:
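
The TODO about "gathering tensors into a chunk and transferring a chunk" refers to the common mixed-precision bookkeeping step where fp32 master weights are cast back into the model's fp16 storage after the optimizer step; doing it through one packed buffer replaces many small copies with a single transfer. A generic sketch under that assumption; the names are illustrative, not ShardedOptimizerV2 internals.

import torch

def copy_master_to_model_fp16(master_params, model_params):
    # pack the fp32 masters into one chunk, cast once, then scatter back into the fp16 params
    flat = torch.cat([m.detach().reshape(-1) for m in master_params]).to(torch.float16)
    offset = 0
    for p16 in model_params:
        n = p16.numel()
        p16.data.copy_(flat[offset:offset + n].view_as(p16))
        offset += n

# usage: fp32 masters and the fp16 parameters they shadow, in matching order
masters = [torch.randn(4, 4), torch.randn(8)]
fp16_params = [torch.empty(4, 4, dtype=torch.float16), torch.empty(8, dtype=torch.float16)]
copy_master_to_model_fp16(masters, fp16_params)
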
@@ -350,7 +350,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
p.data = self.master_params[p].payload
-# we need to allocate new memory for keep_not_shard paramters
+# we need to allocate new memory for keep_not_shard parameters
# in order to use copy, otherwise, the sizes of tensor is not compatible
if p.colo_attr.data_payload.numel() != p.data.numel():
p.colo_attr.data_payload_reset(