Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-25 03:31:56 +00:00)
[doc] Fix typo under colossalai and doc (#3618)
* Fixed several spelling errors under colossalai
* Fix the spelling error in colossalai and docs directory
* Cautiously changed the spelling errors under the example folder
* Update runtime_preparation_pass.py: revert autograft to autograd
* Update search_chunk.py: utile to until
* Update check_installation.py: change misteach to mismatch in line 91
* Update 1D_tensor_parallel.md: revert to perceptron
* Update 2D_tensor_parallel.md: revert to perceptron in line 73
* Update 2p5D_tensor_parallel.md: revert to perceptron in line 71
* Update 3D_tensor_parallel.md: revert to perceptron in line 80
* Update README.md: revert to resnet in line 42
* Update reorder_graph.py: revert to indice in line 7
* Update p2p.py: revert to megatron in line 94
* Update initialize.py: revert to torchrun in line 198
* Update routers.py: change to detailed in line 63
* Update routers.py: change to detailed in line 146
* Update README.md: revert random number in line 402
@@ -13,7 +13,7 @@ from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator
 class MoeExperts(nn.Module):
-    """Basic class for experts in MoE. It stores what kind of communication expersts use
+    """Basic class for experts in MoE. It stores what kind of communication experts use
     to exchange tokens, how many experts in a single GPU and parallel information such as
     expert parallel size, data parallel size and their distributed communication groups.
     """
@@ -24,7 +24,7 @@ class MoeExperts(nn.Module):
             "This kind of communication has not been implemented yet.\n Please use Experts build function."
         self.comm_name = comm_name
         self.num_total_experts = num_experts
-        # Get the configuration of experts' deployment and parallel information from moe contex
+        # Get the configuration of experts' deployment and parallel information from moe context
         self.num_local_experts, self.dist_info = MOE_CONTEXT.get_info(num_experts)
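The corrected comment points at where `MOE_CONTEXT.get_info(num_experts)` tells the layer how many experts it holds locally. As a rough, hedged illustration of that bookkeeping (the function below and its divisibility rule are assumptions for clarity, not the actual ColossalAI implementation):

def get_local_expert_count(num_experts: int, max_ep_size: int):
    """Hypothetical sketch: split num_experts across an expert parallel group."""
    if num_experts >= max_ep_size:
        # More experts than ranks: each rank keeps an equal slice of whole experts.
        assert num_experts % max_ep_size == 0, "num_experts should be divisible by the ep size"
        return num_experts // max_ep_size, max_ep_size
    # Fewer experts than ranks: one expert per rank, smaller expert parallel group.
    return 1, num_experts

print(get_local_expert_count(num_experts=8, max_ep_size=4))   # (2, 4)
print(get_local_expert_count(num_experts=2, max_ep_size=4))   # (1, 2)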
@@ -32,7 +32,7 @@ class MoeExperts(nn.Module):
 class Experts(MoeExperts):
     """A wrapper class to create experts. It will create E experts across the
     moe model parallel group, where E is the number of experts. Every expert
-    is a instence of the class, 'expert' in initialization parameters.
+    is a instance of the class, 'expert' in initialization parameters.

     Args:
         expert_cls (:class:`torch.nn.Module`): The class of all experts
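To make the wrapper's behaviour concrete, here is a minimal sketch of an experts container that builds one `expert_cls` instance per locally held expert; the class below is simplified for illustration and is not the actual ColossalAI `Experts` implementation:

import torch
import torch.nn as nn

class SimpleExperts(nn.Module):
    """Illustrative stand-in: one expert_cls instance per locally held expert."""

    def __init__(self, expert_cls, num_local_experts: int, **expert_args):
        super().__init__()
        # Every local expert is an independent instance of the user-supplied class.
        self.experts = nn.ModuleList([expert_cls(**expert_args) for _ in range(num_local_experts)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [num_local_experts, tokens_per_expert, d_model]; expert i processes slice i.
        return torch.stack([expert(x[i]) for i, expert in enumerate(self.experts)])

# Usage: four local feed-forward experts of width 16.
experts = SimpleExperts(nn.Linear, num_local_experts=4, in_features=16, out_features=16)
print(experts(torch.randn(4, 8, 16)).shape)   # torch.Size([4, 8, 16])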
@@ -146,15 +146,15 @@ class FFNExperts(MoeExperts):

 class TPExperts(MoeExperts):
     """Use tensor parallelism to split each expert evenly, which can deploy experts in
-    case that the number of experts can't be divied by maximum expert parallel size or
-    maximum expert parallel size can't be divied by the number of experts.
+    case that the number of experts can't be divide by maximum expert parallel size or
+    maximum expert parallel size can't be divide by the number of experts.
     """

     def __init__(self, num_experts: int, d_model: int, d_ff: int, activation=None, drop_rate: float = 0):
         super().__init__("all_gather", MOE_CONTEXT.max_ep_size)

         assert d_ff % MOE_CONTEXT.max_ep_size == 0, \
-            "d_ff should be divied by maximum expert parallel size"
+            "d_ff should be divide by maximum expert parallel size"

         p_ff = d_ff // MOE_CONTEXT.max_ep_size
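The `p_ff = d_ff // MOE_CONTEXT.max_ep_size` line carries the whole idea of `TPExperts`: each rank owns an equal slice of the hidden dimension instead of whole experts. A hedged, self-contained sketch of one such slice (plain `ep_size` argument instead of the MoE context; simplified, not the real layer):

import torch
import torch.nn as nn

class TPExpertSlice(nn.Module):
    """Illustrative slice of one FFN expert: this rank owns d_ff // ep_size hidden units."""

    def __init__(self, d_model: int, d_ff: int, ep_size: int):
        super().__init__()
        assert d_ff % ep_size == 0, "d_ff should be divisible by the expert parallel size"
        p_ff = d_ff // ep_size                 # this rank's share of the hidden dimension
        self.w1 = nn.Linear(d_model, p_ff)     # column-split first projection
        self.w2 = nn.Linear(p_ff, d_model)     # row-split second projection
        self.act = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Each rank produces a partial result; in the real layer the partials are
        # combined across the expert parallel group (e.g. by all-gather or all-reduce).
        return self.w2(self.act(self.w1(x)))

slice_ffn = TPExpertSlice(d_model=64, d_ff=256, ep_size=4)   # p_ff = 64 on this rank
print(slice_ffn(torch.randn(8, 64)).shape)                   # torch.Size([8, 64])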
@@ -25,7 +25,7 @@ from colossalai.zero.legacy.init_ctx import no_shard_zero_context, no_shard_zero
 class MoeLayer(nn.Module):
     """A MoE layer, that puts its input tensor to its gate and uses the output logits
     to router all tokens, is mainly used to exchange all tokens for every expert across
-    the moe tensor group by all to all comunication. Then it will get the output of all
+    the moe tensor group by all to all communication. Then it will get the output of all
     experts and exchange the output. At last returns the output of the moe system.

     Args:
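The dispatch / combine flow the docstring describes is usually two symmetric all-to-all exchanges wrapped around the local experts. Below is a hedged sketch of that pattern with `torch.distributed.all_to_all_single`; it assumes an already initialized expert parallel process group and evenly sized buffers, and the names and shapes are illustrative rather than the ColossalAI implementation:

import torch
import torch.distributed as dist

def moe_dispatch_combine(expert_inputs: torch.Tensor, experts, ep_group=None) -> torch.Tensor:
    """expert_inputs: [num_experts, capacity, d_model], already grouped by target expert."""
    dispatched = torch.empty_like(expert_inputs)
    # 1) First all-to-all: send tokens destined for remote experts, receive tokens
    #    destined for the experts held on this rank.
    dist.all_to_all_single(dispatched, expert_inputs, group=ep_group)

    # 2) Run the local experts on the tokens they received.
    expert_outputs = experts(dispatched)

    # 3) Second all-to-all: return the expert outputs to the ranks that own the tokens.
    combined = torch.empty_like(expert_outputs)
    dist.all_to_all_single(combined, expert_outputs, group=ep_group)
    return combined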
@@ -122,7 +122,7 @@ class MoeModule(nn.Module):
         drop_tks (bool, optional): Whether drops tokens in evaluation
         use_residual (bool, optional): Makes this MoE layer a Residual MoE.
             More information can be found in `Microsoft paper`_.
-        residual_instance (nn.Module, optional): The instance of residual module in Resiual MoE
+        residual_instance (nn.Module, optional): The instance of residual module in Residual MoE
         expert_instance (MoeExperts, optional): The instance of experts module in MoeLayer
         expert_cls (Type[nn.Module], optional): The class of each expert when no instance is given
         expert_args (optional): The args of expert when no instance is given
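On the residual option: a common formulation runs the dense `residual_instance` in parallel with the MoE branch and mixes the two outputs with a learned per-token coefficient. The mixing rule below is a hedged sketch in that spirit, not the exact ColossalAI `MoeModule`:

import torch
import torch.nn as nn

class ResidualMoESketch(nn.Module):
    """Illustrative Residual MoE: dense branch plus MoE branch, mixed per token."""

    def __init__(self, moe_layer: nn.Module, residual_instance: nn.Module, d_model: int):
        super().__init__()
        self.moe = moe_layer
        self.residual = residual_instance
        self.coef = nn.Linear(d_model, 2)   # learned 2-way mixing coefficient

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        weight = torch.softmax(self.coef(x), dim=-1)           # [..., 2]
        moe_out, dense_out = self.moe(x), self.residual(x)
        return moe_out * weight[..., 0:1] + dense_out * weight[..., 1:2]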
@@ -60,7 +60,7 @@ class MoeRouter(nn.Module, ABC):

 class Top1Router(MoeRouter):
     """Top1 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
-    for routing usage. More deailted function can be found in the paper about Switch Transformer
+    for routing usage. More detailed function can be found in the paper about Switch Transformer
     of Google.
     Args:
         capacity_factor_train (float, optional): Capacity factor in routing of training.
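To ground the [s, e, c] notation: s is the number of tokens, e the number of experts, and c the per-expert capacity, typically on the order of `capacity_factor * s / e`. A hedged single-process sketch of top-1 routing in that spirit (simplified; the real router also handles noise, the load-balancing loss and token dropping policies):

import torch
import torch.nn.functional as F

def top1_route(logits: torch.Tensor, capacity_factor: float = 1.25):
    """logits: [s, e]. Returns a dispatch mask [s, e, c] and combine weights [s, e, c]."""
    s, e = logits.shape
    c = int(capacity_factor * s / e) + 1                      # per-expert capacity
    probs = F.softmax(logits, dim=-1)                         # routing probabilities
    top1 = probs.argmax(dim=-1)                               # chosen expert per token
    expert_mask = F.one_hot(top1, num_classes=e)              # [s, e]

    # Position of each token inside its chosen expert's buffer (0, 1, 2, ...).
    position = (torch.cumsum(expert_mask, dim=0) - 1) * expert_mask
    kept = expert_mask * (position < c)                       # drop tokens over capacity
    slot = position.sum(dim=-1).clamp(max=c - 1)              # capped slot index per token
    position_mask = F.one_hot(slot, num_classes=c)            # [s, c]

    dispatch = kept.unsqueeze(-1) * position_mask.unsqueeze(1)             # [s, e, c]
    combine = dispatch * probs.gather(1, top1[:, None]).unsqueeze(-1)      # [s, e, c]
    return dispatch.bool(), combine

dispatch, combine = top1_route(torch.randn(16, 4))
print(dispatch.shape, combine.shape)   # both [s, e, c] = [16, 4, 6]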
@@ -143,7 +143,7 @@ class Top1Router(MoeRouter):

 class Top2Router(MoeRouter):
     """Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
-    for routing usage. More deailted function can be found in the paper about ViT-MoE.
+    for routing usage. More detailed function can be found in the paper about ViT-MoE.
     Args:
         capacity_factor_train (float, optional): Capacity factor in routing of training.
         capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
@@ -12,7 +12,7 @@ class ForceFP32Parameter(torch.nn.Parameter):


 class NormalNoiseGenerator:
-    """Generates a random noisy mask for logtis tensor.
+    """Generates a random noisy mask for logits tensor.

     All noise is generated from a normal distribution :math:`(0, 1 / E^2)`, where
     `E = the number of experts`.
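Put differently, the additive jitter shrinks as the expert count grows: the standard deviation is 1 / E. A minimal hedged sketch of that behaviour (the function name and the additive application are assumptions for illustration):

import torch

def add_normal_noise(logits: torch.Tensor, num_experts: int) -> torch.Tensor:
    """Add N(0, 1/E^2) noise to routing logits, i.e. standard deviation 1 / num_experts."""
    return logits + torch.randn_like(logits) / num_experts

noisy = add_normal_noise(torch.randn(16, 4), num_experts=4)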
@@ -32,7 +32,7 @@ class NormalNoiseGenerator:


 class UniformNoiseGenerator:
-    """Generates a random noisy mask for logtis tensor.
+    """Generates a random noisy mask for logits tensor.
     copied from mesh tensorflow:
     Multiply values by a random number between :math:`1-epsilon` and :math:`1+epsilon`.
     Makes models more resilient to rounding errors introduced by bfloat16.
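The uniform variant is multiplicative rather than additive: every logit is scaled by a factor close to 1. A short hedged sketch of the mesh-tensorflow-style jitter described above (the default epsilon value is an assumption):

import torch

def multiply_uniform_noise(logits: torch.Tensor, epsilon: float = 1e-2) -> torch.Tensor:
    """Scale each logit by a factor drawn uniformly from [1 - epsilon, 1 + epsilon]."""
    scale = torch.empty_like(logits).uniform_(1.0 - epsilon, 1.0 + epsilon)
    return logits * scale

noisy = multiply_uniform_noise(torch.randn(16, 4))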
@@ -439,7 +439,7 @@ class Linear1D_Col(ParallelLayer):
             to all GPUs, otherwise, every GPU will have its output
             which is :math:`Y_i = XA_i`, defaults to False
         skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
-            which is preserved for kernel fusion, defaults to Fals
+            which is preserved for kernel fusion, defaults to False
         weight_initializer (:class:`typing.Callable`, optional):
             The initializer of weight, defaults to kaiming uniform initializer.
         bias_initializer (:class:`typing.Callable`, optional):
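For context on `gather_output` and :math:`Y_i = XA_i`: in column parallelism each rank holds a column slice `A_i` of the weight, computes its own `Y_i`, and either keeps that shard or all-gathers the shards into the full output. A hedged single-process sketch of the arithmetic (no process group, just the slicing):

import torch

def column_parallel_matmul(x: torch.Tensor, a: torch.Tensor, world_size: int, gather_output: bool):
    """Simulate Y_i = X @ A_i on each 'rank' by slicing A column-wise."""
    shards = a.chunk(world_size, dim=1)          # A_i: one column slice per rank
    partials = [x @ a_i for a_i in shards]       # every rank computes its own Y_i
    if gather_output:
        return torch.cat(partials, dim=-1)       # all-gather -> the full Y = X @ A
    return partials[0]                           # otherwise each rank keeps only its shard

x, a = torch.randn(8, 32), torch.randn(32, 64)
full = column_parallel_matmul(x, a, world_size=4, gather_output=True)
print(torch.allclose(full, x @ a, atol=1e-5))    # True: identical to the dense matmul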
@@ -578,7 +578,7 @@ class Linear1D_Row(ParallelLayer):
         dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
         parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
         skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
-            which is preserved for kernel fusion, defaults to Fals
+            which is preserved for kernel fusion, defaults to False
         weight_initializer (:class:`typing.Callable`, optional):
             The initializer of weight, defaults to kaiming uniform initializer.
         bias_initializer (:class:`typing.Callable`, optional):
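Row parallelism (Linear1D_Row) is the mirror image of the column case above, which is why `parallel_input` matters: the weight is split along its input dimension, each rank multiplies its input slice by its weight slice, and the partial outputs are summed, an all-reduce in the distributed setting. A hedged single-process sketch:

import torch

def row_parallel_matmul(x: torch.Tensor, a: torch.Tensor, world_size: int) -> torch.Tensor:
    """Simulate row parallelism: split X and A along the shared inner dimension, then sum partials."""
    x_shards = x.chunk(world_size, dim=-1)       # input slice per rank
    a_shards = a.chunk(world_size, dim=0)        # row slice of the weight per rank
    partials = [x_i @ a_i for x_i, a_i in zip(x_shards, a_shards)]
    return torch.stack(partials).sum(dim=0)      # all-reduce (sum) of the partial outputs

x, a = torch.randn(8, 32), torch.randn(32, 64)
print(torch.allclose(row_parallel_matmul(x, a, world_size=4), x @ a, atol=1e-5))   # True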
@@ -994,11 +994,11 @@ class PatchEmbedding1D(ColossalaiModule):
     :type dtype: torch.dtype, optional
     :param flatten: whether to flatten output tensor, defaults to True
     :type flatten: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
+    :param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
     :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
+    :param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
     :type bias_initializer: typing.Callable, optional
-    :param position_embed_initializer: The intializer of position embedding, defaults to zero
+    :param position_embed_initializer: The initializer of position embedding, defaults to zero
     :type position_embed_initializer: typing.Callable, optional
     """
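As background for the `flatten` and initializer parameters above: a patch embedding is typically a strided convolution whose kernel equals the patch size, optionally flattened into a token sequence, plus a position embedding that defaults to zero initialization. A hedged sketch of that shape logic (not the actual ColossalaiModule implementation):

import torch
import torch.nn as nn

class PatchEmbedSketch(nn.Module):
    """Illustrative patch embedding: conv with kernel = stride = patch size."""

    def __init__(self, img_size: int, patch_size: int, in_chans: int, embed_dim: int, flatten: bool = True):
        super().__init__()
        self.flatten = flatten
        num_patches = (img_size // patch_size) ** 2
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        # Position embedding initialized to zero, matching the default described above.
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.proj(x)                          # [B, embed_dim, H/p, W/p]
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)      # [B, num_patches, embed_dim]
            x = x + self.pos_embed
        return x

tokens = PatchEmbedSketch(img_size=224, patch_size=16, in_chans=3, embed_dim=96)(torch.randn(2, 3, 224, 224))
print(tokens.shape)                               # torch.Size([2, 196, 96])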