Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-25 03:31:56 +00:00)
[doc] Fix typo under colossalai and doc (#3618)
* Fixed several spelling errors under colossalai
* Fix the spelling error in colossalai and docs directory
* Cautiously changed the spelling errors under the example folder
* Update runtime_preparation_pass.py: revert autograft to autograd
* Update search_chunk.py: utile to until
* Update check_installation.py: change misteach to mismatch in line 91
* Update 1D_tensor_parallel.md: revert to perceptron
* Update 2D_tensor_parallel.md: revert to perceptron in line 73
* Update 2p5D_tensor_parallel.md: revert to perceptron in line 71
* Update 3D_tensor_parallel.md: revert to perceptron in line 80
* Update README.md: revert to resnet in line 42
* Update reorder_graph.py: revert to indice in line 7
* Update p2p.py: revert to megatron in line 94
* Update initialize.py: revert to torchrun in line 198
* Update routers.py: change to detailed in line 63
* Update routers.py: change to detailed in line 146
* Update README.md: revert random number in line 402
@@ -13,7 +13,7 @@ from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator
 class MoeExperts(nn.Module):
-    """Basic class for experts in MoE. It stores what kind of communication expersts use
+    """Basic class for experts in MoE. It stores what kind of communication experts use
     to exchange tokens, how many experts in a single GPU and parallel information such as
     expert parallel size, data parallel size and their distributed communication groups.
     """
@@ -24,7 +24,7 @@ class MoeExperts(nn.Module):
             "This kind of communication has not been implemented yet.\n Please use Experts build function."
         self.comm_name = comm_name
         self.num_total_experts = num_experts
-        # Get the configuration of experts' deployment and parallel information from moe contex
+        # Get the configuration of experts' deployment and parallel information from moe context
         self.num_local_experts, self.dist_info = MOE_CONTEXT.get_info(num_experts)
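The corrected comment points at where `MOE_CONTEXT.get_info(num_experts)` tells the layer how many experts it holds locally. As a rough, hedged illustration of that bookkeeping (the function below and its divisibility rule are assumptions for clarity, not the actual ColossalAI implementation):

def get_local_expert_count(num_experts: int, max_ep_size: int):
    """Hypothetical sketch: split num_experts across an expert parallel group."""
    if num_experts >= max_ep_size:
        # More experts than ranks: each rank keeps an equal slice of whole experts.
        assert num_experts % max_ep_size == 0, "num_experts should be divisible by the ep size"
        return num_experts // max_ep_size, max_ep_size
    # Fewer experts than ranks: one expert per rank, smaller expert parallel group.
    return 1, num_experts

print(get_local_expert_count(num_experts=8, max_ep_size=4))   # (2, 4)
print(get_local_expert_count(num_experts=2, max_ep_size=4))   # (1, 2)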
@@ -32,7 +32,7 @@ class MoeExperts(nn.Module):
 class Experts(MoeExperts):
     """A wrapper class to create experts. It will create E experts across the
     moe model parallel group, where E is the number of experts. Every expert
-    is a instence of the class, 'expert' in initialization parameters.
+    is a instance of the class, 'expert' in initialization parameters.

     Args:
         expert_cls (:class:`torch.nn.Module`): The class of all experts
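To make the wrapper's behaviour concrete, here is a minimal sketch of an experts container that builds one `expert_cls` instance per locally held expert; the class below is simplified for illustration and is not the actual ColossalAI `Experts` implementation:

import torch
import torch.nn as nn

class SimpleExperts(nn.Module):
    """Illustrative stand-in: one expert_cls instance per locally held expert."""

    def __init__(self, expert_cls, num_local_experts: int, **expert_args):
        super().__init__()
        # Every local expert is an independent instance of the user-supplied class.
        self.experts = nn.ModuleList([expert_cls(**expert_args) for _ in range(num_local_experts)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [num_local_experts, tokens_per_expert, d_model]; expert i processes slice i.
        return torch.stack([expert(x[i]) for i, expert in enumerate(self.experts)])

# Usage: four local feed-forward experts of width 16.
experts = SimpleExperts(nn.Linear, num_local_experts=4, in_features=16, out_features=16)
print(experts(torch.randn(4, 8, 16)).shape)   # torch.Size([4, 8, 16])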
@@ -146,15 +146,15 @@ class FFNExperts(MoeExperts):

 class TPExperts(MoeExperts):
     """Use tensor parallelism to split each expert evenly, which can deploy experts in
-    case that the number of experts can't be divied by maximum expert parallel size or
-    maximum expert parallel size can't be divied by the number of experts.
+    case that the number of experts can't be divide by maximum expert parallel size or
+    maximum expert parallel size can't be divide by the number of experts.
     """

     def __init__(self, num_experts: int, d_model: int, d_ff: int, activation=None, drop_rate: float = 0):
         super().__init__("all_gather", MOE_CONTEXT.max_ep_size)

         assert d_ff % MOE_CONTEXT.max_ep_size == 0, \
-            "d_ff should be divied by maximum expert parallel size"
+            "d_ff should be divide by maximum expert parallel size"

         p_ff = d_ff // MOE_CONTEXT.max_ep_size
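The `p_ff = d_ff // MOE_CONTEXT.max_ep_size` line carries the whole idea of `TPExperts`: each rank owns an equal slice of the hidden dimension instead of whole experts. A hedged, self-contained sketch of one such slice (plain `ep_size` argument instead of the MoE context; simplified, not the real layer):

import torch
import torch.nn as nn

class TPExpertSlice(nn.Module):
    """Illustrative slice of one FFN expert: this rank owns d_ff // ep_size hidden units."""

    def __init__(self, d_model: int, d_ff: int, ep_size: int):
        super().__init__()
        assert d_ff % ep_size == 0, "d_ff should be divisible by the expert parallel size"
        p_ff = d_ff // ep_size                 # this rank's share of the hidden dimension
        self.w1 = nn.Linear(d_model, p_ff)     # column-split first projection
        self.w2 = nn.Linear(p_ff, d_model)     # row-split second projection
        self.act = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Each rank produces a partial result; in the real layer the partials are
        # combined across the expert parallel group (e.g. by all-gather or all-reduce).
        return self.w2(self.act(self.w1(x)))

slice_ffn = TPExpertSlice(d_model=64, d_ff=256, ep_size=4)   # p_ff = 64 on this rank
print(slice_ffn(torch.randn(8, 64)).shape)                   # torch.Size([8, 64])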
@@ -25,7 +25,7 @@ from colossalai.zero.legacy.init_ctx import no_shard_zero_context, no_shard_zero
 class MoeLayer(nn.Module):
     """A MoE layer, that puts its input tensor to its gate and uses the output logits
     to router all tokens, is mainly used to exchange all tokens for every expert across
-    the moe tensor group by all to all comunication. Then it will get the output of all
+    the moe tensor group by all to all communication. Then it will get the output of all
     experts and exchange the output. At last returns the output of the moe system.

     Args:
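The dispatch / combine flow the docstring describes is usually two symmetric all-to-all exchanges wrapped around the local experts. Below is a hedged sketch of that pattern with `torch.distributed.all_to_all_single`; it assumes an already initialized expert parallel process group and evenly sized buffers, and the names and shapes are illustrative rather than the ColossalAI implementation:

import torch
import torch.distributed as dist

def moe_dispatch_combine(expert_inputs: torch.Tensor, experts, ep_group=None) -> torch.Tensor:
    """expert_inputs: [num_experts, capacity, d_model], already grouped by target expert."""
    dispatched = torch.empty_like(expert_inputs)
    # 1) First all-to-all: send tokens destined for remote experts, receive tokens
    #    destined for the experts held on this rank.
    dist.all_to_all_single(dispatched, expert_inputs, group=ep_group)

    # 2) Run the local experts on the tokens they received.
    expert_outputs = experts(dispatched)

    # 3) Second all-to-all: return the expert outputs to the ranks that own the tokens.
    combined = torch.empty_like(expert_outputs)
    dist.all_to_all_single(combined, expert_outputs, group=ep_group)
    return combined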
@@ -122,7 +122,7 @@ class MoeModule(nn.Module):
         drop_tks (bool, optional): Whether drops tokens in evaluation
         use_residual (bool, optional): Makes this MoE layer a Residual MoE.
             More information can be found in `Microsoft paper`_.
-        residual_instance (nn.Module, optional): The instance of residual module in Resiual MoE
+        residual_instance (nn.Module, optional): The instance of residual module in Residual MoE
         expert_instance (MoeExperts, optional): The instance of experts module in MoeLayer
         expert_cls (Type[nn.Module], optional): The class of each expert when no instance is given
         expert_args (optional): The args of expert when no instance is given
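On the residual option: a common formulation runs the dense `residual_instance` in parallel with the MoE branch and mixes the two outputs with a learned per-token coefficient. The mixing rule below is a hedged sketch in that spirit, not the exact ColossalAI `MoeModule`:

import torch
import torch.nn as nn

class ResidualMoESketch(nn.Module):
    """Illustrative Residual MoE: dense branch plus MoE branch, mixed per token."""

    def __init__(self, moe_layer: nn.Module, residual_instance: nn.Module, d_model: int):
        super().__init__()
        self.moe = moe_layer
        self.residual = residual_instance
        self.coef = nn.Linear(d_model, 2)   # learned 2-way mixing coefficient

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        weight = torch.softmax(self.coef(x), dim=-1)           # [..., 2]
        moe_out, dense_out = self.moe(x), self.residual(x)
        return moe_out * weight[..., 0:1] + dense_out * weight[..., 1:2]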
@@ -60,7 +60,7 @@ class MoeRouter(nn.Module, ABC):

 class Top1Router(MoeRouter):
     """Top1 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
-    for routing usage. More deailted function can be found in the paper about Switch Transformer
+    for routing usage. More detailed function can be found in the paper about Switch Transformer
     of Google.
     Args:
         capacity_factor_train (float, optional): Capacity factor in routing of training.
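To ground the [s, e, c] notation: s is the number of tokens, e the number of experts, and c the per-expert capacity, typically on the order of `capacity_factor * s / e`. A hedged single-process sketch of top-1 routing in that spirit (simplified; the real router also handles noise, the load-balancing loss and token dropping policies):

import torch
import torch.nn.functional as F

def top1_route(logits: torch.Tensor, capacity_factor: float = 1.25):
    """logits: [s, e]. Returns a dispatch mask [s, e, c] and combine weights [s, e, c]."""
    s, e = logits.shape
    c = int(capacity_factor * s / e) + 1                      # per-expert capacity
    probs = F.softmax(logits, dim=-1)                         # routing probabilities
    top1 = probs.argmax(dim=-1)                               # chosen expert per token
    expert_mask = F.one_hot(top1, num_classes=e)              # [s, e]

    # Position of each token inside its chosen expert's buffer (0, 1, 2, ...).
    position = (torch.cumsum(expert_mask, dim=0) - 1) * expert_mask
    kept = expert_mask * (position < c)                       # drop tokens over capacity
    slot = position.sum(dim=-1).clamp(max=c - 1)              # capped slot index per token
    position_mask = F.one_hot(slot, num_classes=c)            # [s, c]

    dispatch = kept.unsqueeze(-1) * position_mask.unsqueeze(1)             # [s, e, c]
    combine = dispatch * probs.gather(1, top1[:, None]).unsqueeze(-1)      # [s, e, c]
    return dispatch.bool(), combine

dispatch, combine = top1_route(torch.randn(16, 4))
print(dispatch.shape, combine.shape)   # both [s, e, c] = [16, 4, 6]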
@@ -143,7 +143,7 @@ class Top1Router(MoeRouter):

 class Top2Router(MoeRouter):
     """Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
-    for routing usage. More deailted function can be found in the paper about ViT-MoE.
+    for routing usage. More detailed function can be found in the paper about ViT-MoE.
     Args:
         capacity_factor_train (float, optional): Capacity factor in routing of training.
         capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
@@ -12,7 +12,7 @@ class ForceFP32Parameter(torch.nn.Parameter):


 class NormalNoiseGenerator:
-    """Generates a random noisy mask for logtis tensor.
+    """Generates a random noisy mask for logits tensor.

     All noise is generated from a normal distribution :math:`(0, 1 / E^2)`, where
     `E = the number of experts`.
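Put differently, the additive jitter shrinks as the expert count grows: the standard deviation is 1 / E. A minimal hedged sketch of that behaviour (the function name and the additive application are assumptions for illustration):

import torch

def add_normal_noise(logits: torch.Tensor, num_experts: int) -> torch.Tensor:
    """Add N(0, 1/E^2) noise to routing logits, i.e. standard deviation 1 / num_experts."""
    return logits + torch.randn_like(logits) / num_experts

noisy = add_normal_noise(torch.randn(16, 4), num_experts=4)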
@@ -32,7 +32,7 @@ class NormalNoiseGenerator:


 class UniformNoiseGenerator:
-    """Generates a random noisy mask for logtis tensor.
+    """Generates a random noisy mask for logits tensor.
     copied from mesh tensorflow:
     Multiply values by a random number between :math:`1-epsilon` and :math:`1+epsilon`.
     Makes models more resilient to rounding errors introduced by bfloat16.
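The uniform variant is multiplicative rather than additive: every logit is scaled by a factor close to 1. A short hedged sketch of the mesh-tensorflow-style jitter described above (the default epsilon value is an assumption):

import torch

def multiply_uniform_noise(logits: torch.Tensor, epsilon: float = 1e-2) -> torch.Tensor:
    """Scale each logit by a factor drawn uniformly from [1 - epsilon, 1 + epsilon]."""
    scale = torch.empty_like(logits).uniform_(1.0 - epsilon, 1.0 + epsilon)
    return logits * scale

noisy = multiply_uniform_noise(torch.randn(16, 4))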
@@ -439,7 +439,7 @@ class Linear1D_Col(ParallelLayer):
             to all GPUs, otherwise, every GPU will have its output
             which is :math:`Y_i = XA_i`, defaults to False
         skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
-            which is preserved for kernel fusion, defaults to Fals
+            which is preserved for kernel fusion, defaults to False
         weight_initializer (:class:`typing.Callable`, optional):
             The initializer of weight, defaults to kaiming uniform initializer.
         bias_initializer (:class:`typing.Callable`, optional):
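For context on `gather_output` and :math:`Y_i = XA_i`: in column parallelism each rank holds a column slice `A_i` of the weight, computes its own `Y_i`, and either keeps that shard or all-gathers the shards into the full output. A hedged single-process sketch of the arithmetic (no process group, just the slicing):

import torch

def column_parallel_matmul(x: torch.Tensor, a: torch.Tensor, world_size: int, gather_output: bool):
    """Simulate Y_i = X @ A_i on each 'rank' by slicing A column-wise."""
    shards = a.chunk(world_size, dim=1)          # A_i: one column slice per rank
    partials = [x @ a_i for a_i in shards]       # every rank computes its own Y_i
    if gather_output:
        return torch.cat(partials, dim=-1)       # all-gather -> the full Y = X @ A
    return partials[0]                           # otherwise each rank keeps only its shard

x, a = torch.randn(8, 32), torch.randn(32, 64)
full = column_parallel_matmul(x, a, world_size=4, gather_output=True)
print(torch.allclose(full, x @ a, atol=1e-5))    # True: identical to the dense matmul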
@@ -578,7 +578,7 @@ class Linear1D_Row(ParallelLayer):
         dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
         parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
         skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
-            which is preserved for kernel fusion, defaults to Fals
+            which is preserved for kernel fusion, defaults to False
         weight_initializer (:class:`typing.Callable`, optional):
             The initializer of weight, defaults to kaiming uniform initializer.
         bias_initializer (:class:`typing.Callable`, optional):
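Row parallelism (Linear1D_Row) is the mirror image of the column case above, which is why `parallel_input` matters: the weight is split along its input dimension, each rank multiplies its input slice by its weight slice, and the partial outputs are summed, an all-reduce in the distributed setting. A hedged single-process sketch:

import torch

def row_parallel_matmul(x: torch.Tensor, a: torch.Tensor, world_size: int) -> torch.Tensor:
    """Simulate row parallelism: split X and A along the shared inner dimension, then sum partials."""
    x_shards = x.chunk(world_size, dim=-1)       # input slice per rank
    a_shards = a.chunk(world_size, dim=0)        # row slice of the weight per rank
    partials = [x_i @ a_i for x_i, a_i in zip(x_shards, a_shards)]
    return torch.stack(partials).sum(dim=0)      # all-reduce (sum) of the partial outputs

x, a = torch.randn(8, 32), torch.randn(32, 64)
print(torch.allclose(row_parallel_matmul(x, a, world_size=4), x @ a, atol=1e-5))   # True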
@@ -994,11 +994,11 @@ class PatchEmbedding1D(ColossalaiModule):
     :type dtype: torch.dtype, optional
     :param flatten: whether to flatten output tensor, defaults to True
     :type flatten: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
+    :param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
     :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
+    :param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
     :type bias_initializer: typing.Callable, optional
-    :param position_embed_initializer: The intializer of position embedding, defaults to zero
+    :param position_embed_initializer: The initializer of position embedding, defaults to zero
     :type position_embed_initializer: typing.Callable, optional
     """
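As background for the `flatten` and initializer parameters above: a patch embedding is typically a strided convolution whose kernel equals the patch size, optionally flattened into a token sequence, plus a position embedding that defaults to zero initialization. A hedged sketch of that shape logic (not the actual ColossalaiModule implementation):

import torch
import torch.nn as nn

class PatchEmbedSketch(nn.Module):
    """Illustrative patch embedding: conv with kernel = stride = patch size."""

    def __init__(self, img_size: int, patch_size: int, in_chans: int, embed_dim: int, flatten: bool = True):
        super().__init__()
        self.flatten = flatten
        num_patches = (img_size // patch_size) ** 2
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        # Position embedding initialized to zero, matching the default described above.
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.proj(x)                          # [B, embed_dim, H/p, W/p]
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)      # [B, num_patches, embed_dim]
            x = x + self.pos_embed
        return x

tokens = PatchEmbedSketch(img_size=224, patch_size=16, in_chans=3, embed_dim=96)(torch.randn(2, 3, 224, 224))
print(tokens.shape)                               # torch.Size([2, 196, 96])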