diff --git a/colossalai/nn/layer/colossalai_layer/dropout.py b/colossalai/nn/layer/colossalai_layer/dropout.py index ff86e0745..f1dc297a1 100644 --- a/colossalai/nn/layer/colossalai_layer/dropout.py +++ b/colossalai/nn/layer/colossalai_layer/dropout.py @@ -9,6 +9,14 @@ from ..utils import get_tensor_parallel_mode class Dropout(nn.Module): + """ + Dropout layer of colossalai + + :param p: dropout rate, defaults to 0.5 + :type p: float, optional + :param inplace: If set to ``True``, will do this operation in-place, defaults to ``False`` + :type inplace: bool, optional + """ def __init__(self, p: float = 0.5, inplace: bool = False) -> None: super().__init__() self.tensor_parallel = get_tensor_parallel_mode() diff --git a/colossalai/nn/layer/colossalai_layer/embedding.py b/colossalai/nn/layer/colossalai_layer/embedding.py index 6a580a29d..cd93ddbd9 100644 --- a/colossalai/nn/layer/colossalai_layer/embedding.py +++ b/colossalai/nn/layer/colossalai_layer/embedding.py @@ -24,6 +24,20 @@ _parallel_patchembedding = { class Embedding(nn.Module): + """ + Embedding for colossalai + + :param num_embeddings: number of embeddings + :type num_embeddings: int + :param embedding_dim: dimension of embedding + :type embedding_dim: int + :param padding_idx: index of padding, defaults to None + :type padding_idx: int, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The initializer of weight, defaults to normal initializer + :type weight_initializer: typing.Callable, optional + """ def __init__(self, num_embeddings: int, embedding_dim: int, @@ -63,6 +77,28 @@ class Embedding(nn.Module): class PatchEmbedding(nn.Module): + """ + 2D Image to Patch Embedding + + :param img_size: image size + :type img_size: int + :param patch_size: patch size + :type patch_size: int + :param in_chans: number of channels of input image + :type in_chans: int + :param embed_size: size of embedding + :type embed_size: int + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param flatten: whether to flatten output tensor, defaults to True + :type flatten: bool, optional + :param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The initializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + :param position_embed_initializer: The initializer of position embedding, defaults to zero + :type position_embed_initializer: typing.Callable, optional + """ def __init__(self, img_size: int, patch_size: int, diff --git a/colossalai/nn/layer/colossalai_layer/linear.py b/colossalai/nn/layer/colossalai_layer/linear.py index 7c78941a2..606daff9c 100644 --- a/colossalai/nn/layer/colossalai_layer/linear.py +++ b/colossalai/nn/layer/colossalai_layer/linear.py @@ -25,6 +25,22 @@ _parallel_classifier = { class Linear(nn.Module): + """ + Linear layer of colossalai + + :param in_features: size of each input sample + :type in_features: int + :param out_features: size of each output sample + :type out_features: int + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True + :type bias: bool, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, 
optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + """ def __init__(self, in_features: int, out_features: int, @@ -64,6 +80,22 @@ class Linear(nn.Module): class Classifier(nn.Module): + """ + Classifier layer of colossalai + + :param in_features: size of each input sample + :type in_features: int + :param num_classes: number of total classes for the dataset + :type num_classes: int + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True + :type bias: bool, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + """ def __init__( self, in_features: int, diff --git a/colossalai/nn/layer/colossalai_layer/normalization.py b/colossalai/nn/layer/colossalai_layer/normalization.py index f1dab93f9..b29e1fbab 100644 --- a/colossalai/nn/layer/colossalai_layer/normalization.py +++ b/colossalai/nn/layer/colossalai_layer/normalization.py @@ -15,6 +15,19 @@ _parallel_layernorm = {'2d': LayerNorm2D, '2.5d': LayerNorm2p5D, '3d': LayerNorm class LayerNorm(nn.Module): + r""" + Layer Normalization for colossalai + + :param normalized_shape: input shape from an expected input + of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. + :type normalized_shape: int + :param eps: a value added to the denominator for numerical stability, defaults to 1e-05 + :type eps: float, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + """ def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None: super().__init__() tensor_parallel = get_tensor_parallel_mode() diff --git a/colossalai/nn/layer/parallel_1d/_operation.py b/colossalai/nn/layer/parallel_1d/_operation.py index aee28926a..d6b851e92 100644 --- a/colossalai/nn/layer/parallel_1d/_operation.py +++ b/colossalai/nn/layer/parallel_1d/_operation.py @@ -7,6 +7,18 @@ except: class FusedLayerNormAffineFunction1D(torch.autograd.Function): + r""" + Layernorm + + :param input: input maxtrix + :param weight: weight matrix + :param bias: bias matrix + :param normalized_shape: input shape from an expected input + of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. 
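For reference, the layer-normalization semantics these docstrings describe (normalize over the trailing ``normalized_shape`` dimension, with ``eps`` added to the denominator for numerical stability) can be checked against plain PyTorch. This is an illustrative single-process sketch, not the parallel or fused kernels themselves:

import torch
import torch.nn.functional as F

# Toy activations with a trailing hidden dimension of 8.
x = torch.randn(4, 8)
weight = torch.ones(8)    # affine scale
bias = torch.zeros(8)     # affine shift
eps = 1e-05

# Manual layer norm: subtract the mean, divide by sqrt(var + eps), apply the affine parameters.
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)
manual = (x - mean) / torch.sqrt(var + eps) * weight + bias

# Matches the built-in functional layer norm.
reference = F.layer_norm(x, (8,), weight, bias, eps)
assert torch.allclose(manual, reference, atol=1e-6)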
+ :param eps: a value added to the denominator for numerical stability + """ @staticmethod def forward(ctx, input, weight, bias, normalized_shape, eps): diff --git a/colossalai/nn/layer/parallel_1d/_utils.py b/colossalai/nn/layer/parallel_1d/_utils.py index db589afe5..602bd6c3f 100644 --- a/colossalai/nn/layer/parallel_1d/_utils.py +++ b/colossalai/nn/layer/parallel_1d/_utils.py @@ -76,7 +76,12 @@ def _gather(input_, parallel_mode, dim=-1): class _ReduceGrad(torch.autograd.Function): - """Pass the input to the model parallel region.""" + """ + Pass the input to the model parallel region. + + :param input_: input matrix + :param parallel_mode: parallel mode + """ @staticmethod def symbolic(graph, input_): return input_ @@ -92,7 +97,12 @@ class _ReduceGrad(torch.autograd.Function): class _ReduceInput(torch.autograd.Function): - """All-reduce the input from the model parallel region.""" + """ + All-reduce the input from the model parallel region. + + :param input_: input matrix + :param parallel_mode: parallel mode + """ @staticmethod def symbolic(graph, input_): return _reduce(input_) @@ -107,7 +117,13 @@ class _ReduceInput(torch.autograd.Function): class _SplitForwardGatherBackward(torch.autograd.Function): - """Split the input and keep only the corresponding chuck to the rank.""" + """ + Split the input and keep only the corresponding chunk to the rank. + + :param input_: input matrix + :param parallel_mode: parallel mode + :param dim: dimension + """ @staticmethod def symbolic(graph, input_): return _split(input_) @@ -124,7 +140,13 @@ class _SplitForwardGatherBackward(torch.autograd.Function): class _GatherForwardSplitBackward(torch.autograd.Function): - """Gather the input from model parallel region and concatinate.""" + """ + Gather the input from model parallel region and concatenate. 
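A single-process sketch of the split/gather pattern that ``_SplitForwardGatherBackward`` and ``_GatherForwardSplitBackward`` describe; the real implementations use ``torch.distributed`` collectives over the 1D tensor-parallel group, and the rank and world size below are placeholders:

import torch

world_size, rank, dim = 4, 1, -1    # placeholder values for illustration
x = torch.arange(32, dtype=torch.float32).reshape(2, 16)

# Split: keep only the chunk that belongs to this rank along `dim`.
chunks = torch.chunk(x, world_size, dim=dim)
local = chunks[rank].contiguous()   # shape (2, 4)

# Gather: concatenate the chunks from all ranks back along `dim`.
gathered = torch.cat(chunks, dim=dim)
assert torch.equal(gathered, x)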
+ + :param input_: input matrix + :param parallel_mode: parallel mode + :param dim: dimension + """ @staticmethod def symbolic(graph, input_): return _gather(input_) diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/nn/layer/parallel_1d/layers.py index 832d7d9df..9e3fe9bdd 100644 --- a/colossalai/nn/layer/parallel_1d/layers.py +++ b/colossalai/nn/layer/parallel_1d/layers.py @@ -26,6 +26,24 @@ from ._utils import (gather_forward_split_backward, get_parallel_input, reduce_g @LAYERS.register_module class Linear1D(torch.nn.Module): + """ + Linear layer for 1D parallelism + + :param in_features: size of each input sample + :type in_features: int + :param out_features: size of each output sample + :type out_features: int + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True + :type bias: bool, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False + :type skip_bias_add: bool, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + """ def __init__(self, in_features: int, out_features: int, @@ -70,8 +88,24 @@ class Linear1D(torch.nn.Module): @LAYERS.register_module class Classifier1D(ParallelLayer): - """RowLinear with given weight""" - + """RowLinear with given weight + Classifier of 1D parallelism + + :param in_features: size of input features + :type in_features: int + :param num_classes: number of classes in the dataset + :type num_classes: int + :param weight: weight of the classifier, defaults to True + :type weight: torch.nn.Parameter, optional + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` + :type bias: bool, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + """ def __init__(self, in_features: int, num_classes: int, @@ -144,7 +178,7 @@ class Linear1D_Col(ParallelLayer): :type in_features: int :param output_size: second dimension of matrix A. 
:type output_size: int - :param bias: If true, add bias, defaults to True + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` :type bias: bool, optional :param dtype: The dtype of parameters, defaults to None :type dtype: torch.dtype, optional @@ -228,7 +262,7 @@ class Linear1D_Row(ParallelLayer): :type in_features: int :param out_features: size of each output sample :type out_features: int - :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` :type bias: bool, optional :param dtype: The dtype of parameters, defaults to None :type dtype: torch.dtype, optional @@ -303,7 +337,16 @@ class Linear1D_Row(ParallelLayer): @LAYERS.register_module class MixedFusedLayerNorm1D(torch.nn.Module): - """ Experimental + r""" + Layer Normalization for 1D parallelism + + :param normalized_shape: input shape from an expected input + of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. + :type normalized_shape: int + :param eps: a value added to the denominator for numerical stability, defaults to 1e-05 + :type eps: float, optional """ def __init__(self, normalized_shape, eps=1e-5): @@ -327,6 +370,20 @@ class MixedFusedLayerNorm1D(torch.nn.Module): @LAYERS.register_module class Embedding1D(ParallelLayer): + """ + Embedding for 1D parallelism + + :param num_embeddings: number of embeddings + :type num_embeddings: int + :param embedding_dim: dimension of embedding + :type embedding_dim: int + :param padding_idx: index of padding, defaults to None + :type padding_idx: int, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to normal initializer + :type weight_initializer: typing.Callable, optional + """ def __init__(self, num_embeddings: int, embedding_dim: int, @@ -377,6 +434,14 @@ class Embedding1D(ParallelLayer): @LAYERS.register_module class Dropout1D(ParallelLayer): + """ + Dropout layer of 1D parallelism + + :param p: dropout rate, defaults to 0.5 + :type p: float, optional + :param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False`` + :type inplace: bool, optional + """ def __init__(self, p: float = 0.5, inplace: bool = False): super().__init__() self.parallel_input = get_parallel_input() diff --git a/colossalai/nn/layer/parallel_2d/_operation.py b/colossalai/nn/layer/parallel_2d/_operation.py index ef899a5ec..a8f3aa565 100644 --- a/colossalai/nn/layer/parallel_2d/_operation.py +++ b/colossalai/nn/layer/parallel_2d/_operation.py @@ -20,7 +20,8 @@ def matmul_2d( row_parallel_mode=ParallelMode.PARALLEL_2D_ROW, col_parallel_mode=ParallelMode.PARALLEL_2D_COL, ): - """Matrix multiplication for 2D parallelism + """ + Matrix multiplication for 2D parallelism :param a: matrix :math:`A` :type a: torch.tensor :param b: matrix :math:`B` @@ -56,7 +57,35 @@ def matmul_2d( class classifier_2d(torch.autograd.Function): - """Matrix multiplication for :math:`C = AB` + """ + Classifier + + :param a: matrix :math:`A` + :type a: torch.tensor + :param b: matrix :math:`B` + :type b: torch.tensor + :param bias: matrix of bias + :type bias: 
torch.tensor, optional + :param summa_dim: dimension of SUMMA fo 2D parallelism + :type summa_dim: int + :param out_shape: shape of output tensor + :type out_shape: tuple + :param row_rank: the rank of row + :type row_rank: int + :param col_rank: the rank of column + :type col_rank: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param data_parallel_rank: data parallel rank + :type data_parallel_rank: int + :param pipeline_parallel_rank: pipeline parallel rank + :type pipeline_parallel_rank: int + :param pipeline_parallel_size: pipeline parallel size + :type pipeline_parallel_size: int + :param tensor_parallel_size: tensor parallel size + :type tensor_parallel_size: int """ @staticmethod @custom_fwd(cast_inputs=torch.float16) @@ -130,7 +159,33 @@ class classifier_2d(torch.autograd.Function): class Matmul_AB_2D(torch.autograd.Function): - """Matrix multiplication for :math:`C = AB` + """ + Matrix multiplication for :math:`C = AB` + + :param a: matrix :math:`A` + :type a: torch.tensor + :param b: matrix :math:`B` + :type b: torch.tensor + :param summa_dim: dimension of SUMMA fo 2D parallelism + :type summa_dim: int + :param out_shape: shape of output tensor + :type out_shape: tuple + :param row_rank: the rank of row + :type row_rank: int + :param col_rank: the rank of column + :type col_rank: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param data_parallel_rank: data parallel rank + :type data_parallel_rank: int + :param pipeline_parallel_rank: pipeline parallel rank + :type pipeline_parallel_rank: int + :param pipeline_parallel_size: pipeline parallel size + :type pipeline_parallel_size: int + :param tensor_parallel_size: tensor parallel size + :type tensor_parallel_size: int """ @staticmethod @custom_fwd(cast_inputs=torch.float16) @@ -238,7 +293,33 @@ class Matmul_AB_2D(torch.autograd.Function): class Matmul_ABT_2D(torch.autograd.Function): - """Matrix multiplication for :math:`C = AB^T` + """ + Matrix multiplication for :math:`C = AB^T` + + :param a: matrix :math:`A` + :type a: torch.tensor + :param b: matrix :math:`B` + :type b: torch.tensor + :param summa_dim: dimension of SUMMA fo 2D parallelism + :type summa_dim: int + :param out_shape: shape of output tensor + :type out_shape: tuple + :param row_rank: the rank of row + :type row_rank: int + :param col_rank: the rank of column + :type col_rank: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param data_parallel_rank: data parallel rank + :type data_parallel_rank: int + :param pipeline_parallel_rank: pipeline parallel rank + :type pipeline_parallel_rank: int + :param pipeline_parallel_size: pipeline parallel size + :type pipeline_parallel_size: int + :param tensor_parallel_size: tensor parallel size + :type tensor_parallel_size: int """ @staticmethod @custom_fwd(cast_inputs=torch.float16) @@ -352,7 +433,33 @@ class Matmul_ABT_2D(torch.autograd.Function): class Matmul_ATB_2D(torch.autograd.Function): - """Matrix multiplication for :math:`C = 
A^TB` + """ + Matrix multiplication for :math:`C = A^TB` + + :param a: matrix :math:`A` + :type a: torch.tensor + :param b: matrix :math:`B` + :type b: torch.tensor + :param summa_dim: dimension of SUMMA fo 2D parallelism + :type summa_dim: int + :param out_shape: shape of output tensor + :type out_shape: tuple + :param row_rank: the rank of row + :type row_rank: int + :param col_rank: the rank of column + :type col_rank: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param data_parallel_rank: data parallel rank + :type data_parallel_rank: int + :param pipeline_parallel_rank: pipeline parallel rank + :type pipeline_parallel_rank: int + :param pipeline_parallel_size: pipeline parallel size + :type pipeline_parallel_size: int + :param tensor_parallel_size: tensor parallel size + :type tensor_parallel_size: int """ @staticmethod @custom_fwd(cast_inputs=torch.float16) @@ -466,7 +573,33 @@ class Matmul_ATB_2D(torch.autograd.Function): class add_bias_2d(torch.autograd.Function): - """Matrix add bias: :math:`C = A + b` + """ + Matrix add bias: :math:`C = A + b` + + :param input_: matrix :math:`A` + :type input_: torch.tensor + :param bias: matrix :math:`b` + :type bias: torch.tensor + :param output_size_per_partition: size of ouput per partition + :type output_size_per_partition: int + :param row_rank: the rank of row + :type row_rank: int + :param col_rank: the rank of column + :type col_rank: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion + :type skip_bias_add: bool + :param data_parallel_rank: data parallel rank + :type data_parallel_rank: int + :param pipeline_parallel_rank: pipeline parallel rank + :type pipeline_parallel_rank: int + :param pipeline_parallel_size: pipeline parallel size + :type pipeline_parallel_size: int + :param tensor_parallel_size: tensor parallel size + :type tensor_parallel_size: int """ @staticmethod @custom_fwd(cast_inputs=torch.float16) @@ -519,9 +652,30 @@ class add_bias_2d(torch.autograd.Function): class layernorm_2d(torch.autograd.Function): + """ + Layernorm + + :param input_: input maxtrix + :type input_: torch.tensor + :param E_x: mean + :type E_x: torch.tensor + :param Var_x: variance + :type Var_x: torch.tensor + :param hidden_size: hidden size + :type hidden_size: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + """ @staticmethod @custom_fwd(cast_inputs=torch.float32) - def forward(ctx: Any, input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode, + def forward(ctx: Any, + input_: Tensor, + E_x: Tensor, + Var_x: Tensor, + hidden_size: int, + row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode) -> Tensor: input_ = input_ - E_x # in here, input = x - E[x], Var_x = 1 / sqrt(Var[x] + eps) @@ -556,6 +710,18 @@ class layernorm_2d(torch.autograd.Function): class all_gather_weight_2d(torch.autograd.Function): + 
""" + all gather the weight of 2D parallelism + + :param inputs: input maxtrix + :type inputs: torch.tensor + :param dim: dimension of all gather + :type dim: int + :param summa_dim: dimension of SUMMA fo 2D parallelism + :type summa_dim: int + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + """ @staticmethod @custom_fwd(cast_inputs=torch.float16) def forward(ctx: Any, inputs: Tensor, dim: int, summa_dim: int, col_parallel_mode: ParallelMode) -> Tensor: @@ -574,6 +740,14 @@ class all_gather_weight_2d(torch.autograd.Function): class SplitFirst(torch.autograd.Function): + """ + :param inputs: input maxtrix + :type inputs: torch.tensor + :param summa_dim: dimension of SUMMA fo 2D parallelism + :type summa_dim: int + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + """ @staticmethod @custom_fwd(cast_inputs=torch.float16) def forward(ctx: Any, inputs: Tensor, summa_dim: int, col_parallel_mode: ParallelMode) -> Tensor: @@ -604,7 +778,14 @@ def split_tensor_2d(input_: Tensor, dim: int = 0) -> Tensor: class reduce_by_batch_2d(torch.autograd.Function): - """All-reduce the input from the model parallel region.""" + """ + All-reduce the input from the model parallel region. + + :param input_: input maxtrix + :type input_: torch.tensor + :param reduce_mean: If set to ``True``, it will divide the output by column parallel size, default to False + :type reduce_mean: int, optional + """ @staticmethod def symbolic(graph, input_, reduce_mean: bool = False): output = all_reduce(input_, ParallelMode.PARALLEL_2D_COL) diff --git a/colossalai/nn/layer/parallel_2d/layers.py b/colossalai/nn/layer/parallel_2d/layers.py index d113ec94c..f2c8af63b 100644 --- a/colossalai/nn/layer/parallel_2d/layers.py +++ b/colossalai/nn/layer/parallel_2d/layers.py @@ -21,7 +21,8 @@ from ._utils import assert_summa_initialization, get_summa_dim_from_env @LAYERS.register_module class Linear2D(ParallelLayer): - """ Linear layer for 2D parallelism + """ + Linear layer for 2D parallelism :param in_features: size of each input sample :type in_features: int @@ -33,6 +34,10 @@ class Linear2D(ParallelLayer): :type dtype: torch.dtype, optional :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False :type skip_bias_add: bool, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional """ def __init__(self, in_features: int, @@ -113,7 +118,8 @@ class Linear2D(ParallelLayer): @LAYERS.register_module class LayerNorm2D(ParallelLayer): - r"""Layer Normalization for 2D parallelism + r""" + Layer Normalization for 2D parallelism :param normalized_shape: input shape from an expected input of size. 
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` @@ -184,18 +190,27 @@ class LayerNorm2D(ParallelLayer): @LAYERS.register_module class PatchEmbedding2D(ParallelLayer): - """ 2D Image to Patch Embedding + """ + 2D Image to Patch Embedding - :param img_size: iamge size + :param img_size: image size :type img_size: int :param patch_size: patch size :type patch_size: int - :param embed_dim: dimension of embedding - :type embed_dim: int - :param in_chans: number of channels of input image, defaults to 3 - :type in_chans: int, optional + :param in_chans: number of channels of input image + :type in_chans: int + :param embed_size: size of embedding + :type embed_size: int + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional :param flatten: whether to flatten output tensor, defaults to True :type flatten: bool, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + :param position_embed_initializer: The intializer of position embedding, defaults to zero + :type position_embed_initializer: typing.Callable, optional """ def __init__(self, img_size: int, @@ -275,6 +290,20 @@ class PatchEmbedding2D(ParallelLayer): @LAYERS.register_module class Embedding2D(ParallelLayer): + """ + Embedding for 2D parallelism + + :param num_embeddings: number of embeddings + :type num_embeddings: int + :param embedding_dim: dimension of embedding + :type embedding_dim: int + :param padding_idx: index of padding, defaults to None + :type padding_idx: int, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to normal initializer + :type weight_initializer: typing.Callable, optional + """ def __init__(self, num_embeddings: int, embedding_dim: int, @@ -325,6 +354,24 @@ class Embedding2D(ParallelLayer): @LAYERS.register_module class Classifier2D(ParallelLayer): + """ + Classifier for 2D parallelism + + :param in_features: size of each input sample + :type in_features: int + :param num_classes: number of classes + :type num_classes: int + :param weight: weight of the classifier, defaults to True + :type weight: torch.nn.Parameter, optional + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` + :type bias: bool, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + """ def __init__(self, in_features: int, num_classes: int, diff --git a/colossalai/nn/layer/parallel_2p5d/_operation.py b/colossalai/nn/layer/parallel_2p5d/_operation.py index a1dbcd3cd..f583a0f02 100644 --- a/colossalai/nn/layer/parallel_2p5d/_operation.py +++ b/colossalai/nn/layer/parallel_2p5d/_operation.py @@ -28,7 +28,35 @@ def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor: class classifier_2p5d(torch.autograd.Function): - """Matrix multiplication for :math:`C = AB` + """ + Classifier + + 
:param a: matrix :math:`A` + :type a: torch.tensor + :param b: matrix :math:`B` + :type b: torch.tensor + :param bias: matrix of bias + :type bias: torch.tensor, optional + :param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism + :type tesseract_dim: int + :param out_shape: shape of output tensor + :type out_shape: tuple + :param row_rank: the rank of row + :type row_rank: int + :param col_rank: the rank of column + :type col_rank: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param data_parallel_rank: data parallel rank + :type data_parallel_rank: int + :param pipeline_parallel_rank: pipeline parallel rank + :type pipeline_parallel_rank: int + :param pipeline_parallel_size: pipeline parallel size + :type pipeline_parallel_size: int + :param tensor_parallel_size: tensor parallel size + :type tensor_parallel_size: int """ @staticmethod @custom_fwd(cast_inputs=torch.float16) @@ -101,7 +129,35 @@ class classifier_2p5d(torch.autograd.Function): class Matmul_AB_2p5D(torch.autograd.Function): - """Matrix multiplication for :math:`C = AB` + """ + Matrix multiplication for :math:`C = AB` + + :param a: matrix :math:`A` + :type a: torch.tensor + :param b: matrix :math:`B` + :type b: torch.tensor + :param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism + :type tesseract_dim: int + :param out_shape: shape of output tensor + :type out_shape: tuple + :param row_rank: the rank of row + :type row_rank: int + :param col_rank: the rank of column + :type col_rank: int + :param dep_rank: the rank of depth + :type dep_rank: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param data_parallel_rank: data parallel rank + :type data_parallel_rank: int + :param pipeline_parallel_rank: pipeline parallel rank + :type pipeline_parallel_rank: int + :param pipeline_parallel_size: pipeline parallel size + :type pipeline_parallel_size: int + :param tensor_parallel_size: tensor parallel size + :type tensor_parallel_size: int """ @staticmethod @custom_fwd(cast_inputs=torch.float16) @@ -202,7 +258,35 @@ class Matmul_AB_2p5D(torch.autograd.Function): class Matmul_ABT_2p5D(torch.autograd.Function): - """Matrix multiplication for :math:`C = AB^T` + """ + Matrix multiplication for :math:`C = AB^T` + + :param a: matrix :math:`A` + :type a: torch.tensor + :param b: matrix :math:`B` + :type b: torch.tensor + :param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism + :type tesseract_dim: int + :param out_shape: shape of output tensor + :type out_shape: tuple + :param row_rank: the rank of row + :type row_rank: int + :param col_rank: the rank of column + :type col_rank: int + :param dep_rank: the rank of depth + :type dep_rank: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param data_parallel_rank: data parallel rank + :type data_parallel_rank: int + :param pipeline_parallel_rank: pipeline parallel rank + :type pipeline_parallel_rank: int + :param pipeline_parallel_size: pipeline parallel size + :type 
pipeline_parallel_size: int + :param tensor_parallel_size: tensor parallel size + :type tensor_parallel_size: int """ @staticmethod @custom_fwd(cast_inputs=torch.float16) @@ -308,7 +392,35 @@ class Matmul_ABT_2p5D(torch.autograd.Function): class Matmul_ATB_2p5D(torch.autograd.Function): - """Matrix multiplication for :math:`C = A^TB` + """ + Matrix multiplication for :math:`C = A^TB` + + :param a: matrix :math:`A` + :type a: torch.tensor + :param b: matrix :math:`B` + :type b: torch.tensor + :param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism + :type tesseract_dim: int + :param out_shape: shape of output tensor + :type out_shape: tuple + :param row_rank: the rank of row + :type row_rank: int + :param col_rank: the rank of column + :type col_rank: int + :param dep_rank: the rank of depth + :type dep_rank: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param data_parallel_rank: data parallel rank + :type data_parallel_rank: int + :param pipeline_parallel_rank: pipeline parallel rank + :type pipeline_parallel_rank: int + :param pipeline_parallel_size: pipeline parallel size + :type pipeline_parallel_size: int + :param tensor_parallel_size: tensor parallel size + :type tensor_parallel_size: int """ @staticmethod @custom_fwd(cast_inputs=torch.float16) @@ -411,7 +523,35 @@ class Matmul_ATB_2p5D(torch.autograd.Function): class Add_Bias_2p5D(torch.autograd.Function): - """Matrix add bias: :math:`C = A + b` + """ + Matrix add bias: :math:`C = A + b` + + :param input: matrix :math:`A` + :type input: torch.tensor + :param bias: matrix :math:`b` + :type bias: torch.tensor + :param output_size_per_partition: output size in each partition + :type output_size_per_partition: int + :param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism + :type tesseract_dim: int + :param row_rank: the rank of row + :type row_rank: int + :param col_rank: the rank of column + :type col_rank: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion + :type skip_bias_add: bool + :param data_parallel_rank: data parallel rank + :type data_parallel_rank: int + :param pipeline_parallel_rank: pipeline parallel rank + :type pipeline_parallel_rank: int + :param pipeline_parallel_size: pipeline parallel size + :type pipeline_parallel_size: int + :param tensor_parallel_size: tensor parallel size + :type tensor_parallel_size: int """ @staticmethod @custom_fwd(cast_inputs=torch.float16) @@ -482,6 +622,20 @@ class Add_Bias_2p5D(torch.autograd.Function): class layernorm_2p5d(torch.autograd.Function): + """ + Layernorm + + :param input: input maxtrix + :type input: torch.tensor + :param E_x: mean + :type E_x: torch.tensor + :param Var_x: variance + :type Var_x: torch.tensor + :param hidden_size: hidden size + :type hidden_size: int + :param row_parallel_mode: row parallel mode + :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode + """ @staticmethod @custom_fwd(cast_inputs=torch.float32) def forward(ctx: Any, input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, @@ -518,6 +672,18 @@ class 
layernorm_2p5d(torch.autograd.Function): class all_gather_weight_2p5d(torch.autograd.Function): + """ + all gather the weight of 2.5D parallelism + + :param inputs: input maxtrix + :type inputs: torch.tensor + :param dim: dimension of all gather + :type dim: int + :param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism + :type tesseract_dim: int + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + """ @staticmethod @custom_fwd(cast_inputs=torch.float16) def forward(ctx: Any, inputs: Tensor, dim: int, tesseract_dim: int, col_parallel_mode: ParallelMode) -> Tensor: @@ -536,6 +702,14 @@ class all_gather_weight_2p5d(torch.autograd.Function): class SplitFirst(torch.autograd.Function): + """ + :param inputs: input maxtrix + :type inputs: torch.tensor + :param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism + :type tesseract_dim: int + :param col_parallel_mode: column parallel mode + :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode + """ @staticmethod @custom_fwd(cast_inputs=torch.float16) def forward(ctx: Any, inputs: Tensor, tesseract_dim: int, col_parallel_mode: ParallelMode) -> Tensor: @@ -566,7 +740,14 @@ def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor: class reduce_by_batch_2p5d(torch.autograd.Function): - """All-reduce the input from the model parallel region.""" + """ + All-reduce the input from the model parallel region. + + :param input_: input maxtrix + :type input_: torch.tensor + :param reduce_mean: If set to ``True``, it will divide the output by column parallel size, default to False + :type reduce_mean: int, optional + """ @staticmethod def symbolic(graph, input_, reduce_mean: bool = False): output = all_reduce(input_, ParallelMode.PARALLEL_2P5D_COL) diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/nn/layer/parallel_2p5d/layers.py index d7bd265bd..9f0e6679e 100644 --- a/colossalai/nn/layer/parallel_2p5d/layers.py +++ b/colossalai/nn/layer/parallel_2p5d/layers.py @@ -21,7 +21,8 @@ from ._utils import (assert_tesseract_initialization, get_tesseract_dim_dep_from @LAYERS.register_module class Linear2p5D(ParallelLayer): - """Linear layer for 2.5D parallelism + """ + Linear layer for 2.5D parallelism :param in_features: size of each input sample :type in_features: int @@ -31,6 +32,10 @@ class Linear2p5D(ParallelLayer): :type bias: bool, optional :param dtype: The dtype of parameters, defaults to None :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional """ def __init__(self, in_features: int, @@ -125,7 +130,8 @@ class Linear2p5D(ParallelLayer): @LAYERS.register_module class LayerNorm2p5D(ParallelLayer): - r"""Layer Normalization for 2.5D parallelism + r""" + Layer Normalization for 2.5D parallelism :param normalized_shape: input shape from an expected input of size. 
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` @@ -196,17 +202,27 @@ class LayerNorm2p5D(ParallelLayer): @LAYERS.register_module class PatchEmbedding2p5D(ParallelLayer): - """ 2D Image to Patch Embedding - :param img_size: iamge size + """ + 2D Image to Patch Embedding + + :param img_size: image size :type img_size: int :param patch_size: patch size :type patch_size: int - :param embed_dim: dimension of embedding - :type embed_dim: int - :param in_chans: number of channels of input image, defaults to 3 - :type in_chans: int, optional + :param in_chans: number of channels of input image + :type in_chans: int + :param embed_size: size of embedding + :type embed_size: int + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional :param flatten: whether to flatten output tensor, defaults to True :type flatten: bool, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + :param position_embed_initializer: The intializer of position embedding, defaults to zero + :type position_embed_initializer: typing.Callable, optional """ def __init__(self, img_size: int, @@ -286,6 +302,20 @@ class PatchEmbedding2p5D(ParallelLayer): @LAYERS.register_module class Embedding2p5D(ParallelLayer): + """ + Embedding for 2.5D parallelism + + :param num_embeddings: number of embeddings + :type num_embeddings: int + :param embedding_dim: dimension of embedding + :type embedding_dim: int + :param padding_idx: index of padding, defaults to None + :type padding_idx: int, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to normal initializer + :type weight_initializer: typing.Callable, optional + """ def __init__(self, num_embeddings: int, embedding_dim: int, @@ -336,6 +366,24 @@ class Embedding2p5D(ParallelLayer): @LAYERS.register_module class Classifier2p5D(ParallelLayer): + """ + Classifier for 2.5D parallelism + + :param in_features: size of each input sample + :type in_features: int + :param num_classes: number of classes + :type num_classes: int + :param weight: weight of the classifier, defaults to True + :type weight: torch.nn.Parameter, optional + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True + :type bias: bool, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + """ def __init__(self, in_features: int, num_classes: int, diff --git a/colossalai/nn/layer/parallel_3d/_operation.py b/colossalai/nn/layer/parallel_3d/_operation.py index 96ed775ec..19ceb897e 100644 --- a/colossalai/nn/layer/parallel_3d/_operation.py +++ b/colossalai/nn/layer/parallel_3d/_operation.py @@ -12,6 +12,28 @@ from torch.cuda.amp import custom_bwd, custom_fwd class linear_3d(torch.autograd.Function): + """ + Linear layer for 3D parallelism + + :param input_: matrix of input + :type 
input_: torch.tensor + :param weight: matrix of weight + :type weight: torch.tensor + :param bias: matrix of bias + :type bias: torch.tensor, optional + :param input_parallel_mode: input parallel mode + :type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param weight_parallel_mode: weight parallel mode + :type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param output_parallel_mode: output parallel mode + :type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param input_dim: dimension of input, defaults to 0 + :type input_dim: int, optional + :param weight_dim: dimension of weight, defaults to -1 + :type weight_dim: int, optional + :param output_dim: dimension of output, defaults to 0 + :type output_dim: int, optional + """ @staticmethod @custom_fwd(cast_inputs=torch.float16) def forward(ctx, @@ -74,6 +96,22 @@ class linear_3d(torch.autograd.Function): class classifier_3d(torch.autograd.Function): + """ + Classifier + + :param input_: matrix of input + :type input_: torch.tensor + :param weight: matrix of weight + :type weight: torch.tensor + :param bias: matrix of bias + :type bias: torch.tensor, optional + :param input_parallel_mode: input parallel mode + :type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param weight_parallel_mode: weight parallel mode + :type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param output_parallel_mode: output parallel mode + :type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode + """ @staticmethod @custom_fwd(cast_inputs=torch.float16) def forward(ctx, input_: Tensor, weight: Tensor, bias: Optional[Tensor], input_parallel_mode: ParallelMode, @@ -129,6 +167,29 @@ class classifier_3d(torch.autograd.Function): class layernorm_3d(torch.autograd.Function): + """ + Layernorm + + :param input_: input maxtrix + :type input_: torch.tensor + :param weight: matrix of weight + :type weight: torch.tensor + :param bias: matrix of bias + :type bias: torch.tensor + :param normalized_shape: input shape from an expected input + of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. + :type normalized_shape: int + :param eps: a value added to the denominator for numerical stability + :type eps: float + :param input_parallel_mode: input parallel mode + :type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param weight_parallel_mode: weight parallel mode + :type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param output_parallel_mode: output parallel mode + :type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode + """ @staticmethod @custom_fwd(cast_inputs=torch.float32) def forward(ctx, input_: Tensor, weight: Tensor, bias: Tensor, normalized_shape: int, eps: float, @@ -189,6 +250,18 @@ def split_tensor_3d(input_: Tensor, class reduce_by_batch_3d(torch.autograd.Function): + """ + All-reduce the input from the model parallel region. 
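The ``Matmul_AB``/``Matmul_ABT``/``Matmul_ATB`` functions documented above distribute a blocked matrix product over a process grid; the blocking itself can be sketched on a single process, with the grid simulated by slicing and no communication (the real classes broadcast and reduce the blocks with ``torch.distributed``):

import torch

dim = 2                        # stand-in for summa_dim / tesseract_dim
n = 8
A, B = torch.randn(n, n), torch.randn(n, n)
bs = n // dim                  # block size held by each rank

def block(m, i, j):
    # The (i, j) block of a square matrix partitioned into a dim x dim grid.
    return m[i * bs:(i + 1) * bs, j * bs:(j + 1) * bs]

C = torch.zeros(n, n)
for i in range(dim):
    for j in range(dim):
        for k in range(dim):   # in SUMMA this k-loop is the broadcast step
            C[i * bs:(i + 1) * bs, j * bs:(j + 1) * bs] += block(A, i, k) @ block(B, k, j)

assert torch.allclose(C, A @ B, atol=1e-4)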
+ + :param input_: input matrix + :type input_: torch.tensor + :param input_parallel_mode: input parallel mode + :type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param weight_parallel_mode: weight parallel mode + :type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param reduce_mean: If set to ``True``, it will divide the output by (input parallel size * weight parallel size), defaults to False + :type reduce_mean: bool, optional + """ @staticmethod @custom_fwd(cast_inputs=torch.float32) def forward(ctx, @@ -215,6 +288,18 @@ class reduce_by_batch_3d(torch.autograd.Function): class broadcast_weight_3d_from_diagonal(torch.autograd.Function): + """ + Broadcast weight from diagonal + + :param input_: input matrix + :type input_: torch.tensor + :param input_parallel_mode: input parallel mode + :type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param weight_parallel_mode: weight parallel mode + :type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode + :param output_parallel_mode: output parallel mode + :type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode + """ @staticmethod @custom_fwd(cast_inputs=torch.float16) def forward(ctx, input_: Tensor, input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode, diff --git a/colossalai/nn/layer/parallel_3d/layers.py b/colossalai/nn/layer/parallel_3d/layers.py index 4871d1443..66f2b98a5 100644 --- a/colossalai/nn/layer/parallel_3d/layers.py +++ b/colossalai/nn/layer/parallel_3d/layers.py @@ -24,6 +24,19 @@ from ._utils import get_depth_from_env, get_last_group, get_parallel_mode_from_e @LAYERS.register_module class LayerNorm3D(ParallelLayer): + r""" + Layer Normalization for 3D parallelism + + :param normalized_shape: input shape from an expected input + of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. 
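``reduce_by_batch_3d`` above all-reduces its input over the input and weight parallel groups and, when ``reduce_mean=True``, divides by the combined group size; the arithmetic, simulated on one process with stand-in per-rank tensors (the real op uses ``torch.distributed.all_reduce``):

import torch

# Stand-in outputs from 4 simulated ranks (input parallel size 2 * weight parallel size 2).
rank_outputs = [torch.full((3,), float(i)) for i in range(4)]

summed = torch.stack(rank_outputs).sum(dim=0)   # what an all-reduce (SUM) leaves on every rank
mean = summed / len(rank_outputs)               # the extra division applied when reduce_mean=True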
+ :type normalized_shape: int + :param eps: a value added to the denominator for numerical stability, defaults to 1e-12 + :type eps: float, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + """ def __init__(self, normalized_shape: int, eps: float = 1e-12, dtype: dtype = None): super().__init__() self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) @@ -55,6 +68,22 @@ class LayerNorm3D(ParallelLayer): @LAYERS.register_module class Linear3D(ParallelLayer): + """ + Linear layer for 3D parallelism + + :param in_features: size of each input sample + :type in_features: int + :param out_features: size of each output sample + :type out_features: int + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True + :type bias: bool, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + """ def __init__(self, in_features: int, out_features: int, @@ -113,6 +142,24 @@ class Linear3D(ParallelLayer): @LAYERS.register_module class Classifier3D(ParallelLayer): + """ + Classifier for 3D parallelism + + :param in_features: size of each input sample + :type in_features: int + :param num_classes: number of classes + :type num_classes: int + :param weight: weight of the classifier, defaults to True + :type weight: torch.nn.Parameter, optional + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True + :type bias: bool, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + """ def __init__(self, in_features: int, num_classes: int, @@ -173,6 +220,28 @@ class Classifier3D(ParallelLayer): @LAYERS.register_module class PatchEmbedding3D(ParallelLayer): + """ + 2D Image to Patch Embedding + + :param img_size: image size + :type img_size: int + :param patch_size: patch size + :type patch_size: int + :param in_chans: number of channels of input image + :type in_chans: int + :param embed_size: size of embedding + :type embed_size: int + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param flatten: whether to flatten output tensor, defaults to True + :type flatten: bool, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + :param position_embed_initializer: The intializer of position embedding, defaults to zero + :type position_embed_initializer: typing.Callable, optional + """ def __init__(self, img_size: int, patch_size: int, @@ -256,6 +325,20 @@ class PatchEmbedding3D(ParallelLayer): @LAYERS.register_module class Embedding3D(ParallelLayer): + """ + Embedding for 3D parallelism + + :param num_embeddings: number of 
embeddings + :type num_embeddings: int + :param embedding_dim: dimension of embedding + :type embedding_dim: int + :param padding_idx: index of padding, defaults to None + :type padding_idx: int, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to normal initializer + :type weight_initializer: typing.Callable, optional + """ def __init__(self, num_embeddings: int, embedding_dim: int, diff --git a/colossalai/nn/layer/vanilla/layers.py b/colossalai/nn/layer/vanilla/layers.py index dc33c461e..6f3d07abd 100644 --- a/colossalai/nn/layer/vanilla/layers.py +++ b/colossalai/nn/layer/vanilla/layers.py @@ -32,7 +32,8 @@ def drop_path(x, drop_prob: float = 0., training: bool = False): class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py """ def __init__(self, drop_prob=None): @@ -97,7 +98,27 @@ class WrappedDropPath(nn.Module): @LAYERS.register_module class VanillaPatchEmbedding(nn.Module): - """ 2D Image to Patch Embedding + """ + 2D Image to Patch Embedding + + :param img_size: image size + :type img_size: int + :param patch_size: patch size + :type patch_size: int + :param in_chans: number of channels of input image + :type in_chans: int + :param embed_size: size of embedding + :type embed_size: int + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param flatten: whether to flatten output tensor, defaults to True + :type flatten: bool, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + :param position_embed_initializer: The intializer of position embedding, defaults to zero + :type position_embed_initializer: typing.Callable, optional """ def __init__(self, img_size: int, @@ -148,6 +169,24 @@ class VanillaPatchEmbedding(nn.Module): @LAYERS.register_module class VanillaClassifier(nn.Module): + """ + Classifier + + :param in_features: size of each input sample + :type in_features: int + :param num_classes: number of classes + :type num_classes: int + :param weight: weight of the classifier, defaults to True + :type weight: torch.nn.Parameter, optional + :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True + :type bias: bool, optional + :param dtype: The dtype of parameters, defaults to None + :type dtype: torch.dtype, optional + :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :type weight_initializer: typing.Callable, optional + :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :type bias_initializer: typing.Callable, optional + """ def __init__(self, in_features: int, num_classes: int, diff --git a/colossalai/nn/loss/loss_2d.py b/colossalai/nn/loss/loss_2d.py index 7aef949f6..a1fb0a7fa 100644 --- a/colossalai/nn/loss/loss_2d.py +++ b/colossalai/nn/loss/loss_2d.py @@ -7,7 +7,8 @@ from torch.nn.modules.loss import _Loss @LOSSES.register_module class CrossEntropyLoss2D(_Loss): - """Cross entropy loss for 
2D parallelism + """ + Cross entropy loss for 2D parallelism :param reduction: whether to average the loss, defaults to True :type reduction: bool, optional diff --git a/colossalai/nn/loss/loss_2p5d.py b/colossalai/nn/loss/loss_2p5d.py index d7596d924..a849259b4 100644 --- a/colossalai/nn/loss/loss_2p5d.py +++ b/colossalai/nn/loss/loss_2p5d.py @@ -7,7 +7,9 @@ from torch.nn.modules.loss import _Loss @LOSSES.register_module class CrossEntropyLoss2p5D(_Loss): - """Cross entropy loss for 2.5D parallelism + """ + Cross entropy loss for 2.5D parallelism + :param reduction: whether to average the loss, defaults to True :type reduction: bool, optional """ diff --git a/colossalai/nn/loss/loss_3d.py b/colossalai/nn/loss/loss_3d.py index 59b6ffeeb..9b8083069 100644 --- a/colossalai/nn/loss/loss_3d.py +++ b/colossalai/nn/loss/loss_3d.py @@ -7,14 +7,11 @@ from torch.nn.modules.loss import _Loss @LOSSES.register_module class CrossEntropyLoss3D(_Loss): - """Cross entropy loss for 3D parallelism + """ + Cross entropy loss for 3D parallelism :param depth: depth for 3D parallelism :type depth: int - :param input_parallel_mode: parallel mode for input tensor - :type input_parallel_mode: ParallelMode - :param weight_parallel_mode: parallel mode for weight - :type weight_parallel_mode: ParallelMode :param reduction: whether to average the loss, defaults to True :type reduction: bool, optional """ diff --git a/colossalai/nn/metric/accuracy_2d.py b/colossalai/nn/metric/accuracy_2d.py index cc207b02c..5bcbfa65c 100644 --- a/colossalai/nn/metric/accuracy_2d.py +++ b/colossalai/nn/metric/accuracy_2d.py @@ -6,6 +6,12 @@ from ._utils import calc_acc class Accuracy2D(nn.Module): + """ + Accuracy for 2D parallelism + + :param logits: predicted labels + :param targets: true labels + """ def __init__(self): super().__init__() diff --git a/colossalai/nn/metric/accuracy_2p5d.py b/colossalai/nn/metric/accuracy_2p5d.py index 90dc4af26..b7c9c6afb 100644 --- a/colossalai/nn/metric/accuracy_2p5d.py +++ b/colossalai/nn/metric/accuracy_2p5d.py @@ -6,6 +6,12 @@ from ._utils import calc_acc class Accuracy2p5D(nn.Module): + """ + Accuracy for 2p5D parallelism + + :param logits: predicted labels + :param targets: true labels + """ def __init__(self): super().__init__() diff --git a/colossalai/nn/metric/accuracy_3d.py b/colossalai/nn/metric/accuracy_3d.py index 576800510..57b1874ba 100644 --- a/colossalai/nn/metric/accuracy_3d.py +++ b/colossalai/nn/metric/accuracy_3d.py @@ -8,6 +8,12 @@ from ._utils import calc_acc class Accuracy3D(nn.Module): + """ + Accuracy for 3D parallelism + + :param logits: predicted labels + :param targets: true labels + """ def __init__(self): super().__init__() self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) diff --git a/colossalai/trainer/hooks/_base_hook.py b/colossalai/trainer/hooks/_base_hook.py index e4b5edfbf..0b32f4ca2 100644 --- a/colossalai/trainer/hooks/_base_hook.py +++ b/colossalai/trainer/hooks/_base_hook.py @@ -10,10 +10,9 @@ class BaseHook(ABC): """This class allows users to add desired actions in specific time points during training or evaluation. - :param trainer: Trainer attached with current hook :param priority: Priority in the printing, hooks with small priority will be printed in front - :type trainer: Trainer :type priority: int + :param trainer: Trainer attached with current hook """ def __init__(self, priority: int) -> None: @@ -43,11 +42,11 @@ class BaseHook(ABC): """Actions after running a training iteration. 
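The Accuracy2D/2p5D/3D modules above wrap a plain top-1 accuracy computation followed by a reduction of the correct count over the parallel groups; the core computation alone, as a hedged single-process sketch (the exact reduction is left to the parallel implementations):

import torch

logits = torch.randn(8, 10)              # (batch, num_classes) predictions
targets = torch.randint(0, 10, (8,))     # ground-truth class indices

preds = logits.argmax(dim=-1)
correct = (preds == targets).sum()
accuracy = correct.float() / targets.numel()
print(f"top-1 accuracy: {accuracy.item():.2%}")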
:param output: Output of the model + :type output: torch.Tensor :param label: Labels of the input data + :type label: torch.Tensor :param loss: Loss between the output and input data - :type output: Tensor - :type label: Tensor - :type loss: Tensor + :type loss: torch.Tensor """ pass @@ -90,10 +89,10 @@ class BaseHook(ABC): """Actions after running a testing iteration. :param output: Output of the model - :param label: Labels of the input data - :param loss: Loss between the output and input data :type output: Tensor + :param label: Labels of the input data :type label: Tensor + :param loss: Loss between the output and input data :type loss: Tensor """ pass diff --git a/colossalai/trainer/hooks/_checkpoint_hook.py b/colossalai/trainer/hooks/_checkpoint_hook.py index 939e957bd..9fc8d59c5 100644 --- a/colossalai/trainer/hooks/_checkpoint_hook.py +++ b/colossalai/trainer/hooks/_checkpoint_hook.py @@ -16,14 +16,15 @@ from ._lr_scheduler_hook import LRSchedulerHook class SaveCheckpointHook(BaseHook): """Saves the model by interval in training process. - :param interval: Saving interval - :param checkpoint_dir: Directory of saving checkpoint - :param suffix: Saving suffix of the file - :param priority: Priority in the printing, hooks with small priority will be printed in front + :param interval: Saving interval, defaults to 1 :type interval: int, optional - :type checkpoint_dir: int, optional + :param checkpoint_dir: Directory of saving checkpoint, defaults to None + :type checkpoint_dir: str, optional + :param suffix: Saving suffix of the file, defaults to '' :type suffix: str, optional + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 :type priority: int, optional + :param trainer: Trainer attached with current hook """ def __init__(self, @@ -71,16 +72,19 @@ class SaveCheckpointHook(BaseHook): class LoadCheckpointHook(BaseHook): """Loads the model before training process. 
- :param checkpoint_dir: Directory of saving checkpoint - :param epoch: Epoch number to be set - :param finetune: Whether allows to load a part of the model - :param strict: Whether loads a model that has the same shape of parameters - :param priority: Priority in the printing, hooks with small priority will be printed in front + :param checkpoint_dir: Directory of saving checkpoint, defaults to None :type checkpoint_dir: str, optional + :param epoch: Epoch number to be set, defaults to -1 :type epoch: str, optional + :param finetune: Whether to allow loading only part of the model, defaults to False :type finetune: bool, optional + :param strict: Whether to strictly enforce that the loaded parameters have the same shape as the model, defaults to False :type strict: bool, optional + :param suffix: Suffix of the checkpoint file, defaults to '' + :type suffix: str, optional + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0 :type priority: int, optional + :param trainer: Trainer attached with current hook """ def __init__(self, diff --git a/colossalai/trainer/hooks/_log_hook.py b/colossalai/trainer/hooks/_log_hook.py index bb42ea2c8..b8230fdf4 100644 --- a/colossalai/trainer/hooks/_log_hook.py +++ b/colossalai/trainer/hooks/_log_hook.py @@ -25,6 +25,15 @@ def _format_number(val, prec=5): class LogByEpochHook(BaseHook): + """Hook to log by epoch + + :param logger: Logger for recording the log + :param interval: Recording interval, defaults to 1 + :type interval: int, optional + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1 + :type priority: int, optional + :param trainer: Trainer attached with current hook + """ def __init__(self, logger, interval: int = 1, @@ -39,6 +48,12 @@ @HOOKS.register_module class LogMetricByStepHook(BaseHook): + """Hook to log metrics by step + + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 + :type priority: int, optional + :param trainer: Trainer attached with current hook + """ def __init__(self, priority: int = 10): super().__init__(priority) @@ -59,12 +74,13 @@ class LogMetricByEpochHook(LogByEpochHook): """Specialized Hook to record the metric to log. - :param trainer: Trainer attached with current hook - :type trainer: Trainer - :param interval: Recording interval + :param logger: Logger for recording the log + :param interval: Recording interval, defaults to 1 :type interval: int, optional - :param priority: Priority in the printing, hooks with small priority will be printed in front + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 :type priority: int, optional + :param trainer: Trainer attached with current hook + :param mode: Mode of metrics, 'train' or 'test' """ def __init__(self, @@ -102,12 +118,17 @@ class LogMetricByEpochHook(LogByEpochHook): class TensorboardHook(BaseHook): """Specialized Hook to record the metric to Tensorboard. 
- :param trainer: Trainer attached with current hook - :type trainer: Trainer :param log_dir: Directory of log - :type log_dir: str, optional - :param priority: Priority in the printing, hooks with small priority will be printed in front + :type log_dir: str + :param ranks: Ranks of the processes that write to Tensorboard + :type ranks: typing.List + :param parallel_mode: Parallel mode, defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL + :type parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 :type priority: int, optional + :param trainer: Trainer attached with current hook + :param mode: Mode of metrics, 'train' or 'test' + :type mode: str """ def __init__(self, @@ -184,14 +205,20 @@ class TensorboardHook(BaseHook): class LogTimingByEpochHook(LogByEpochHook): """Specialized Hook to write timing record to log. - :param trainer: Trainer attached with current hook - :type trainer: Trainer - :param interval: Recording interval + :param timer: Timer for the hook + :type timer: colossalai.utils.MultiTimer + :param logger: Logger for the log + :type logger: colossalai.logging.DistributedLogger + :param interval: Recording interval, defaults to 1 :type interval: int, optional - :param priority: Priority in the printing, hooks with small priority will be printed in front + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 :type priority: int, optional - :param log_eval: Whether writes in evaluation + :param log_eval: Whether to write logs during evaluation, defaults to True :type log_eval: bool, optional + :param ignore_num_train_steps: Number of training steps to ignore, defaults to 0 + :type ignore_num_train_steps: int, optional + :param mode: Mode of metrics, 'train' or 'test' + :param trainer: Trainer attached with current hook """ def __init__(self, timer: MultiTimer, @@ -249,13 +276,13 @@ class LogTimingByEpochHook(LogByEpochHook): class LogMemoryByEpochHook(LogByEpochHook): """Specialized Hook to write memory usage record to log. 
- :param trainer: Trainer attached with current hook - :type trainer: Trainer - :param interval: Recording interval + :param logger: Logger for the log + :type logger: colossalai.logging.DistributedLogger + :param interval: Recording interval, defaults to 1 :type interval: int, optional - :param priority: Priority in the printing, hooks with small priority will be printed in front + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 :type priority: int, optional - :param log_eval: Whether writes in evaluation + :param log_eval: Whether to write logs during evaluation, defaults to True :type log_eval: bool, optional """ def __init__(self, @@ -263,7 +290,8 @@ class LogMemoryByEpochHook(LogByEpochHook): interval: int = 1, priority: int = 10, log_eval: bool = True, - report_cpu: bool = False) -> None: + report_cpu: bool = False, # currently unused + ) -> None: super().__init__(logger=logger, interval=interval, priority=priority) self._log_eval = log_eval self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0() diff --git a/colossalai/trainer/hooks/_lr_scheduler_hook.py b/colossalai/trainer/hooks/_lr_scheduler_hook.py index 0677754ff..76cf12c53 100644 --- a/colossalai/trainer/hooks/_lr_scheduler_hook.py +++ b/colossalai/trainer/hooks/_lr_scheduler_hook.py @@ -8,14 +8,14 @@ from ._metric_hook import LearningRateMetric, MetricHook class LRSchedulerHook(MetricHook): """Build LR scheduler - :param trainer: Trainer attached with current hook - :type trainer: Trainer - :param lr_scheduler_cfg: The config of LR scheduler - :type lr_scheduler_cfg: dict - :param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch. Defaults to `True`. + :param lr_scheduler: LR scheduler + :param by_epoch: If `True`, the LR will be scheduled every epoch; else, the LR will be scheduled every batch, defaults to `True` :type by_epoch: bool - :param priority: Priority in the printing, hooks with small priority will be printed in front + :param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True` + :type store_lr_in_state: bool, optional + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1 :type priority: int, optional + :param trainer: Trainer attached with current hook """ def __init__( self, diff --git a/colossalai/trainer/hooks/_metric_hook.py b/colossalai/trainer/hooks/_metric_hook.py index c4546b6f9..20d5409a7 100644 --- a/colossalai/trainer/hooks/_metric_hook.py +++ b/colossalai/trainer/hooks/_metric_hook.py @@ -133,6 +133,8 @@ class LearningRateMetric(Metric): :param epoch_only: Whether the metric only read for the full epoch :type epoch_only: bool + :param initial_lr: initial learning rate, defaults to 0.0 + :type initial_lr: float, optional """ def __init__(self, epoch_only: bool, initial_lr: float = 0.): @@ -161,6 +163,8 @@ class AccuracyMetric(Metric): :param epoch_only: Whether the metric only read for the full epoch :type epoch_only: bool + :param accuracy_func: accuracy function for the classification task + :type accuracy_func: typing.Callable """ def __init__(self, epoch_only: bool, accuracy_func: Callable): @@ -182,7 +186,8 @@ class AccuracyMetric(Metric): and labels. It expects the output has logits and labels. 
:param logits: The logits output of the model - :param label: The labels of the input data + :param targets: true labels of the dataset + :param batch_size: size of the current batch """ if isinstance(logits, (list, tuple)): logits = logits[0] @@ -216,10 +221,10 @@ class MetricHook(BaseHook): update their states. Others are used to display and record the metric. - :param trainer: Trainer attached with current hook :param priority: Priority in the printing, hooks with small priority will be printed in front - :type trainer: Trainer :type priority: int + :param trainer: Trainer attached with current hook + :type trainer: Trainer """ def __init__( @@ -238,10 +243,10 @@ class MetricHook(BaseHook): class LossHook(MetricHook): """Specialized hook class for :class:`Loss`. - :param trainer: Trainer attached with current hook - :param priority: Priority in the printing, hooks with small priority will be printed in front - :type trainer: Trainer + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0 :type priority: int, optional + :param trainer: Trainer attached with current hook + :type trainer: Trainer """ def __init__(self, priority: int = 0): @@ -279,10 +284,12 @@ class LossHook(MetricHook): class AccuracyHook(MetricHook): """Specialized hook class for :class:`Accuracy`. + :param accuracy_func: Accuracy function for the classification task + :type accuracy_func: typing.Callable + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0 + :type priority: int, optional :param trainer: Trainer attached with current hook - :param priority: Priority in the printing, hooks with small priority will be printed in front :type trainer: Trainer - :type priority: int """ def __init__(self, accuracy_func: Callable, priority: int = 0): @@ -308,6 +315,13 @@ class AccuracyHook(MetricHook): class ThroughputMetric(Metric): + """Metric for :class:`Throughput`. + + :param epoch_only: Whether the metric only reads for the full epoch + :type epoch_only: bool + :param num_samples: number of samples processed + :param time: time spent processing the samples + """ def __init__(self, epoch_only: bool): super().__init__(epoch_only=epoch_only) self.accumulated_num_samples = torch.zeros(1, device=get_current_device()) @@ -345,6 +359,13 @@ class ThroughputMetric(Metric): @HOOKS.register_module class ThroughputHook(MetricHook): + """Specialized hook class for :class:`Throughput`. + + :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 + :type priority: int, optional + :param trainer: Trainer attached with current hook + :type trainer: Trainer + """ def __init__(self, priority: int = 10): super().__init__(priority)
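For reviewers, here is a minimal usage sketch (not part of this patch) showing how the hooks documented above compose at a call site. It assumes an `engine`, `lr_scheduler`, dataloaders and a `logger` already produced by `colossalai.initialize()` / `get_dist_logger()`, and that `Accuracy2D` is exported from `colossalai.nn.metric`; exact `Trainer`/`fit` keyword names may differ between versions.

```python
# Illustrative sketch only -- not part of this diff.
# `engine`, `lr_scheduler`, `train_dataloader`, `test_dataloader` and `logger`
# are assumed to come from colossalai.initialize() / get_dist_logger().
from colossalai.trainer import Trainer, hooks
from colossalai.nn.metric import Accuracy2D  # assumed export; pick the variant matching your tensor-parallel mode


def build_and_run_trainer(engine, lr_scheduler, train_dataloader, test_dataloader, logger):
    hook_list = [
        hooks.LossHook(),                                    # records the loss metric (priority 0)
        hooks.AccuracyHook(accuracy_func=Accuracy2D()),      # accuracy_func is any callable taking (logits, targets)
        hooks.LRSchedulerHook(lr_scheduler, by_epoch=True),  # step the scheduler once per epoch
        hooks.LogMetricByEpochHook(logger),                  # write the collected metrics to the logger each epoch
        hooks.ThroughputHook(),                              # samples/sec, printed last due to its higher priority (10)
        hooks.SaveCheckpointHook(interval=1, checkpoint_dir='./ckpt'),
    ]

    trainer = Trainer(engine=engine, logger=logger)
    trainer.fit(train_dataloader=train_dataloader,
                test_dataloader=test_dataloader,
                epochs=10,
                hooks=hook_list,
                display_progress=True)
```

As the docstrings above state, `priority` only controls the printing order: hooks with a smaller priority are printed in front, which is why the metric hooks (priority 0) appear before the logging and throughput hooks (priority 10).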