Refactored docstring to google style

This commit is contained in:
Liang Bowen 2022-03-25 13:02:39 +08:00 committed by アマデウス
parent 53b1b6e340
commit ec5086c49c
94 changed files with 3389 additions and 2982 deletions

View File

@@ -12,21 +12,27 @@ from .naive_amp import convert_to_naive_amp

def convert_to_amp(model: nn.Module, optimizer: Optimizer, criterion: _Loss, mode: AMP_TYPE, amp_config: Config = None):
    """A helper function to wrap training components with Torch AMP modules.

    Args:
        model (:class:`torch.nn.Module`): your model object.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
        criterion (:class:`torch.nn.modules.loss._Loss`): your loss function object.
        mode (:class:`colossalai.amp.AMP_TYPE`): amp mode.
        amp_config (:class:`colossalai.context.Config` or dict): configuration for different amp modes.

    Returns:
        A tuple (model, optimizer, criterion).

    Note:
        ``amp_config`` may vary with the mode you choose. You should check the corresponding amp mode
        for more details about ``amp_config``.
        For ``apex_amp``, please check
        `apex_amp config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
        For ``naive_amp``, please check
        `naive_amp config <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/amp/naive_amp/_fp16_optimizer.py#L42>`_.
        For ``torch_amp``, please check
        `torch_amp config <https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py#L97>`_.
    """
    assert isinstance(mode, AMP_TYPE), \
        f'expected the argument mode be AMP_TYPE, but got {type(mode)}'
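A minimal usage sketch of the wrapper documented above, assuming ``convert_to_amp`` and ``AMP_TYPE`` are importable from ``colossalai.amp``; the toy model, optimizer, loss and config values are illustrative and not part of the commit.

    import torch
    import torch.nn as nn
    from colossalai.amp import AMP_TYPE, convert_to_amp

    # any nn.Module / Optimizer / _Loss combination would be wrapped the same way
    model = nn.Linear(16, 4)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    # for torch_amp mode the amp_config keys follow torch.cuda.amp.GradScaler
    model, optimizer, criterion = convert_to_amp(model, optimizer, criterion,
                                                 mode=AMP_TYPE.TORCH,
                                                 amp_config=dict(init_scale=2.**16,
                                                                 growth_interval=2000))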

View File

@@ -4,17 +4,33 @@ from torch.optim import Optimizer

def convert_to_apex_amp(model: nn.Module, optimizer: Optimizer, amp_config):
    r"""A helper function to wrap training components with Apex AMP modules.

    Args:
        model (:class:`torch.nn.Module`): your model object.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
        amp_config (:class:`colossalai.context.Config` or dict): configuration for initializing apex_amp.

    The ``amp_config`` should include parameters below:
    ::

        enabled (bool, optional, default=True)
        opt_level (str, optional, default="O1")
        cast_model_type (``torch.dtype``, optional, default=None)
        patch_torch_functions (bool, optional, default=None)
        keep_batchnorm_fp32 (bool or str, optional, default=None)
        master_weights (bool, optional, default=None)
        loss_scale (float or str, optional, default=None)
        cast_model_outputs (torch.dtype, optional, default=None)
        num_losses (int, optional, default=1)
        verbosity (int, default=1)
        min_loss_scale (float, default=None)
        max_loss_scale (float, default=2.**24)

    Returns:
        A tuple (model, optimizer).

    For more details about ``amp_config``, please refer to
    `amp_config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
    """
    import apex.amp as apex_amp
    model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)

View File

@@ -21,8 +21,8 @@ class ApexAMPOptimizer(ColossalaiOptimizer):

    def backward(self, loss: Tensor):
        """Backward pass to get all gradients.

        Args:
            loss (torch.Tensor): Loss computed by a loss function.
        """
        with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()

@@ -30,10 +30,9 @@ class ApexAMPOptimizer(ColossalaiOptimizer):

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """Clip gradients' norm.

        Args:
            model (torch.nn.Module): Your model object.
            max_norm (float): The max norm value for gradient clipping.
        """
        if max_norm > 0:
            clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)
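A hedged sketch of how the two methods above fit into one training step; ``apex_model``, ``apex_optimizer``, ``criterion`` and the batch are assumed to come from an earlier ``convert_to_apex_amp`` call and a user data loader, and are not part of this commit.

    # after: apex_model, apex_optimizer = convert_to_apex_amp(model, optimizer, amp_config)
    outputs = apex_model(inputs)                               # forward in mixed precision
    loss = criterion(outputs, labels)

    apex_optimizer.zero_grad()
    apex_optimizer.backward(loss)                              # scales the loss via apex_amp.scale_loss
    apex_optimizer.clip_grad_norm(apex_model, max_norm=1.0)    # skipped when max_norm == 0
    apex_optimizer.step()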

View File

@@ -4,20 +4,30 @@ from torch.optim import Optimizer

from colossalai.utils import is_no_pp_or_last_stage
from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
from .grad_scaler import DynamicGradScaler, ConstantGradScaler
from ._fp16_optimizer import FP16Optimizer


def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
    """A helper function to wrap training components with naive AMP modules. In this mode,
    we forcibly cast the model weights and inputs to FP16, and cast the model outputs to FP32 to calculate loss,
    which is equivalent to Apex O3.

    Args:
        model (:class:`torch.nn.Module`): your model object.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
        amp_config (:class:`colossalai.context.Config` or dict): configuration for naive mode amp.

    The ``amp_config`` should contain parameters below:
    ::

        verbose (bool, optional): if set to `True`, will print debug info (Default: False).
        clip_grad_norm (float, optional): clip gradients with this global L2 norm (Default: 0).
            Note that clipping is ignored if clip_grad_norm == 0.
        dynamic_grad_scale (bool): whether to use dynamic grad scaler.

    Returns:
        A tuple (model, optimizer).
    """
    if isinstance(model, nn.ModuleList):
        # interleaved pipeline

@@ -46,4 +56,4 @@ def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):

    return model, optimizer


__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer', 'FP16Optimizer']
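The naive-mode ``amp_config`` documented above can be passed as a plain dict. A hedged sketch with illustrative values, assuming ``convert_to_naive_amp`` is importable from ``colossalai.amp.naive_amp`` and that ``model`` and ``optimizer`` already exist:

    from colossalai.amp.naive_amp import convert_to_naive_amp

    naive_cfg = dict(
        verbose=False,            # print debug info
        clip_grad_norm=1.0,       # 0 would disable clipping
        dynamic_grad_scale=True,  # DynamicGradScaler instead of ConstantGradScaler
    )
    model, optimizer = convert_to_naive_amp(model, optimizer, naive_cfg)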

View File

@@ -42,24 +42,13 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):

class FP16Optimizer(Optimizer):
    """Float16 optimizer for fp16 and bf16 data types.

    Args:
        optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD.
        grad_scaler (BaseGradScaler): grad scaler for gradients, chosen from
            ``constant_grad_scaler`` or ``dynamic_grad_scaler``.
        clip_grad_norm (float, optional): clip gradients with this global L2 norm. Default 0.
            Note that clipping is ignored if clip_grad_norm == 0.
        verbose (bool, optional): if set to `True`, will print debug info. Default False.
    """

    def __init__(self,

View File

@@ -18,11 +18,15 @@ from ._fp16_optimizer import FP16Optimizer

class NaiveAMPOptimizer(ColossalaiOptimizer):
    """A wrapper class for optimizer to cast all parameters to fp16.

    Args:
        optim (torch.optim.Optimizer): A normal optimizer like Adam or SGD.
        grad_scaler (BaseGradScaler): grad scaler for gradients, chosen from
            ``constant_grad_scaler`` or ``dynamic_grad_scaler``.
        clip_grad_norm (float, optional): clip gradients with this global L2 norm. Default 0.
        verbose (bool, optional): if set to `True`, will print debug info. Default False.

    Note:
        clipping is ignored if ``clip_grad_norm`` equals 0.
    """

    def __init__(self, optim: Optimizer, *args, **kwargs):

@@ -40,8 +44,19 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):

class NaiveAMPModel(nn.Module):
    r"""A wrapper class for model to cast the model into fp16 and
    automatically cast the input and output.

    Args:
        model (torch.nn.Module): torch.nn.Module to be wrapped.
        output_to_fp32 (bool, optional): Whether to cast the output of this module into fp32. (Default: True)
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this module.
            (Default: ``ParallelMode.DATA``)
        sync_buffer (bool, optional): whether to synchronize buffers. (Default: True)

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
    """

    def __init__(self,
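A brief sketch of wrapping a module directly with ``NaiveAMPModel`` as documented above; the import path and argument values are assumptions, and the global parallel context is assumed to have been initialized (e.g. via ``colossalai.launch``) before the data-parallel group is referenced.

    import torch.nn as nn
    from colossalai.amp.naive_amp import NaiveAMPModel
    from colossalai.context import ParallelMode

    fp16_model = NaiveAMPModel(nn.Linear(16, 4),
                               output_to_fp32=True,              # cast outputs back to fp32
                               parallel_mode=ParallelMode.DATA,  # group used for buffer sync
                               sync_buffer=True)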

View File

@@ -10,18 +10,25 @@ def convert_to_torch_amp(model: nn.Module,
                         optimizer: Optimizer,
                         criterion: Optional[_Loss] = None,
                         amp_config: Optional[Config] = None):
    """A helper function to wrap training components with PyTorch AMP modules.

    Args:
        model (:class:`torch.nn.Module`): your model object.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
        criterion (:class:`torch.nn.modules.loss._Loss`, optional): your loss function object.
        amp_config (:class:`colossalai.context.Config` or dict, optional): configuration for PyTorch AMP.

    The ``amp_config`` should include parameters below:
    ::

        init_scale (float, optional, default=2.**16)
        growth_factor (float, optional, default=2.0)
        backoff_factor (float, optional, default=0.5)
        growth_interval (int, optional, default=2000)
        enabled (bool, optional, default=True)

    Returns:
        A tuple (model, optimizer, criterion).
    """
    model = TorchAMPModel(model)
    if amp_config is None:

View File

@@ -14,13 +14,19 @@ from colossalai.utils import clip_grad_norm_fp32

class TorchAMPOptimizer(ColossalaiOptimizer):
    """A wrapper class that integrates PyTorch AMP with an optimizer.

    Args:
        optim (torch.optim.Optimizer): A normal optimizer like Adam or SGD.
        init_scale (float, optional, default=2.**16): Initial scale factor.
        growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied during
            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
        backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied during
            :meth:`update` if inf/NaN gradients occur in an iteration.
        growth_interval (int, optional, default=2000): Number of consecutive iterations without inf/NaN gradients
            that must occur for the scale to be multiplied by ``growth_factor``.
        enabled (bool, optional, default=True): If ``False``, disables gradient scaling. :meth:`step` simply
            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
    """

    def __init__(self, optim: Optimizer, *args, **kwargs):
@@ -30,8 +36,8 @@ class TorchAMPOptimizer(ColossalaiOptimizer):

    def backward(self, loss: Tensor):
        """Backward with torch amp gradient scaler.

        Args:
            loss (torch.Tensor): Loss computed by a loss function.
        """
        self.scaler.scale(loss).backward()

@@ -44,10 +50,9 @@ class TorchAMPOptimizer(ColossalaiOptimizer):

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """Apply gradient clipping to the model parameters.

        Args:
            model (torch.nn.Module): Your model object.
            max_norm (float): Max norm value for gradient clipping.
        """
        if max_norm > 0.0:
            self.scaler.unscale_(self.optim)

@@ -71,8 +76,8 @@ class TorchAMPModel(nn.Module):

class TorchAMPLoss(nn.Module):
    """A wrapper class for a criterion object which computes the loss in mixed-precision context.

    Args:
        loss (torch.nn.modules.loss._Loss): A loss function object.
    """

    def __init__(self, loss: _Loss):
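A minimal training-step sketch tying the wrappers above together. It assumes ``TorchAMPModel``, ``TorchAMPOptimizer`` and ``TorchAMPLoss`` are re-exported from ``colossalai.amp.torch_amp`` and that ``inputs``/``labels`` exist; it mirrors, rather than replaces, what ``convert_to_torch_amp`` sets up.

    import torch
    import torch.nn as nn
    from colossalai.amp.torch_amp import TorchAMPModel, TorchAMPOptimizer, TorchAMPLoss

    model = TorchAMPModel(nn.Linear(16, 4))            # forward runs under autocast
    optim = TorchAMPOptimizer(torch.optim.SGD(model.parameters(), lr=1e-3),
                              init_scale=2.**16, growth_interval=2000)
    criterion = TorchAMPLoss(nn.CrossEntropyLoss())

    def train_step(inputs, labels):
        optim.zero_grad()
        loss = criterion(model(inputs), labels)
        optim.backward(loss)                           # scaler.scale(loss).backward()
        optim.clip_grad_norm(model, max_norm=1.0)      # unscale first, then clip
        optim.step()                                   # scaler.step + scaler.update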

View File

@@ -10,34 +10,40 @@ from colossalai.registry import *

def build_from_config(module, config: dict):
    """Returns an object of :class:`module` constructed from `config`.

    Args:
        module: A python or user-defined class.
        config: A python dict containing information used in the construction of the return object.

    Returns:
        An ``object`` of interest.

    Raises:
        AssertionError: Raises an AssertionError if `module` is not a class.
    """
    assert inspect.isclass(module), 'module must be a class'
    return module(**config)
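A small illustration of ``build_from_config``; the target class and config dict are made-up examples, and the import assumes the builder is re-exported from ``colossalai.builder``.

    import torch.nn as nn
    from colossalai.builder import build_from_config

    # instantiate nn.Linear from a plain dict of constructor arguments
    layer = build_from_config(nn.Linear, {'in_features': 16, 'out_features': 4})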
def build_from_registry(config, registry: Registry):
    r"""Returns an object constructed from `config`, the type of the object
    is specified by `registry`.

    Note:
        The `config` is used to construct the return object from one of the supported registries
        such as `LAYERS` and `OPTIMIZERS`, and it should contain all required parameters of the
        corresponding object. Details of the supported types in `registry` and the `mod_type` field
        in `config` can be found in
        `registry <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/registry/__init__.py>`_.

    Args:
        config (dict or :class:`colossalai.context.Config`): information
            used in the construction of the return object.
        registry (:class:`Registry`): A registry specifying the type of the return object.

    Returns:
        A Python object specified by `registry`.

    Raises:
        Exception: Raises an Exception if an error occurred when building from registry.
    """
    config_ = config.copy()    # keep the original config untouched
    assert isinstance(
@@ -60,11 +66,13 @@ def build_from_registry(config, registry: Registry):

def build_layer(config):
    """Returns a layer object of :class:`nn.Module` constructed from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``LAYERS``.

    Returns:
        An object of :class:`torch.nn.Module`.
    """
    return build_from_registry(config, LAYERS)
@@ -73,11 +81,13 @@ def build_loss(config):

    """Returns a loss function object of :class:`torch.autograd.Function` constructed
    from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``LOSSES``.

    Returns:
        An object of :class:`torch.nn.modules.loss._Loss`.
    """
    return build_from_registry(config, LOSSES)
@@ -85,11 +95,13 @@ def build_loss(config):

def build_model(config):
    """Returns a model object of :class:`nn.Module` constructed from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``MODELS``.

    Returns:
        An object of :class:`torch.nn.Module`.
    """
    return build_from_registry(config, MODELS)
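A hedged sketch of the registry-driven builders above. The model name, config keys and the ``type`` selector field are placeholders; the exact field name and the available registered models are defined in the registry module linked earlier, not here.

    from colossalai.builder import build_model

    # 'type' is assumed to select the class registered in MODELS; remaining keys are constructor kwargs
    model_cfg = dict(type='VanillaResNet', num_cls=10, depth=18)
    model = build_model(model_cfg)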
@@ -98,11 +110,13 @@ def build_dataset(config):

    """Returns a dataset object of :class:`torch.utils.data.Dataset` constructed
    from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``DATASETS``.

    Returns:
        An object of :class:`torch.utils.data.Dataset`.
    """
    return build_from_registry(config, DATASETS)
@@ -111,13 +125,14 @@ def build_optimizer(config, model):

    """Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`,
    'model' and 'params'.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``OPTIMIZERS``.
        model (:class:`nn.Module`): A model containing parameters for the optimizer.

    Returns:
        An object of :class:`torch.optim.Optimizer`.
    """
    config_ = config.copy()
    config_['params'] = model.parameters()
@@ -128,15 +143,15 @@ def build_gradient_handler(config, model, optimizer):

    """Returns a gradient handler object of :class:`BaseGradientHandler` constructed from `config`,
    `model` and `optimizer`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``GRADIENT_HANDLER``.
        model (:class:`nn.Module`): A model containing parameters for the gradient handler.
        optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing parameters for the gradient handler.

    Returns:
        An object of :class:`colossalai.engine.BaseGradientHandler`.
    """
    config_ = config.copy()
    config_['model'] = model
@@ -147,13 +162,13 @@ def build_gradient_handler(config, model, optimizer):

def build_hooks(config, trainer):
    """Returns a hook object of :class:`BaseHook` constructed from `config` and `trainer`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``HOOKS``.
        trainer (:class:`Trainer`): A :class:`Trainer` object containing parameters for the hook.

    Returns:
        An object of :class:`colossalai.trainer.hooks.BaseHook`.
    """
    config_ = config.copy()
    config_['trainer'] = trainer
@@ -163,11 +178,13 @@ def build_hooks(config, trainer):

def build_ophooks(config):
    """Returns a hook object of :class:`BaseOpHook` constructed from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``OPHOOKS``.

    Returns:
        An object of :class:`colossalai.trainer.hooks.BaseOpHook`.
    """
    config_ = config.copy()
    return build_from_registry(config_, OPHOOKS)
@@ -177,11 +194,13 @@ def build_transform(config):

    """Returns a transformation object of :class:`torchvision.transforms` constructed
    from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``TRANSFORMS``.

    Returns:
        An object of :class:`torchvision.transforms`.
    """
    return build_from_registry(config, TRANSFORMS)
@@ -190,14 +209,15 @@ def build_data_sampler(config, dataset):

    """Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler`
    constructed from `config`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``DATA_SAMPLERS``.
        dataset (:class:`torch.utils.data.Dataset`): An object of
            :class:`torch.utils.data.Dataset` containing information
            used in the construction of the return object.

    Returns:
        An object of :class:`colossalai.utils.data_sampler.BaseSampler`.
    """
    config_ = config.copy()
    config_['dataset'] = dataset
@@ -208,14 +228,15 @@ def build_lr_scheduler(config, optimizer):

    """Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
    constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``lr_schedule``.
        optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing
            parameters for the learning rate scheduler.

    Returns:
        An object of :class:`torch.optim.lr_scheduler`.
    """
    config_ = config.copy()
    config_['optimizer'] = optimizer
@@ -225,10 +246,12 @@ def build_lr_scheduler(config, optimizer):

def build_schedule(config):
    """Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`.

    Args:
        config (dict or :class:`colossalai.context.Config`): A python dict or
            a :class:`colossalai.context.Config` object containing information
            used in the construction of the ``Schedule``.

    Returns:
        An object of :class:`colossalai.engine.schedule.BaseSchedule`.
    """
    return build_from_registry(config, SCHEDULE)

View File

@@ -13,14 +13,13 @@ def _binary_partition(weights, st, ed):

    """Returns the binary partition position of `weights`, given the start
    position `st` and the end position `ed`.

    Args:
        weights (list): A python list to be binary partitioned.
        st (int): the start position of the binary partition.
        ed (int): the end position of the binary partition.

    Returns:
        int: the binary partition position of `weights`.
    """
    w_sum = weights[ed - 1]
    prefix = 0
@@ -176,16 +175,13 @@ def build_pipeline_model_from_cfg(config, num_chunks: int = 1, partition_method:

        ...
    )

    Args:
        config (dict): Configuration of the model.
        num_chunks (int, optional): The number of chunks you want to have on the current stage.
            This value should be 1 in most cases unless you are using virtual pipeline parallelism.
        partition_method (str, optional): This parameter determines how you want to split your model
            layers into stages, you can set it as 'layer' or 'parameter'.
        verbose (bool, optional): Whether to print the logs.
    """
    ori_model = build_model(config)
    layers = ori_model.layers_cfg
@@ -240,13 +236,11 @@ def build_pipeline_model(layers: nn.Sequential, num_chunks: int = 1, verbose: bo

    """An initializer to split the model into different stages for pipeline parallelism.
    Note that `layers` must be a `torch.nn.Sequential`.

    Args:
        layers (`torch.nn.Sequential`): Layers of the model.
        num_chunks (int, optional): The number of chunks you want to have on the current stage.
            This value should be 1 in most cases unless you are using virtual pipeline parallelism.
        verbose (bool, optional): Whether to print the logs.
    """
    pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
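As a sketch of the builder above, assuming the distributed context (and thus ``ParallelMode.PIPELINE``) has already been initialized via ``colossalai.launch`` and that the function is importable from ``colossalai.builder``:

    import torch.nn as nn
    from colossalai.builder import build_pipeline_model

    # a toy torch.nn.Sequential; each pipeline stage receives a contiguous slice of these layers
    layers = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 64), nn.ReLU())
    stage_model = build_pipeline_model(layers, num_chunks=1, verbose=True)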

View File

@@ -12,21 +12,22 @@ from colossalai.utils import get_current_device

def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op: bool = False) -> Tensor:
    r"""Gathers all tensors from the parallel group and concatenates them in a
    specific dimension.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be gathered.
        dim (int): The dimension to concatenate along.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-gather only,
        if async_op is set to False. A tuple of the all-gather output and an async work handle, if async_op
        is set to True.
    """
    depth = gpc.get_world_size(parallel_mode)
    if depth == 1:
@@ -54,23 +55,26 @@ def reduce_scatter(tensor: Tensor,
                   parallel_mode: ParallelMode,
                   op: ReduceOp = ReduceOp.SUM,
                   async_op: bool = False) -> Tensor:
    r"""Reduces all tensors then scatters them in a specific dimension to all
    members in the parallel group.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be reduce-scattered.
        dim (int): The dimension to scatter along.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
            For more details about ``ReduceOp``, please refer to
            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce-scatter only,
        if async_op is set to False. A tuple of the reduce-scatter output and an async work handle, if async_op
        is set to True.
    """
    depth = gpc.get_world_size(parallel_mode)
    if depth == 1:
@@ -94,6 +98,25 @@ def all_reduce(tensor: Tensor,
               parallel_mode: ParallelMode,
               op: ReduceOp = ReduceOp.SUM,
               async_op: bool = False) -> Tensor:
    r"""Reduces the tensor data across the whole parallel group in such a way that all get the final result.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be all-reduced.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
            For more details about ``ReduceOp``, please refer to
            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-reduce only,
        if async_op is set to False. A tuple of the all-reduce output and an async work handle, if async_op
        is set to True.
    """
    depth = gpc.get_world_size(parallel_mode)
    if depth == 1:
        out = tensor
@@ -108,6 +131,23 @@ def all_reduce(tensor: Tensor,

def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: bool = False):
    r"""Broadcasts a tensor to the whole parallel group. The tensor must have the same
    number of elements in all processes participating in the collective.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be broadcast.
        src (int): Source rank.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The broadcast tensor only,
        if async_op is set to False. A tuple of the broadcast tensor and an async work handle, if async_op
        is set to True.
    """
    depth = gpc.get_world_size(parallel_mode)
    if depth == 1:
        out = tensor
@@ -122,6 +162,23 @@ def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: b

def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp = ReduceOp.SUM, async_op: bool = False):
    r"""Reduces tensors across the whole parallel group. Only the process with
    rank ``dst`` receives the final result.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be reduced.
        dst (int): Destination rank.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        op (torch.distributed.ReduceOp, optional): The type of reduce operation.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce only,
        if async_op is set to False. A tuple of the reduce output and an async work handle, if async_op
        is set to True.
    """
    depth = gpc.get_world_size(parallel_mode)
    if depth == 1:
        out = tensor
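A compact usage sketch for the collectives documented above, assuming the parallel context has been initialized (e.g. via ``colossalai.launch``) so that the data-parallel group exists, and that the functions are re-exported from ``colossalai.communication``:

    import torch
    from colossalai.communication import all_gather, all_reduce, broadcast, reduce
    from colossalai.context import ParallelMode

    x = torch.ones(4).cuda()

    gathered = all_gather(x, dim=0, parallel_mode=ParallelMode.DATA)   # concatenation along dim 0
    summed = all_reduce(x, parallel_mode=ParallelMode.DATA)            # every rank gets the sum
    same = broadcast(x, src=0, parallel_mode=ParallelMode.DATA)        # rank 0's tensor everywhere
    partial = reduce(x, dst=0, parallel_mode=ParallelMode.DATA)        # only rank 0 holds the sum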

View File

@@ -19,12 +19,12 @@ TensorShape = Union[torch.Size, List[int], Tuple[int]]

def _get_tensor_shape(tensor_shape: TensorShape, chunk_tensor: bool = False) -> Tuple[TensorShape, bool]:
    """Get the exact tensor shape when communicating and return whether the tensor is a chunk.

    Args:
        tensor_shape (:class:`torch.Size`): shape of tensor.
        chunk_tensor (bool, optional): whether to chunk tensor, defaults to False.

    Returns:
        Tuple[Union[torch.Size, List[int], Tuple[int]], bool]: exact tensor shape, whether to chunk tensor.
    """
    if chunk_tensor:
        tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
@@ -134,14 +134,14 @@ def _communicate(tensor_send_next=None,

def recv_forward(input_tensor_shape, prev_rank=None, dtype=torch.float, scatter_gather_tensors=False):
    """Copy the forward output from the previous stage in pipeline as the input tensor of this stage.

    Args:
        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.
        prev_rank (int, optional): The rank of the source of the tensor.

    Returns:
        :class:`torch.Tensor`: The input tensor.
    """
    if gpc.is_pipeline_first_stage():
        input_tensor = None

@@ -155,14 +155,14 @@ def recv_forward(input_tensor_shape, prev_rank=None, dtype=torch.float, scatter_

def recv_backward(output_grad_shape, next_rank=None, dtype=torch.float, scatter_gather_tensors=False):
    """Copy the gradient tensor from the next stage in pipeline as the input gradient of this stage.

    Args:
        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.
        next_rank (int, optional): The rank of the source of the tensor.

    Returns:
        :class:`torch.Tensor`: The input gradient tensor.
    """
    if gpc.is_pipeline_last_stage():
        output_tensor_grad = None
@@ -176,12 +176,11 @@ def recv_backward(output_grad_shape, next_rank=None, dtype=torch.float, scatter_

def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False):
    """Sends the input tensor to the next stage in pipeline.

    Args:
        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
        next_rank (int, optional): The rank of the recipient of the tensor.
    """
    if not gpc.is_pipeline_last_stage():
        _communicate(tensor_send_next=output_tensor,

@@ -190,12 +189,11 @@ def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False):

def send_backward(input_tensor_grad, prev_rank=None, scatter_gather_tensors=False):
    """Sends the gradient tensor to the previous stage in pipeline.

    Args:
        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent.
        prev_rank (int, optional): The rank of the recipient of the tensor.
    """
    if not gpc.is_pipeline_first_stage():
        _communicate(tensor_send_prev=input_tensor_grad,
@@ -210,15 +208,15 @@ def send_forward_recv_backward(output_tensor,
                               dtype=torch.float,
                               scatter_gather_tensors=False):
    """Batched communication operation. Sends the input tensor to the
    next stage in pipeline, while receiving the gradient tensor from the
    next stage in pipeline as the input gradient tensor of this stage.

    Args:
        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.

    Returns:
        :class:`torch.Tensor`: The input gradient tensor.
    """
    if gpc.is_pipeline_last_stage():
        output_tensor_grad = None

@@ -238,16 +236,16 @@ def send_backward_recv_forward(input_tensor_grad,
                               prev_rank=None,
                               dtype=torch.float,
                               scatter_gather_tensors=False):
    """Batched communication operation. Sends the gradient tensor to the
    previous stage in pipeline, while receiving the output tensor from the
    previous stage in pipeline as the input of this stage.

    Args:
        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent.
        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.

    Returns:
        :class:`torch.Tensor`: The input tensor.
    """
    if gpc.is_pipeline_first_stage():
        input_tensor = None
@@ -269,15 +267,15 @@ def send_forward_recv_forward(output_tensor,
                              dtype=torch.float,
                              scatter_gather_tensors=False):
    """Batched communication operation. Sends the input tensor to the
    next stage in pipeline, while receiving the output tensor from the
    previous stage in pipeline as the input of this stage.

    Args:
        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.

    Returns:
        :class:`torch.Tensor`: The input tensor.
    """
    input_tensor, _ = _communicate(tensor_send_next=output_tensor,
                                   recv_prev=recv_prev,
@@ -296,16 +294,16 @@ def send_backward_recv_backward(input_tensor_grad,
                                next_rank=None,
                                dtype=torch.float,
                                scatter_gather_tensors=False):
    """Batched communication operation. Sends the gradient tensor to the
    previous stage in pipeline, while receiving the gradient tensor from the
    next stage in pipeline as the input of this stage.

    Args:
        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent.
        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.

    Returns:
        :class:`torch.Tensor`: The input gradient tensor.
    """
    _, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad,
                                         recv_next=recv_next,
@@ -327,20 +325,18 @@ def send_forward_backward_recv_forward_backward(output_tensor,
                                                next_rank=None,
                                                dtype=torch.float,
                                                scatter_gather_tensors=False):
    """Batched communication operation. Sends the input tensor to the next stage in pipeline and
    the gradient tensor to the previous stage, while receiving the input gradient tensor from the
    next stage and the input tensor from the previous stage.

    Args:
        output_tensor (:class:`torch.Tensor`): Tensor sent to the next stage.
        input_tensor_grad (:class:`torch.Tensor`): Tensor sent to the previous stage.
        input_tensor_shape (:class:`torch.Size`): The shape of the tensor received from the previous stage.
        output_grad_shape (:class:`torch.Size`): The shape of the tensor received from the next stage.

    Returns:
        Tuple(Tensor, Tensor): (the input tensor, the input gradient tensor).
    """
    input_tensor, output_tensor_grad = _communicate(
        tensor_send_next=output_tensor,

View File

@@ -9,15 +9,19 @@ from colossalai.utils import get_current_device, synchronize

def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):
    """Sends a tensor to the next member and receives a tensor from the previous member.
    This function returns the received tensor from the previous member.

    Args:
        tensor_send_next (:class:`torch.Tensor`): Tensor sent to the next member.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.

    Returns:
        :class:`torch.Tensor`: The tensor received from the previous member.

    Note:
        The ``parallel_mode`` should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be
        found in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
    """
    buffer_shape = tensor_send_next.size()

View File

@ -12,14 +12,13 @@ def send_tensor_meta(tensor, need_meta=True, next_rank=None):
meta information of the tensor should be sent before communications. This function meta information of the tensor should be sent before communications. This function
synchronizes with :func:`recv_tensor_meta`. synchronizes with :func:`recv_tensor_meta`.
:param tensor: Tensor to be sent Args:
:param need_meta: If False, meta information won't be sent tensor (torch.Tensor): Tensor to be sent.
:param next_rank: The rank of the next member in pipeline parallel group need_meta (bool, optional): If False, meta information won't be sent.
:type tensor: Tensor next_rank (int): The rank of the next member in pipeline parallel group.
:type need_meta: bool, optional
:type next_rank: int Returns:
:return: False bool: False
:rtype: bool
""" """
if need_meta: if need_meta:
if next_rank is None: if next_rank is None:
@ -36,17 +35,17 @@ def send_tensor_meta(tensor, need_meta=True, next_rank=None):
def recv_tensor_meta(tensor_shape, prev_rank=None): def recv_tensor_meta(tensor_shape, prev_rank=None):
"""Recieves tensor meta information before recieving a specific tensor. """Receives tensor meta information before receiving a specific tensor.
Since the recipient must know the shape of the tensor in p2p communications, Since the recipient must know the shape of the tensor in p2p communications,
meta information of the tensor should be recieved before communications. This function meta information of the tensor should be received before communications. This function
synchronizes with :func:`send_tensor_meta`. synchronizes with :func:`send_tensor_meta`.
:param tensor_shape: The shape of the tensor to be recieved Args:
:param prev_rank: The rank of the source of the tensor tensor_shape (torch.Size): The shape of the tensor to be received.
:type tensor_shape: torch.Size prev_rank (int): The rank of the source of the tensor.
:type prev_rank: int, optional
:return: The shape of the tensor to be recieved Returns:
:rtype: torch.Size torch.Size: The shape of the tensor to be received.
""" """
if tensor_shape is None: if tensor_shape is None:
if prev_rank is None: if prev_rank is None:
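The shape handshake that ``send_tensor_meta`` / ``recv_tensor_meta`` document can likewise be sketched with bare ``torch.distributed`` calls: send the number of dimensions first, then the shape itself, so the receiver can allocate a matching buffer. The function names here are hypothetical and an initialized process group is assumed::

    import torch
    import torch.distributed as dist

    def send_meta_sketch(tensor: torch.Tensor, next_rank: int) -> None:
        # first the number of dimensions, then the shape itself
        dist.send(torch.tensor([tensor.dim()], dtype=torch.long), next_rank)
        dist.send(torch.tensor(tensor.size(), dtype=torch.long), next_rank)

    def recv_meta_sketch(prev_rank: int) -> torch.Size:
        ndim = torch.empty(1, dtype=torch.long)
        dist.recv(ndim, prev_rank)
        shape = torch.empty(int(ndim.item()), dtype=torch.long)
        dist.recv(shape, prev_rank)
        return torch.Size(shape.tolist())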
@ -67,14 +66,12 @@ def recv_tensor_meta(tensor_shape, prev_rank=None):
def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False): def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
"""Break a tensor into equal 1D chunks. """Break a tensor into equal 1D chunks.
:param tensor: Tensor to be splitted before communication Args:
:param new_buffer: Whether uses a new buffer to store sliced tensor tensor (torch.Tensor): Tensor to be split before communication.
new_buffer (bool, optional): Whether to use a new buffer to store sliced tensor.
:type tensor: torch.Tensor Returns:
:type new_buffer: bool, optional torch.Tensor: The split tensor
:return splitted_tensor: The splitted tensor
:rtype splitted_tensor: torch.Tensor
""" """
partition_size = torch.numel(tensor) // gpc.get_world_size(ParallelMode.PARALLEL_1D) partition_size = torch.numel(tensor) // gpc.get_world_size(ParallelMode.PARALLEL_1D)
start_index = partition_size * gpc.get_local_rank(ParallelMode.PARALLEL_1D) start_index = partition_size * gpc.get_local_rank(ParallelMode.PARALLEL_1D)
@ -92,11 +89,10 @@ def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
def gather_split_1d_tensor(tensor): def gather_split_1d_tensor(tensor):
"""Opposite of above function, gather values from model parallel ranks. """Opposite of above function, gather values from model parallel ranks.
:param tensor: Tensor to be gathered after communication Args:
:type tensor: torch.Tensor tensor (torch.Tensor): Tensor to be gathered after communication.
Returns:
:return gathered: The gathered tensor torch.Tensor: The gathered tensor.
:rtype gathered: torch.Tensor
""" """
world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D) world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
numel = torch.numel(tensor) numel = torch.numel(tensor)
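A single-process sketch of what the split/gather pair above does to a tensor's layout; ``world_size`` and ``rank`` are hypothetical stand-ins for the values the real helpers read from ``gpc``, and ``torch.cat`` stands in for the all-gather::

    import torch

    world_size, rank = 4, 1                      # hypothetical; the real code reads these from gpc
    tensor = torch.arange(16.0)

    # split_tensor_into_1d_equal_chunks: keep only this rank's 1D slice
    partition_size = torch.numel(tensor) // world_size
    start = partition_size * rank
    chunk = tensor.view(-1)[start:start + partition_size]

    # gather_split_1d_tensor: reassemble the flat tensor from every rank's slice
    # (torch.cat here; the real helper uses an all-gather across PARALLEL_1D)
    chunks = [tensor.view(-1)[r * partition_size:(r + 1) * partition_size] for r in range(world_size)]
    gathered = torch.cat(chunks)
    assert torch.equal(gathered, tensor.view(-1))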


@ -12,8 +12,8 @@ class Config(dict):
"""This is a wrapper class for dict objects so that values of which can be """This is a wrapper class for dict objects so that values of which can be
accessed as attributes. accessed as attributes.
:param config: The dict object to be wrapped Args:
:type config: dict config (dict): The dict object to be wrapped.
""" """
def __init__(self, config: dict = None): def __init__(self, config: dict = None):
@ -50,12 +50,14 @@ class Config(dict):
def from_file(filename: str): def from_file(filename: str):
"""Reads a python file and constructs a corresponding :class:`Config` object. """Reads a python file and constructs a corresponding :class:`Config` object.
:param filename: Name of the file to construct the return object Args:
:type filename: str filename (str): Name of the file to construct the return object.
:raises AssertionError: Raises an AssertionError if the file does not exist, or the file
is not .py file Returns:
:return: A :class:`Config` object constructed with information in the file :class:`Config`: A :class:`Config` object constructed with information in the file.
:rtype: :class:`Config`
Raises:
AssertionError: Raises an AssertionError if the file does not exist, or the file is not a .py file.
""" """
# check config path # check config path
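``Config`` is essentially a dict whose keys can also be read as attributes. A minimal, hypothetical stand-in (the real class additionally wraps nested dicts and loads ``.py`` files via ``from_file``) looks like this::

    class AttrDict(dict):
        """Toy dict with attribute access, illustrating the idea behind Config."""

        def __getattr__(self, name):
            try:
                return self[name]
            except KeyError as exc:
                raise AttributeError(name) from exc

        def __setattr__(self, name, value):
            self[name] = value

    cfg = AttrDict(parallel=AttrDict(pipeline=2, tensor=AttrDict(size=4, mode='2d')))
    assert cfg.parallel.tensor.size == 4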


@ -22,6 +22,10 @@ class ParallelContext(metaclass=SingletonMeta):
"""This class provides interface functions for users to get the parallel context, """This class provides interface functions for users to get the parallel context,
such as the global rank, the local rank, the world size, etc. of each device. such as the global rank, the local rank, the world size, etc. of each device.
Note:
The parallel_mode used in this class should be concluded in ``ParallelMode``.
More details about ``ParallelMode`` could be found in
`parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
def __init__(self): def __init__(self):
@ -62,10 +66,12 @@ class ParallelContext(metaclass=SingletonMeta):
def load_config(self, config: Union[dict, str]): def load_config(self, config: Union[dict, str]):
"""Loads the configuration from either a dict or a file. """Loads the configuration from either a dict or a file.
:param config: Either a dict containing the configuration information or the filename Args:
of a file containing the configuration information config (dict or str): Either a dict containing the configuration information or the filename
:type config: dict or str of a file containing the configuration information.
:raises TypeError: Raises a TypeError if `config` is neither a dict or a str
Raises:
TypeError: Raises a TypeError if `config` is neither a dict nor a str.
""" """
if isinstance(config, str): if isinstance(config, str):
self._config = Config.from_file(config) self._config = Config.from_file(config)
@ -81,20 +87,21 @@ class ParallelContext(metaclass=SingletonMeta):
def get_global_rank(self): def get_global_rank(self):
"""Returns the global rank of the current device. """Returns the global rank of the current device.
:return: The global rank of the current device Returns:
:rtype: int int: The global rank of the current device
""" """
return self._global_ranks[ParallelMode.GLOBAL] return self._global_ranks[ParallelMode.GLOBAL]
def add_global_rank(self, parallel_mode: ParallelMode, rank: int): def add_global_rank(self, parallel_mode: ParallelMode, rank: int):
"""Adds the global rank of the current device for `parallel_mode` to the context. """Adds the global rank of the current device for `parallel_mode` to the context.
:param parallel_mode: The parallel mode for the rank Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank.
:param rank: The rank to be added rank (int): The rank to be added
:type rank: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance Raises:
of :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
self._global_ranks[parallel_mode] = rank self._global_ranks[parallel_mode] = rank
@ -102,12 +109,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_local_rank(self, parallel_mode: ParallelMode): def get_local_rank(self, parallel_mode: ParallelMode):
"""Returns the local rank of the current device. """Returns the local rank of the current device.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: The local rank of the current device for `parallel_mode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: int of :class:`colossalai.context.ParallelMode`.
Returns:
int: The local rank of the current device for `parallel_mode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
return self._local_ranks[parallel_mode] return self._local_ranks[parallel_mode]
@ -115,12 +125,13 @@ class ParallelContext(metaclass=SingletonMeta):
def add_local_rank(self, parallel_mode: ParallelMode, rank: int): def add_local_rank(self, parallel_mode: ParallelMode, rank: int):
"""Adds the local rank of the current device for `parallel_mode` to the context. """Adds the local rank of the current device for `parallel_mode` to the context.
:param parallel_mode: The parallel mode for the rank Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank.
:param rank: The rank to be added rank (int): The rank to be added.
:type rank: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance Raises:
of :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
self._local_ranks[parallel_mode] = rank self._local_ranks[parallel_mode] = rank
@ -128,12 +139,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_next_global_rank(self, parallel_mode: ParallelMode): def get_next_global_rank(self, parallel_mode: ParallelMode):
"""Returns the global rank of the next device. """Returns the global rank of the next device.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: The global rank of the next device for `parallel_mode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: int of :class:`colossalai.context.ParallelMode`.
Returns:
int: The global rank of the next device for `parallel_mode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
@ -147,12 +161,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_prev_global_rank(self, parallel_mode: ParallelMode): def get_prev_global_rank(self, parallel_mode: ParallelMode):
"""Returns the global rank of the previous device. """Returns the global rank of the previous device.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: The global rank of the previous device for `parallel_mode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: int of :class:`colossalai.context.ParallelMode`.
Returns:
int: The global rank of the previous device for `parallel_mode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
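The next/previous lookups above amount to modular arithmetic over the group's rank list; with hypothetical values::

    ranks_in_group = [0, 2, 4, 6]          # hypothetical pipeline group
    local_rank = 1                          # this process is global rank 2
    world_size = len(ranks_in_group)

    next_global_rank = ranks_in_group[(local_rank + 1) % world_size]   # 4
    prev_global_rank = ranks_in_group[(local_rank - 1) % world_size]   # 0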
@ -167,13 +184,16 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns a boolean value indicating whether the current device is the first one """Returns a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`. among its group for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: a boolean value indicating whether the current device is the first one AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
among its group for `parallel_mode` of :class:`colossalai.context.ParallelMode`.
:rtype: bool
Returns:
bool: a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`.
""" """
rank = self.get_local_rank(parallel_mode) rank = self.get_local_rank(parallel_mode)
return rank == 0 return rank == 0
@ -182,13 +202,16 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns a boolean value indicating whether the current device is the last one """Returns a boolean value indicating whether the current device is the last one
among its group for `parallel_mode`. among its group for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: a boolean value indicating whether the current device is the last one AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
among its group for `parallel_mode` of :class:`colossalai.context.ParallelMode`.
:rtype: bool
Returns:
bool: a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`.
""" """
rank = self.get_local_rank(parallel_mode) rank = self.get_local_rank(parallel_mode)
world_size = self.get_world_size(parallel_mode) world_size = self.get_world_size(parallel_mode)
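Similarly, ``is_first_rank`` and ``is_last_rank`` reduce to comparing the local rank against the group boundaries::

    local_rank, world_size = 3, 4           # hypothetical values for one parallel mode

    is_first_rank = local_rank == 0                  # False
    is_last_rank = local_rank == world_size - 1      # True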
@ -210,12 +233,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_world_size(self, parallel_mode: ParallelMode): def get_world_size(self, parallel_mode: ParallelMode):
"""Returns the world size for `parallel_mode`. """Returns the world size for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: The world size for `parallel_mode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: int of :class:`colossalai.context.ParallelMode`.
Returns:
int: The world size for `parallel_mode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
return self._world_sizes[parallel_mode] return self._world_sizes[parallel_mode]
@ -223,12 +249,13 @@ class ParallelContext(metaclass=SingletonMeta):
def add_world_size(self, parallel_mode: ParallelMode, world_size: int): def add_world_size(self, parallel_mode: ParallelMode, world_size: int):
"""Adds world size for `parallel_mode`. """Adds world size for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param world_size: The world size to be added world_size (int): The world size to be added
:type world_size: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance Raises:
of :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
self._world_sizes[parallel_mode] = world_size self._world_sizes[parallel_mode] = world_size
@ -236,12 +263,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_group(self, parallel_mode: ParallelMode): def get_group(self, parallel_mode: ParallelMode):
"""Returns the group of the current device for `parallel_mode`. """Returns the group of the current device for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: The group of the current device for `parallel_mode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: torch.distributed.ProcessGroup of :class:`colossalai.context.ParallelMode`.
Returns:
torch.distributed.ProcessGroup: The group of the current device for `parallel_mode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
return self._groups[parallel_mode] return self._groups[parallel_mode]
@ -249,12 +279,13 @@ class ParallelContext(metaclass=SingletonMeta):
def add_group(self, parallel_mode: ParallelMode, group: dist.ProcessGroup): def add_group(self, parallel_mode: ParallelMode, group: dist.ProcessGroup):
"""Adds the group of the current device for `parallel_mode`. """Adds the group of the current device for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param group: The group to be added group (torch.distributed.ProcessGroup): The group to be added
:type group: torch.distributed.ProcessGroup
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance Raises:
of :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
self._groups[parallel_mode] = group self._groups[parallel_mode] = group
@ -262,12 +293,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_ranks_in_group(self, parallel_mode: ParallelMode): def get_ranks_in_group(self, parallel_mode: ParallelMode):
"""Returns the rank of the current device for `parallel_mode` in the group. """Returns the rank of the current device for `parallel_mode` in the group.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode` Raises:
:return: the rank of the current device for `parallel_mode` in the group AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
:rtype: int of :class:`colossalai.context.ParallelMode`.
Returns:
int: The rank of the current device for `parallel_mode` in the group.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
return self._ranks_in_group[parallel_mode] return self._ranks_in_group[parallel_mode]
@ -275,28 +309,26 @@ class ParallelContext(metaclass=SingletonMeta):
def add_ranks_in_group(self, parallel_mode: ParallelMode, ranks: list): def add_ranks_in_group(self, parallel_mode: ParallelMode, ranks: list):
"""Adds the ranks of the current device for `parallel_mode` in the group. """Adds the ranks of the current device for `parallel_mode` in the group.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param ranks: List of ranks to be added ranks (list): List of ranks to be added
:type ranks: list
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance Raises:
of :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
""" """
self._check_parallel_mode(parallel_mode) self._check_parallel_mode(parallel_mode)
self._ranks_in_group[parallel_mode] = ranks self._ranks_in_group[parallel_mode] = ranks
def init_global_dist(self, rank: int, world_size: int, backend: str, host: str, port: int): def init_global_dist(self, rank: int, world_size: int, backend: str, host: str, port: int):
"""Initializes the global distributed environment """Initializes the global distributed environment
:param rank: rank for the default process group
:type rank: int Args:
:param world_size: world size of the default process group rank (int): rank for the default process group.
:type world_size: int world_size (int): world size of the default process group.
:param host: the master address for distributed training backend (str): backend for ``torch.distributed``
:type host: str host (str): the master address for distributed training.
:param port: the master port for distributed training port (int): the master port for distributed training.
:type port: str
:param backend: backend for torch.distributed
:type backend: str
""" """
# initialize the default process group # initialize the default process group
init_method = f'tcp://{host}:{port}' init_method = f'tcp://{host}:{port}'
@ -315,8 +347,9 @@ class ParallelContext(metaclass=SingletonMeta):
def check_sanity(self): def check_sanity(self):
"""Checks sanity of the parallel context. """Checks sanity of the parallel context.
:raises AssertionError: Raises an AssertionError if the world size does not equal to the product Raises:
of data paralle size, pipeline parallel size and tensor parallel size AssertionError: Raises an AssertionError if the world size does not equal the product
of data parallel size, pipeline parallel size and tensor parallel size.
""" """
dps = self.data_parallel_size dps = self.data_parallel_size
pps = self.pipeline_parallel_size pps = self.pipeline_parallel_size
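The condition ``check_sanity`` enforces is simply that the three parallel sizes factor the world size, e.g. with hypothetical sizes for an 8-GPU job::

    world_size = 8
    data_parallel_size, pipeline_parallel_size, tensor_parallel_size = 2, 2, 2

    assert world_size == data_parallel_size * pipeline_parallel_size * tensor_parallel_size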
@ -341,7 +374,8 @@ class ParallelContext(metaclass=SingletonMeta):
def init_parallel_groups(self): def init_parallel_groups(self):
"""Initializes the parallel groups. """Initializes the parallel groups.
:raises AssertionError: Raises an AssertionError if the field paralle is not present in the config file Raises:
AssertionError: Raises an AssertionError if the field parallel is not present in the config file.
""" """
# get rank and world size # get rank and world size
@ -411,11 +445,11 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns a boolean value indicating whether `parallel_mode` is initialized """Returns a boolean value indicating whether `parallel_mode` is initialized
in the current system. in the current system.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:return: a boolean value indicating whether `parallel_mode` is initialized
in the current system Returns:
:rtype: bool bool: a boolean value indicating whether `parallel_mode` is initialized in the current system.
""" """
return parallel_mode in self._groups return parallel_mode in self._groups
@ -432,8 +466,8 @@ class ParallelContext(metaclass=SingletonMeta):
def set_device(self, device_ordinal: int = None): def set_device(self, device_ordinal: int = None):
"""Sets distributed processes to be bound to devices. """Sets distributed processes to be bound to devices.
:param device_ordinal: the device id to be bound to Args:
:type device_ordinal: int, optional device_ordinal (int, optional): the device id to be bound to
""" """
global_rank = self.get_global_rank() global_rank = self.get_global_rank()
if device_ordinal is None: if device_ordinal is None:
@ -447,8 +481,8 @@ class ParallelContext(metaclass=SingletonMeta):
def set_seed(self, seed: int): def set_seed(self, seed: int):
"""Sets seeds for all random libraries. """Sets seeds for all random libraries.
:param seed: seed for random states Args:
:type seed: int seed (int): seed for random states
""" """
random.seed(seed) random.seed(seed)
np.random.seed(seed) np.random.seed(seed)


@ -11,8 +11,16 @@ from .process_group_initializer import ProcessGroupInitializer
@DIST_GROUP_INITIALIZER.register_module @DIST_GROUP_INITIALIZER.register_module
class Initializer_1D(ProcessGroupInitializer): class Initializer_1D(ProcessGroupInitializer):
'''A ProcessGroupInitializer for 1d tensor parallelism. """A ProcessGroupInitializer for 1d tensor parallelism.
'''
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -20,8 +28,10 @@ class Initializer_1D(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 1D tensor parallel groups, and assign local_ranks and groups to each gpu. """Initialize 1D tensor parallel groups, and assign local_ranks and groups to each gpu.
:return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
:rtype: Tuple Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
1D tensor parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None


@ -22,12 +22,16 @@ def _check_summa_env_var(summa_dim):
class Initializer_2D_Row(ProcessGroupInitializer): class Initializer_2D_Row(ProcessGroupInitializer):
"""2d tensor parallel initialization among rows. """2d tensor parallel initialization among rows.
:param num_group: The number of all tensor groups
:param summa_dim: The dimension of SUMMA Args:
:param args: Args used to initialize base class num_group (int): The number of all tensor groups.
:param kwargs: Kwargs used to initialize base class summa_dim (int): The dimension of SUMMA.
:type num_group: int rank (int): The rank of current process.
:type summa_dim: int world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, num_group, summa_dim, *args, **kwargs): def __init__(self, num_group, summa_dim, *args, **kwargs):
@ -37,9 +41,9 @@ class Initializer_2D_Row(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu.
Returns:
:return: 2D tensor row parallelism's information Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) 2D tensor row parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -64,13 +68,15 @@ class Initializer_2D_Row(ProcessGroupInitializer):
class Initializer_2D_Col(ProcessGroupInitializer): class Initializer_2D_Col(ProcessGroupInitializer):
"""2d tensor parallel initialization among cols. """2d tensor parallel initialization among cols.
:param num_group: The number of all tensor groups Args:
:param summa_dim: The dimension of SUMMA num_group (int): The number of all tensor groups.
:param args: Args used to initialize base class summa_dim (int): The dimension of SUMMA.
:param kwargs: Kwargs used to initialize base class rank (int): The rank of current process.
world_size (int): Size of whole communication world.
:type num_group: int config (Config): Running configuration.
:type summa_dim: int data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, num_group, summa_dim, *args, **kwargs): def __init__(self, num_group, summa_dim, *args, **kwargs):
@ -81,8 +87,9 @@ class Initializer_2D_Col(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu.
:return: 2D tensor col parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2D tensor col parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -109,8 +116,13 @@ class Initializer_2D(ProcessGroupInitializer):
""" """
Serve as the single entry point to 2D parallel initialization. Serve as the single entry point to 2D parallel initialization.
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -127,8 +139,10 @@ class Initializer_2D(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2D tensor row and col parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2D tensor row and col parallel groups, and assign local_ranks and groups to each gpu.
:return: 2D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode) Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
2D tensor parallelism's information in a list of tuples.
""" """
parallel_setting = [self.row_initializer.init_dist_group(), self.col_initializer.init_dist_group()] parallel_setting = [self.row_initializer.init_dist_group(), self.col_initializer.init_dist_group()]
return parallel_setting return parallel_setting
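To make the 2D (SUMMA) grouping concrete, here is one plausible rank layout for ``tensor_parallel_size = 4`` on 8 GPUs; the exact ordering ColossalAI uses may differ, so the numbers are illustrative only::

    import math

    world_size, tensor_parallel_size = 8, 4           # hypothetical
    summa_dim = int(math.sqrt(tensor_parallel_size))  # 2
    num_group = world_size // tensor_parallel_size    # 2 independent tensor groups

    for g in range(num_group):
        offset = g * tensor_parallel_size
        row_groups = [[offset + r * summa_dim + c for c in range(summa_dim)] for r in range(summa_dim)]
        col_groups = [[offset + r * summa_dim + c for r in range(summa_dim)] for c in range(summa_dim)]
        print(g, row_groups, col_groups)
        # group 0 -> rows [[0, 1], [2, 3]], cols [[0, 2], [1, 3]]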


@ -31,14 +31,17 @@ def _check_tesseract_env_var(tesseract_dim: int, tesseract_dep: int):
# i row j col k dep # i row j col k dep
class Initializer_2p5D_ROW(ProcessGroupInitializer): class Initializer_2p5D_ROW(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among rows. """2.5d tensor parallel initialization among rows.
:param tesseract_dim: The dimension of tesseract Args:
:param tesseract_dep: The dimension of depth tesseract_dim (int): The dimension of tesseract.
:param args: Args used to initialize base class tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
:type tesseract_dim: int world_size (int): Size of whole communication world.
:type tesseract_dep: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -50,10 +53,11 @@ class Initializer_2p5D_ROW(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel" "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2p5D tensor row parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2.5D tensor row parallel groups, and assign local_ranks and groups to each gpu.
:return: 2p5D tensor row parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor row parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -80,14 +84,17 @@ class Initializer_2p5D_ROW(ProcessGroupInitializer):
class Initializer_2p5D_Col(ProcessGroupInitializer): class Initializer_2p5D_Col(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among cols. """2.5d tensor parallel initialization among cols.
:param tesseract_dim: The dimension of tesseract Args:
:param tesseract_dep: The dimension of depth tesseract_dim (int): The dimension of tesseract.
:param args: Args used to initialize base class tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
:type tesseract_dim: int world_size (int): Size of whole communication world.
:type tesseract_dep: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -99,10 +106,11 @@ class Initializer_2p5D_Col(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel" "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2p5D tensor col parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2.5D tensor col parallel groups, and assign local_ranks and groups to each gpu.
:return: 2p5D tensor col parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor col parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -129,14 +137,17 @@ class Initializer_2p5D_Col(ProcessGroupInitializer):
class Initializer_2p5D_Dep(ProcessGroupInitializer): class Initializer_2p5D_Dep(ProcessGroupInitializer):
"""2p5D tensor parallel initialization among depths. """2.5D tensor parallel initialization among depths.
:param tesseract_dim: The dimension of tesseract Args:
:param tesseract_dep: The dimension of depth tesseract_dim (int): The dimension of tesseract.
:param args: Args used to initialize base class tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
:type tesseract_dim: int world_size (int): Size of whole communication world.
:type tesseract_dep: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -148,10 +159,11 @@ class Initializer_2p5D_Dep(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel" "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2p5D tensor depth parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2.5D tensor depth parallel groups, and assign local_ranks and groups to each gpu.
:return: 2p5D tensor depth parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor depth parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -179,14 +191,17 @@ class Initializer_2p5D_Dep(ProcessGroupInitializer):
# i row j col k dep # i row j col k dep
class Initializer_2p5D_XZ(ProcessGroupInitializer): class Initializer_2p5D_XZ(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among cols times dep. """2.5d tensor parallel initialization among cols times dep.
:param tesseract_dim: The dimension of tesseract Args:
:param tesseract_dep: The dimension of depth tesseract_dim (int): The dimension of tesseract.
:param args: Args used to initialize base class tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
:type tesseract_dim: int world_size (int): Size of whole communication world.
:type tesseract_dep: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -198,10 +213,11 @@ class Initializer_2p5D_XZ(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel" "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2p5D tensor colXdepth parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2.5D tensor colXdepth parallel groups, and assign local_ranks and groups to each gpu.
:return: 2p5D tensor colXdepth parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor colXdepth parallelism's information in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -232,20 +248,14 @@ class Initializer_2p5D(ProcessGroupInitializer):
""" """
Serve as the single entry point to Tesseract parallel initialization. Serve as the single entry point to Tesseract parallel initialization.
:param rank: The rank of current process Args:
:param world_size: Size of whole communication world rank (int): The rank of current process.
:param config: Running configuration world_size (int): Size of whole communication world.
:param data_parallel_size: Size of data parallel config (Config): Running configuration.
:param pipeline_parallel_size: Size of pipeline parallel data_parallel_size (int): Size of data parallel.
:param tensor_parallel_size: Size of tensor parallel pipeline_parallel_size (int): Size of pipeline parallel.
:param depth: The depth of 2p5d parallel tensor_parallel_size (int): Size of tensor parallel.
:type rank: int depth (int): The depth of 2.5d parallel.
:type world_size: int
:type config: Config
:type data_parallel_size: int
:type pipeline_parallel_size: int
:type tensor_parallel_size: int
:type depth: int
""" """
def __init__(self, rank: int, world_size: int, config: Config, data_parallel_size: int, pipeline_parallel_size: int, def __init__(self, rank: int, world_size: int, config: Config, data_parallel_size: int, pipeline_parallel_size: int,
@ -266,9 +276,11 @@ class Initializer_2p5D(ProcessGroupInitializer):
self.xz_initializer = Initializer_2p5D_XZ(self.tesseract_dim, self.tesseract_dep, *args) self.xz_initializer = Initializer_2p5D_XZ(self.tesseract_dim, self.tesseract_dep, *args)
def init_dist_group(self): def init_dist_group(self):
"""Initialize 2p5D tensor row, col, depth, and colXdepth parallel groups, and assign local_ranks and groups to each gpu. """Initialize 2.5D tensor row, col, depth, and colXdepth parallel groups, and assign local_ranks and groups to each gpu.
:return: Whole 2p5D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode) Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Whole 2.5D tensor parallelism's information in a list of tuples.
""" """
parallel_setting = [ parallel_setting = [
self.col_initializer.init_dist_group(), self.col_initializer.init_dist_group(),
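For the 2.5D case, the tesseract dimension follows from the tensor parallel size and the chosen depth; the assertion mirrors the one shown in the initializers above. With hypothetical numbers::

    import math

    tensor_parallel_size, depth = 8, 2                           # hypothetical 2.5D setting
    tesseract_dim = int(math.sqrt(tensor_parallel_size / depth))  # 2

    assert tensor_parallel_size == depth * tesseract_dim ** 2, \
        "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"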


@ -26,12 +26,15 @@ def _check_depth_env_var(depth):
class Initializer_3D_Input(ProcessGroupInitializer): class Initializer_3D_Input(ProcessGroupInitializer):
"""3D tensor parallel initialization among input. """3D tensor parallel initialization among input.
:param num_group: The number of all tensor groups Args:
:param depth: Depth of 3D parallelism num_group (int): The number of all tensor groups.
:param args: Args used in base class depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
:type num_group: int world_size (int): Size of whole communication world.
:type depth: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, num_group: int, depth: int, *args): def __init__(self, num_group: int, depth: int, *args):
@ -42,8 +45,9 @@ class Initializer_3D_Input(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 3D tensor parallel groups among input, and assign local_ranks and groups to each gpu. """Initialize 3D tensor parallel groups among input, and assign local_ranks and groups to each gpu.
:return: 3D tensor parallelism's information among input Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among input in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -70,12 +74,15 @@ class Initializer_3D_Input(ProcessGroupInitializer):
class Initializer_3D_Weight(ProcessGroupInitializer): class Initializer_3D_Weight(ProcessGroupInitializer):
"""3D tensor parallel initialization among weight. """3D tensor parallel initialization among weight.
:param num_group: The number of all tensor groups Args:
:param depth: Depth of 3D parallelism num_group (int): The number of all tensor groups.
:param args: Args used in base class depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
:type num_group: int world_size (int): Size of whole communication world.
:type depth: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, num_group: int, depth: int, *args): def __init__(self, num_group: int, depth: int, *args):
@ -86,8 +93,9 @@ class Initializer_3D_Weight(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 3D tensor parallel groups among weight, and assign local_ranks and groups to each gpu. """Initialize 3D tensor parallel groups among weight, and assign local_ranks and groups to each gpu.
:return: 3D tensor parallelism's information among weight Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among weight in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -114,12 +122,15 @@ class Initializer_3D_Weight(ProcessGroupInitializer):
class Initializer_3D_Output(ProcessGroupInitializer): class Initializer_3D_Output(ProcessGroupInitializer):
"""3D tensor parallel initialization among output. """3D tensor parallel initialization among output.
:param num_group: The number of all tensor groups Args:
:param depth: Depth of 3D parallelism num_group (int): The number of all tensor groups.
:param args: Args used in base class depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
:type num_group: int world_size (int): Size of whole communication world.
:type depth: int config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, num_group: int, depth: int, *args): def __init__(self, num_group: int, depth: int, *args):
@ -130,8 +141,9 @@ class Initializer_3D_Output(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 3D tensor parallel groups among output, and assign local_ranks and groups to each gpu. """Initialize 3D tensor parallel groups among output, and assign local_ranks and groups to each gpu.
:return: 3D tensor parallelism's information among output Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among output in a tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -158,7 +170,14 @@ class Initializer_3D_Output(ProcessGroupInitializer):
@DIST_GROUP_INITIALIZER.register_module @DIST_GROUP_INITIALIZER.register_module
class Initializer_3D(ProcessGroupInitializer): class Initializer_3D(ProcessGroupInitializer):
"""Serve as the single entry point to 3D parallel initialization. """Serve as the single entry point to 3D parallel initialization.
:param args: Args used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, *args): def __init__(self, *args):
@ -175,8 +194,10 @@ class Initializer_3D(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize 3D tensor parallel groups, and assign local_ranks and groups to each gpu. """Initialize 3D tensor parallel groups, and assign local_ranks and groups to each gpu.
:return: 3D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode) Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Whole 3D tensor parallelism's information in a list of tuples.
""" """
parallel_setting = [ parallel_setting = [
self.input_initializer.init_dist_group(), self.input_initializer.init_dist_group(),
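For 3D parallelism the depth is the cube root of the tensor parallel size, with one input, weight and output group per tensor group. Again with hypothetical numbers::

    world_size, tensor_parallel_size = 8, 8           # hypothetical
    depth = round(tensor_parallel_size ** (1 / 3))    # 2

    assert depth ** 3 == tensor_parallel_size
    num_group = world_size // tensor_parallel_size    # a single 3D tensor group here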


@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Data(ProcessGroupInitializer): class Initializer_Data(ProcessGroupInitializer):
"""A ProcessGroupInitializer for data parallelism. """A ProcessGroupInitializer for data parallelism.
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -22,8 +27,9 @@ class Initializer_Data(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize data parallel groups, and assign local_ranks and groups to each gpu. """Initialize data parallel groups, and assign local_ranks and groups to each gpu.
:return: Data parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Data parallelism's information tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None


@ -12,8 +12,13 @@ class Initializer_Model(ProcessGroupInitializer):
"""A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel """A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel
groups). groups).
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -24,8 +29,9 @@ class Initializer_Model(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize model parallel groups, and assign local_ranks and groups to each gpu. """Initialize model parallel groups, and assign local_ranks and groups to each gpu.
:return: (local_rank, group_world_size, process_group, ranks_in_group, mode) Returns:
:rtype: Tuple Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Model parallelism's information tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None


@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Pipeline(ProcessGroupInitializer): class Initializer_Pipeline(ProcessGroupInitializer):
"""A ProcessGroupInitializer for pipeline parallelism. """A ProcessGroupInitializer for pipeline parallelism.
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process
world_size (int): Size of whole communication world
config (Config): Running configuration
data_parallel_size (int): Size of data parallel
pipeline_parallel_size (int): Size of pipeline parallel
tensor_parallel_size (int): Size of tensor parallel
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -23,8 +28,9 @@ class Initializer_Pipeline(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu. """Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu.
:return: Pipeline parallelism's information Returns:
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode) List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Pipeline parallelism's information in a list of tuples.
""" """
dist_settings = list() dist_settings = list()
for i in range(self.data_parallel_size): for i in range(self.data_parallel_size):


@ -15,8 +15,13 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
In Sequence Parallelism, each GPU holds the full copy of model weights, In Sequence Parallelism, each GPU holds the full copy of model weights,
thus, gradient all-reduce occurs across all processes in the same pipeline stage thus, gradient all-reduce occurs across all processes in the same pipeline stage
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process
world_size (int): Size of whole communication world
config (Config): Running configuration
data_parallel_size (int): Size of data parallel
pipeline_parallel_size (int): Size of pipeline parallel
tensor_parallel_size (int): Size of tensor parallel
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -27,8 +32,8 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize Sequence Parallel process groups used for gradient all-reduce. """Initialize Sequence Parallel process groups used for gradient all-reduce.
:return: (local_rank, group_world_size, process_group, ranks_in_group, mode) Returns:
:rtype: Tuple Tuple: A tuple (local_rank, group_world_size, process_group, ranks_in_group, mode).
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None
@ -52,8 +57,13 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
class Initializer_Sequence(ProcessGroupInitializer): class Initializer_Sequence(ProcessGroupInitializer):
"""A ProcessGroupInitializer for sequence parallelism. """A ProcessGroupInitializer for sequence parallelism.
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, def __init__(self,
*args, **kwargs): *args, **kwargs):
@ -66,11 +76,12 @@ class Initializer_Sequence(ProcessGroupInitializer):
"""Initialize Sequence parallel process groups and assign local_ranks and groups to each gpu. """Initialize Sequence parallel process groups and assign local_ranks and groups to each gpu.
Sequence parallelism requires 2 process groups. The first is for model forward where several processes Sequence parallelism requires 2 process groups. The first is for model forward where several processes
exchange paritial query, key and value embedding to compute self attention values. The second is for exchange partial query, key and value embedding to compute self attention values. The second is for
all-reduce to synchronize the model parameters. all-reduce to synchronize the model parameters.
:return: Sequence parallelism's information Returns:
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode) List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Sequence parallelism's information in a list of tuples.
""" """
parallel_setting = [] parallel_setting = []


@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Tensor(ProcessGroupInitializer): class Initializer_Tensor(ProcessGroupInitializer):
"""A ProcessGroupInitializer for tensor parallelism. """A ProcessGroupInitializer for tensor parallelism.
:param args: Args used to initialize ProcessGroupInitializer Args:
:param kwargs: Kwargs used to initialize ProcessGroupInitializer rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -22,8 +27,9 @@ class Initializer_Tensor(ProcessGroupInitializer):
def init_dist_group(self): def init_dist_group(self):
"""Initialize tensor parallel groups, and assign local_ranks and groups to each gpu. """Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.
:return: Tensor parallelism's information Returns:
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode) Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Tensor parallelism's information tuple.
""" """
local_rank = None local_rank = None
ranks_in_group = None ranks_in_group = None


@ -9,19 +9,13 @@ from colossalai.context import Config
class ProcessGroupInitializer(ABC): class ProcessGroupInitializer(ABC):
"""An object, knowing the parallelism configuration, that initializes parallel groups. """An object, knowing the parallelism configuration, that initializes parallel groups.
:param rank: The rank of current process Args:
:param world_size: Size of whole communication world rank (int): The rank of current process.
:param config: Running configuration world_size (int): Size of whole communication world.
:param data_parallel_size: Size of data parallel config (Config): Running configuration.
:param pipeline_parallel_size: Size of pipeline parallel data_parallel_size (int): Size of data parallel.
:param tensor_parallel_size: Size of tensor parallel pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
:type rank: int
:type world_size: int
:type config: Config
:type data_parallel_size: int
:type pipeline_parallel_size: int
:type tensor_parallel_size: int
""" """
def __init__(self, def __init__(self,
rank: int, rank: int,


@ -16,8 +16,8 @@ _SEED_MANAGER = SeedManager()
def get_seeds(): def get_seeds():
"""Returns the seeds of the seed manager. """Returns the seeds of the seed manager.
:return: The seeds of the seed manager Returns:
:rtype: dict dict: The seeds of the seed manager.
""" """
return _SEED_MANAGER.seeds return _SEED_MANAGER.seeds
@ -25,8 +25,8 @@ def get_seeds():
def get_states(copy=False): def get_states(copy=False):
"""Returns the seed states of the seed manager. """Returns the seed states of the seed manager.
:return: The seed states of the seed manager Returns:
:rtype: dict dict: The seed states of the seed manager.
""" """
states = _SEED_MANAGER.seed_states states = _SEED_MANAGER.seed_states
@ -43,8 +43,8 @@ def get_states(copy=False):
def get_current_mode(): def get_current_mode():
"""Returns the current mode of the seed manager. """Returns the current mode of the seed manager.
:return: The current mode of the seed manager. Returns:
:rtype: :class:`torch.ByteTensor` :class:`torch.ByteTensor`: The current mode of the seed manager.
""" """
return _SEED_MANAGER.current_mode return _SEED_MANAGER.current_mode
@ -52,12 +52,16 @@ def get_current_mode():
def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False): def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`. """Adds a seed to the seed manager for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param seed: The seed to be added seed (int): The seed to be added
:type seed: int Raises:
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
_SEED_MANAGER.add_seed(parallel_mode, seed, overwrite) _SEED_MANAGER.add_seed(parallel_mode, seed, overwrite)
@ -65,8 +69,12 @@ def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
def set_mode(parallel_mode: ParallelMode): def set_mode(parallel_mode: ParallelMode):
"""Sets the current mode of the seed manager. """Sets the current mode of the seed manager.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
_SEED_MANAGER.set_mode(parallel_mode) _SEED_MANAGER.set_mode(parallel_mode)
@ -74,11 +82,12 @@ def set_mode(parallel_mode: ParallelMode):
def set_seed_states(parallel_mode: ParallelMode, state: Tensor): def set_seed_states(parallel_mode: ParallelMode, state: Tensor):
"""Sets the state of the seed manager for `parallel_mode`. """Sets the state of the seed manager for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param state: the state to be set state (:class:`torch.Tensor`): the state to be set.
:type state: :class:`torch.Tensor`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager.
""" """
_SEED_MANAGER.set_state(parallel_mode, state) _SEED_MANAGER.set_state(parallel_mode, state)
@ -98,6 +107,9 @@ def seed(parallel_mode: ParallelMode):
with seed(ParallelMode.DATA): with seed(ParallelMode.DATA):
output = F.dropout(input) output = F.dropout(input)
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
try: try:
# set to new mode # set to new mode
@ -125,6 +137,9 @@ def with_seed(func, parallel_mode: ParallelMode):
wrapper_forward = with_seed(forward, ParallelMode.DATA) wrapper_forward = with_seed(forward, ParallelMode.DATA)
out = wrapped_forward(input) out = wrapped_forward(input)
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
@functools.wraps(func) @functools.wraps(func)
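A minimal, hedged sketch of how these seed utilities fit together; the import path follows this file's location, the seed value is illustrative, ``input_tensor`` is a placeholder, and a CUDA setup is assumed since the manager tracks per-mode RNG states:

>>> import torch.nn.functional as F
>>> from colossalai.context import ParallelMode
>>> from colossalai.context.random import add_seed, seed, set_mode
>>> add_seed(ParallelMode.DATA, 1024)     # register an RNG state for the DATA mode
>>> set_mode(ParallelMode.DATA)           # make it the current state
>>> with seed(ParallelMode.DATA):         # switch temporarily, then restore the previous state
...     output = F.dropout(input_tensor)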


@ -9,6 +9,10 @@ from colossalai.context.parallel_mode import ParallelMode
class SeedManager: class SeedManager:
"""This class is a manager of all random seeds involved in the system. """This class is a manager of all random seeds involved in the system.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
def __init__(self): def __init__(self):
@ -30,12 +34,12 @@ class SeedManager:
def set_state(self, parallel_mode: ParallelMode, state: Tensor): def set_state(self, parallel_mode: ParallelMode, state: Tensor):
"""Sets the state of the seed manager for `parallel_mode`. """Sets the state of the seed manager for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
state (:class:`torch.Tensor`): the state to be set.
:param parallel_mode: The chosen parallel mode Raises:
:type parallel_mode: :class:`colossalai.context.ParallelMode` AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager.
:param state: the state to be set
:type state: :class:`torch.Tensor`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
""" """
assert parallel_mode in self._seed_states, f'Parallel mode {parallel_mode} is not found in the seed manager' assert parallel_mode in self._seed_states, f'Parallel mode {parallel_mode} is not found in the seed manager'
self._seed_states[parallel_mode] = state self._seed_states[parallel_mode] = state
@ -43,8 +47,8 @@ class SeedManager:
def set_mode(self, parallel_mode: ParallelMode): def set_mode(self, parallel_mode: ParallelMode):
"""Sets the current mode of the seed manager. """Sets the current mode of the seed manager.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
""" """
if self.current_mode: if self.current_mode:
# save the current state for current mode # save the current state for current mode
@ -57,14 +61,14 @@ class SeedManager:
def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False): def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`. """Adds a seed to the seed manager for `parallel_mode`.
:param parallel_mode: The chosen parallel mode Args:
:type parallel_mode: :class:`colossalai.context.ParallelMode` parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
:param seed: The seed to be added seed (int): The seed to be added.
:type seed: int overwrtie (bool, optional): Whether it is allowed to overwrite the seed that has been set already.
:param overwrtie: Whether allows to overwrite the seed that has been set already
:type overwrtie: bool, optional Raises:
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added.
""" """
assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided' assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
if overwrtie is False: if overwrtie is False:


@ -19,20 +19,37 @@ class Engine:
:meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset. :meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.
It controls a iteration in training. It controls a iteration in training.
:param model: The neural network model Args:
:type model: ``torch.nn.Module`` model (``torch.nn.Module``): The neural network model.
:param optimizer: Optimizer for updating the parameters optimizer (``torch.optim.Optimizer``): Optimizer for updating the parameters.
:type optimizer: ``torch.optim.Optimizer`` criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss.
:param criterion: Loss function for calculating loss gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handler used in backward.
:type criterion: ``torch.nn.modules.loss._Loss``, optional clip_grad_norm (float, optional): The norm of gradient clipping.
:param gradient_handlers: A list of gradient handler used in backward ophook_list (list): List of ophook.
:type gradient_handlers: a list of ``BaseGradientHandler``, optional verbose (bool): whether to display log info.
:param clip_grad_norm: The norm of gradient clipping
:type clip_grad_norm: float, optional Examples:
:param ophook_list: List of ophook >>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
:type ophook_list: list >>> model = ...
:param verbose: whether to display log info >>> criterion = ...
:type verbose: bool >>> optimizer = ...
>>> train_dataloader = ...
>>> engine, _, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> engine.train()
>>> for inputs, labels in train_dataloader:
>>> # set gradients to zero
>>> engine.zero_grad()
>>> # run forward pass
>>> outputs = engine(inputs)
>>> # compute loss value and run backward pass
>>> loss = engine.criterion(outputs, labels)
>>> engine.backward(loss)
>>> # update parameters
>>> engine.step()
An example of using Engine in training can be found in
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_ and
`Run resnet cifar10 with engine <https://github.com/hpcaitech/ColossalAI-Examples/blob/main/image/resnet/run_resnet_cifar10_with_engine.py>`_.
""" """
def __init__(self, def __init__(self,
@ -113,10 +130,10 @@ class Engine:
return self.optimizer.step() return self.optimizer.step()
def backward(self, loss: Tensor): def backward(self, loss: Tensor):
"""Start backward propagation given the loss value computed by a loss function """Start backward propagation given the loss value computed by a loss function.
:param loss: Loss value computed by a loss function Args:
:type loss: :class:`torch.Tensor` loss (:class:`torch.Tensor`): Loss value computed by a loss function.
""" """
ret = self.optimizer.backward(loss) ret = self.optimizer.backward(loss)
for ophook in self._ophook_list: for ophook in self._ophook_list:
@ -124,34 +141,22 @@ class Engine:
return ret return ret
def backward_by_grad(self, tensor, grad): def backward_by_grad(self, tensor, grad):
"""Start backward propagation given the gradient of the output tensor """Start backward propagation given the gradient of the output tensor.
:param tensor: Output tensor Args:
:type tensor: :class:`torch.Tensor` tensor (:class:`torch.Tensor`): Output tensor.
:param grad: Gradient passed back to the output grad (:class:`torch.Tensor`): Gradient passed back to the output.
:type grad: :class:`torch.Tensor`
""" """
ret = self.optimizer.backward_by_grad(tensor, grad) ret = self.optimizer.backward_by_grad(tensor, grad)
for ophook in self._ophook_list: for ophook in self._ophook_list:
ophook.post_iter() ophook.post_iter()
return ret return ret
def calc_loss(self, *args, **kwargs):
"""Compute the loss value
:param args: Args used in criterion function
:param kwargs: Kwargs used in criterion function
:return: The loss value
:rtype: :class:`torch.Tensor`
"""
return self.criterion(*args, **kwargs)
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
"""Run the forward step for the model """Run the forward step for the model.
:return: Output the model Returns:
:rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor` Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`: Output of the model.
""" """
return self.model(*args, **kwargs) return self.model(*args, **kwargs)


@ -8,10 +8,9 @@ class BaseGradientHandler(ABC):
"""A basic helper class to handle all-reduce operations of gradients across different parallel groups """A basic helper class to handle all-reduce operations of gradients across different parallel groups
before optimization. before optimization.
:param model: Model where the gradients accumulate Args:
:param optimizer: Optimizer for updating the parameters model (Module): Model where the gradients accumulate.
:type model: Module optimizer (Optimizer): Optimizer for updating the parameters.
:type optimizer: Optimizer
""" """
def __init__(self, model, optimizer): def __init__(self, model, optimizer):
self._model = model self._model = model


@ -17,12 +17,11 @@ import math
class MemTracerOpHook(BaseOpHook): class MemTracerOpHook(BaseOpHook):
""" """
Collect GPU memory usage information Collect GPU memory usage information
:param warmup: This parameter indicates how many iterations to truncate before profiling, defaults to 50
:type warmup: int Args:
:param refreshrate: This parameter decides the frequency of write file, defaults to 10 warmup (int): This parameter indicates how many iterations to truncate before profiling, defaults to 50.
:type refreshrate: int refreshrate (int): This parameter decides how often the stats file is written, defaults to 10.
:param data_prefix: The prefix of the stats data file, defaults to "memstats" data_prefix (string): The prefix of the stats data file, defaults to "memstats".
:type data_prefix: string
""" """
def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"): def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"):
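A hedged sketch of wiring the hook into an Engine; the keyword name ``ophook_list`` follows the Engine docstring above, and ``model``/``optimizer``/``criterion`` are placeholders:

>>> mem_hook = MemTracerOpHook(warmup=50, refreshrate=10, data_prefix="memstats")
>>> engine = Engine(model=model, optimizer=optimizer, criterion=criterion,
...                 ophook_list=[mem_hook])   # memory stats are flushed to memstats* files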


@ -15,8 +15,12 @@ class BaseSchedule(ABC):
"""A basic helper class to control the process of training or evaluation. """A basic helper class to control the process of training or evaluation.
It mainly composes of forward_backward_step for gradient backward and It mainly composes of forward_backward_step for gradient backward and
optimizer_step for parameters update. optimizer_step for parameters update.
For the convenience to enable FP16, we aggreate all codes that contain the For the convenience to enable FP16, we aggregate all codes that contain the
control of FP16 in class schedule. control of FP16 in class schedule.
Args:
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
and it will be executed in load_batch.
""" """
def __init__(self, batch_data_process_func: Callable = None): def __init__(self, batch_data_process_func: Callable = None):
@ -46,13 +50,12 @@ class BaseSchedule(ABC):
"""Loads a batch from data iterator. It returns the data and labels which are """Loads a batch from data iterator. It returns the data and labels which are
already in the same GPU as where the model's. already in the same GPU as where the model's.
:param data_iter: Data iterator from which get a batch of data Args:
:type data_iter: DataIter data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
:param to_gpu: Whether the data should be moved to GPU to_gpu (bool, optional): Whether the data should be moved to GPU
:type to_gpu: bool, optional
:return: (data, label) Returns:
:rtype: (:class:`Tensor`, :class:`torch.Tensor`) Tuple (:class:`torch.Tensor`, :class:`torch.Tensor`): A tuple of (data, label).
""" """
if data_iter is None: if data_iter is None:
raise RuntimeError('Dataloader is not defined.') raise RuntimeError('Dataloader is not defined.')
@ -87,16 +90,12 @@ class BaseSchedule(ABC):
): ):
"""The process function over a batch of dataset for training or evaluation. """The process function over a batch of dataset for training or evaluation.
:param engine: Colossalai training engine Args:
:type engine: colossalai.engine.Engine engine (colossalai.engine.Engine): Colossalai engine for training and inference.
:param data_iter: Data iterator from which get a batch of data data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
:type data_iter: DataIter forward_only (bool): If True, the process won't include backward.
:param forward_only: If True, the process won't include backward return_loss (bool, optional): If False, the loss won't be returned.
:type forward_only: bool return_output_label (bool, optional): If False, the output and label won't be returned.
:param return_loss: If False, the loss won't be returned
:type return_loss: bool, optional
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool, optional
""" """
pass pass
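A hedged sketch of the documented call pattern, assuming ``schedule`` is a concrete subclass instance and ``train_dataloader`` already exists:

>>> data_iter = iter(train_dataloader)
>>> data, label = schedule.load_batch(data_iter, to_gpu=True)   # batch moved to the model's device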


@ -15,6 +15,10 @@ class NonPipelineSchedule(BaseSchedule):
During one process, it loads a batch of dataset and feeds it to the model. During one process, it loads a batch of dataset and feeds it to the model.
After getting the output and calculating the loss, it will use :meth:`step` After getting the output and calculating the loss, it will use :meth:`step`
to update the parameters if it is in training mode. to update the parameters if it is in training mode.
Args:
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
and it will be executed in load_batch.
""" """
def forward_backward_step(self, def forward_backward_step(self,
@ -23,22 +27,19 @@ class NonPipelineSchedule(BaseSchedule):
forward_only: bool = False, forward_only: bool = False,
return_loss: bool = True, return_loss: bool = True,
return_output_label: bool = True): return_output_label: bool = True):
"""The process function that loads loads a batch of dataset and feeds it to the model. """The process function that loads a batch of dataset and feeds it to the model.
The returned labels and loss will be None if :attr:`return_loss` is False. The returned labels and loss will be None if :attr:`return_loss` is False.
:param engine: Model for training and inference Args:
:param data_iter: Data iterator of the dataloader, e.g. iter(dataloader) engine (colossalai.engine.Engine): Colossalai engine for training and inference.
:param forward_only: If True, the model is run for the forward pass, else back propagation will be executed data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
:param return_loss: Loss will be returned if True forward_only (bool, optional):
:param return_output_label: Output and label will be returned if True If True, the model is run for the forward pass, else back propagation will be executed.
:type engine: Iterator return_loss (bool, optional): Loss will be returned if True.
:type data_iter: Iterator return_output_label (bool, optional): Output and label will be returned if True.
:type forward_only: bool, optional
:type return_loss: bool, optional
:type return_output_label: bool, optional
:return: (output, label, loss) Returns:
:rtype: Tuple[:class:`torch.Tensor`] Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
""" """
assert forward_only or return_loss, \ assert forward_only or return_loss, \
"The argument 'return_loss' has to be True when 'forward_only' is False, but got False." "The argument 'return_loss' has to be True when 'forward_only' is False, but got False."


@ -41,14 +41,13 @@ class PipelineSchedule(BaseSchedule):
It uses non-interleaved 1F1B strategy. Other properties are similar as It uses non-interleaved 1F1B strategy. Other properties are similar as
:class:`NonPipelineSchedule`. :class:`NonPipelineSchedule`.
:param num_microbatches: The number of microbatches Args:
:type num_microbatches: int num_microbatches (int): The number of microbatches.
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch` batch_data_process_func (Callable, optional):
:type batch_data_process_func: Callable, optional The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
:param tensor_shape: Specified shape in pipeline communication tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
:type tensor_shape: torch.Size, optional scatter_gather_tensors (bool, optional):
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
:type scatter_gather_tensors: bool, optional
""" """
def __init__(self, def __init__(self,
@ -131,19 +130,14 @@ class PipelineSchedule(BaseSchedule):
is obtained from data_iterator, otherwise the passed-in input_tensor is used. is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users. Returns output tensor. This is a helper function and can be ignored by users.
:param engine: Your engine object Args:
:type engine: colossalai.engine.Engine engine (colossalai.engine.Engine): Colossalai engine for training and inference.
:param input_tensor: Input tensor for this pipeline stage input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
:type input_tensor: :class:`torch.Tensor` return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
:param return_tensors: A list of tensors to return return_output_label (bool, optional): Whether returns output labels.
:type return_tensors: List[:class:`torch.Tensor`] accum_loss (optional): Where the accumulated loss is stored.
:param return_output_label: Whether returns output labels Returns:
:type return_output_label: bool, optional :class:`torch.Tensor`: output or the loss value of the current pipeline stage.
:param accum_loss: Where accumulated loss stores
:type accum_loss: optional
:return: output or the loss value of the current pipeline stage
:rtype: :class:`torch.Tensor`
""" """
data, label = self.load_micro_batch() data, label = self.load_micro_batch()
output_tensor = self._call_engine(engine.model, input_tensor, data) output_tensor = self._call_engine(engine.model, input_tensor, data)
@ -173,17 +167,14 @@ class PipelineSchedule(BaseSchedule):
Returns the gradients with respect to the input tensor (None if first stage). Returns the gradients with respect to the input tensor (None if first stage).
This is a helper function and can be ignored by users. This is a helper function and can be ignored by users.
:param engine: your engine object Args:
:type engine: colossalai.engine.Engine engine (colossalai.engine.Engine): Colossalai engine for training and inference.
:param input_tensor: input tensor for this pipeline stage input_tensor (:class:`torch.Tensor`): input tensor for this pipeline stage.
:type input_tensor: :class:`torch.Tensor` output_tensor (:class:`torch.Tensor`): output tensor for this pipeline stage.
:param output_tensor: output tensor for this pipeline stage output_tensor_grad (:class:`torch.Tensor`): gradient of output tensor for this pipeline stage.
:type output_tensor: :class:`torch.Tensor`
:param output_tensor_grad: gradient of output tensor for this pipeline stage
:type output_tensor_grad: :class:`torch.Tensor`
:return: gradient of input tensor Returns:
:rtype: :class:`torch.Tensor` :class:`torch.Tensor`: gradient of input tensor.
""" """
# Retain the grad on the input_tensor. # Retain the grad on the input_tensor.
@ -207,19 +198,16 @@ class PipelineSchedule(BaseSchedule):
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages. """Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
Returns a tuple with losses if the last stage, an empty tuple otherwise. Returns a tuple with losses if the last stage, an empty tuple otherwise.
:param engine: Your engine object Args:
:type engine: colossalai.engine.Engine engine (colossalai.engine.Engine): Colossalai engine for training and inference.
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader) data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
:type data_iter: Iterable forward_only (bool, optional):
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run. Whether to run the forward step only. Default is false. If true, no backward will be run.
:type forward_only: bool return_loss (bool, optional): Whether returns the loss value. Default is true.
:param return_loss: Whether returns the loss value. Default is true. return_output_label (bool, optional): If False, the output and label won't be returned.
:type return_loss: bool
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool
:return: (output, label, loss) Returns:
:rtype: Tuple[:class:`torch.Tensor`] Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
""" """
assert forward_only or return_loss, \ assert forward_only or return_loss, \
@ -354,16 +342,14 @@ class InterleavedPipelineSchedule(PipelineSchedule):
It uses interleaved 1F1B strategy. Other properties are similar as It uses interleaved 1F1B strategy. Other properties are similar as
:class:`NonPipelineSchedule`. :class:`NonPipelineSchedule`.
:param num_microbatches: The number of microbatches Args:
:type num_microbatches: int num_microbatches (int): The number of microbatches.
:param num_model_chunks: The number of model chunks num_model_chunks (int): The number of model chunks.
:type num_model_chunks: int batch_data_process_func (Callable, optional):
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch` The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
:type batch_data_process_func: Callable, optional tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
:param tensor_shape: Specified shape in pipeline communication scatter_gather_tensors (bool, optional):
:type tensor_shape: torch.Size, optional If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization
:type scatter_gather_tensors: bool, optional
""" """
assert num_microbatches % gpc.get_world_size(ParallelMode.PIPELINE) == 0, \ assert num_microbatches % gpc.get_world_size(ParallelMode.PIPELINE) == 0, \
'num_microbatches must be an integer multiple of pipeline parallel world size' 'num_microbatches must be an integer multiple of pipeline parallel world size'
@ -408,6 +394,16 @@ class InterleavedPipelineSchedule(PipelineSchedule):
"""Forward step for passed-in model. If it is the first stage, the input tensor """Forward step for passed-in model. If it is the first stage, the input tensor
is obtained from data_iterator, otherwise the passed-in input_tensor is used. is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users. Returns output tensor. This is a helper function and can be ignored by users.
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
model_chunk_id (int): The id of model chunks.
input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
return_output_label (bool, optional): Whether returns output labels.
accum_loss (optional): Where the accumulated loss is stored.
Returns:
:class:`torch.Tensor`: output or the loss value of the current pipeline stage.
""" """
data, label = self.load_micro_batch(model_chunk_id) data, label = self.load_micro_batch(model_chunk_id)
output_tensor = self._call_engine(engine.model[model_chunk_id], input_tensor, data) output_tensor = self._call_engine(engine.model[model_chunk_id], input_tensor, data)
@ -435,18 +431,17 @@ class InterleavedPipelineSchedule(PipelineSchedule):
"""Run interleaved 1F1B schedule (model split into model chunks), with """Run interleaved 1F1B schedule (model split into model chunks), with
communication between pipeline stages as needed. communication between pipeline stages as needed.
Returns dictionary with losses if the last stage, empty dict otherwise. Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether to run the forward step only. Default is false. If true, no backward will be run.
return_loss (bool, optional): Whether returns the loss value. Default is true.
return_output_label (bool, optional): If False, the output and label won't be returned.
:param engine: Your engine object Returns:
:type engine: colossalai.engine.Engine Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader) The loss would be returned only in the last stage.
:type data_iter: Iterable
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run.
:type forward_only: bool
:param return_loss: Whether returns the loss value. Default is true.
:type return_loss: bool
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool
""" """
assert forward_only or return_loss, \ assert forward_only or return_loss, \
'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.' 'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
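A hedged sketch of driving one batch through the non-interleaved pipeline schedule; the import path, ``engine`` and ``train_dataloader`` are assumed from the surrounding setup, and ``num_microbatches`` is illustrative:

>>> from colossalai.engine.schedule import PipelineSchedule
>>> schedule = PipelineSchedule(num_microbatches=4)
>>> engine.zero_grad()
>>> output, label, loss = schedule.forward_backward_step(engine, iter(train_dataloader),
...                                                      forward_only=False, return_loss=True)
>>> engine.step()   # the loss is only materialized on the last pipeline stage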


@ -37,8 +37,8 @@ def get_default_parser():
"""Reads user command line and uses an argument parser to parse the input arguments. """Reads user command line and uses an argument parser to parse the input arguments.
Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed. Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
:return: Returns the parser with the default arguments, the user may add customized arguments into this parser Returns:
:rtype: Namespace Namespace: Returns the parser with the default arguments; the user may add customized arguments into this parser.
""" """
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, help='path to the config file') parser.add_argument('--config', type=str, help='path to the config file')
@ -63,26 +63,21 @@ def launch(config: Union[str, Path, Config, Dict],
"""This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input """This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input
arguments are not given. Then initialize and set distributed environment by calling global_context's functions. arguments are not given. Then initialize and set distributed environment by calling global_context's functions.
:param config: Config file or config file path are both acceptable Args:
:type config: Union[str, dict, Config] config (Union[str, dict, Config]): Config file or config file path are both acceptable
:param rank: Rank for the default process group rank (int): Rank for the default process group
:type rank: int world_size (int): World size of the default process group
:param world_size: World size of the default process group host (str): The master address for distributed training
:type world_size: int port (str): The master port for distributed training
:param host: The master address for distributed training backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
:type host: str local_rank (int, optional):
:param port: The master port for distributed training Rank for the process on the node and is used to set the default CUDA device,
:type port: str defaults to None. If local_rank = None, the default device ordinal will be calculated automatically.
:param backend: Backend for torch.distributed seed (int, optional): Specified random seed for every process. Defaults to 1024.
:type backend: str, optional verbose (bool, optional): Whether to print logs. Defaults to True.
:param local_rank: Rank for the process on the node and is used to set the default CUDA device, defaults to None.
If local_rank = None, the default device ordinal will be calculated automatically Raises:
:type local_rank: int, optional Exception: Raise exception when config type is wrong
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
:raises Exception: Raise exception when config type is wrong
""" """
gpc.verbose = verbose gpc.verbose = verbose
@ -126,18 +121,13 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables """A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
set by SLURM set by SLURM
:param config: Config file or config file path are both acceptable Args:
:type config: Union[str, dict, Config] config (Union[str, dict, Config]): Config file or config file path are both acceptable
:param host: The master address for distributed training host (str): The master address for distributed training
:type host: str port (str): The master port for distributed training
:param port: The master port for distributed training backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
:type port: str seed (int, optional): Specified random seed for every process. Defaults to 1024.
:param backend: Backend for torch.distributed verbose (bool, optional): Whether to print logs. Defaults to True.
:type backend: str, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
""" """
rank = int(os.environ['SLURM_PROCID']) rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NPROCS']) world_size = int(os.environ['SLURM_NPROCS'])
@ -160,18 +150,13 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables """A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
set by OpenMPI set by OpenMPI
:param config: Config file or config file path are both acceptable Args:
:type config: Union[str, dict, Config] config (Union[str, dict, Config]): Config file or config file path are both acceptable
:param host: The master address for distributed training host (str): The master address for distributed training
:type host: str port (str): The master port for distributed training
:param port: The master port for distributed training backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
:type port: str seed (int, optional): Specified random seed for every process. Defaults to 1024.
:param backend: Backend for torch.distributed verbose (bool, optional): Whether to print logs. Defaults to True.
:type backend: str, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
""" """
rank = int(os.environ['OMPI_COMM_WORLD_RANK']) rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
@ -194,14 +179,11 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch from the environment variables set by PyTorch
:param config: Config file or config file path are both acceptable Args:
:type config: Union[str, dict, Config] config (Union[str, dict, Config]): Config file or config file path are both acceptable
:param backend: Backend for torch.distributed backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
:type backend: str, optional seed (int, optional): Specified random seed for every process. Defaults to 1024.
:param seed: Specified random seed for every processes verbose (bool, optional): Whether to print logs. Defaults to True.
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
""" """
rank = int(os.environ['RANK']) rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK']) local_rank = int(os.environ['LOCAL_RANK'])
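For instance, a hedged minimal launch under torchrun could look like this (the config path is a placeholder):

>>> import colossalai
>>> colossalai.launch_from_torch(config='./config.py', backend='nccl', seed=1024, verbose=True)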
@ -230,22 +212,20 @@ def initialize(model: nn.Module,
"""Core function to wrap the essential training components with our functionality based on the config which is """Core function to wrap the essential training components with our functionality based on the config which is
loaded into gpc.config. loaded into gpc.config.
:param model: Your model instance or a function to build the model Args:
:type model: :class:`torch.nn.Module` or Callbale model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
:param optimizer: Your optimizer instance optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
:type optimizer: :class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]` Your optimizer instance.
:param criterion: Your criterion instance criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
:type criterion: :class:`torch.nn.modules.loss._Loss`, optional train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
:param train_dataloader: Dataloader for training test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
:type train_dataloader: :class:`torch.utils.data.DataLoader`, optional lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional.
:param test_dataloader: Dataloader for testing verbose (bool, optional): Whether to print logs.
:type test_dataloader: :class:`torch.utils.data.DataLoader`, optional
:param lr_scheduler: Your lr scheduler instance, optional lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance.
:type lr_scheduler: :class:`torch.nn.lr_scheduler._LRScheduler`, optional Tuple (engine, train_dataloader, test_dataloader, lr_scheduler):
:param verbose: Whether to print logs A tuple of ``(engine, train_dataloader, test_dataloader, lr_scheduler)``
:type verbose: bool, optional where only ``engine`` is guaranteed not to be None.
:return: (engine, train_dataloader, test_dataloader, lr_scheduler)
:rtype: Tuple
""" """
# get logger # get logger
logger = get_dist_logger() logger = get_dist_logger()
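A hedged sketch of the typical call, assuming ``model``, ``optimizer``, ``criterion`` and the dataloaders have already been built and one of the ``colossalai.launch*`` functions has been called:

>>> engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(
...     model=model, optimizer=optimizer, criterion=criterion,
...     train_dataloader=train_dataloader, test_dataloader=test_dataloader)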


@ -10,6 +10,8 @@ def get_dist_logger(name='colossalai'):
"""Get logger instance based on name. The DistributedLogger will create singleton instances, """Get logger instance based on name. The DistributedLogger will create singleton instances,
which means that only one logger instance is created per name. which means that only one logger instance is created per name.
Args:
:param name: name of the logger, name must be unique name (str): The name of the logger, which must be unique.
:type name: str


@ -23,8 +23,13 @@ except ImportError:
class DistributedLogger: class DistributedLogger:
"""This is a distributed event logger class essentially based on :class:`logging`. """This is a distributed event logger class essentially based on :class:`logging`.
:param name: The name of the logger Args:
:type name: str name (str): The name of the logger.
Note:
The parallel_mode used in ``info``, ``warning``, ``debug`` and ``error``
should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
__instances = dict() __instances = dict()
@ -33,10 +38,10 @@ class DistributedLogger:
def get_instance(name: str): def get_instance(name: str):
"""Get the unique single logger instance based on name. """Get the unique single logger instance based on name.
:param name: The name of the logger Args:
:type name: str name (str): The name of the logger.
:return: A DistributedLogger object Returns:
:rtype: DistributedLogger DistributedLogger: A DistributedLogger object
""" """
if name in DistributedLogger.__instances: if name in DistributedLogger.__instances:
return DistributedLogger.__instances[name] return DistributedLogger.__instances[name]
@ -73,8 +78,8 @@ class DistributedLogger:
def set_level(self, level: str): def set_level(self, level: str):
"""Set the logging level """Set the logging level
:param level: Can only be INFO, DEBUG, WARNING and ERROR Args:
:type level: str level (str): Can only be INFO, DEBUG, WARNING and ERROR.
""" """
self._check_valid_logging_level(level) self._check_valid_logging_level(level)
self._logger.setLevel(getattr(logging, level)) self._logger.setLevel(getattr(logging, level))
@ -82,14 +87,11 @@ class DistributedLogger:
def log_to_file(self, path: Union[str, Path], mode: str = 'a', level: str = 'INFO', suffix: str = None): def log_to_file(self, path: Union[str, Path], mode: str = 'a', level: str = 'INFO', suffix: str = None):
"""Save the logs to file """Save the logs to file
:param path: The file to save the log Args:
:type path: A string or pathlib.Path object path (str or pathlib.Path): The file to save the log.
:param mode: The mode to write log into the file mode (str): The mode to write log into the file.
:type mode: str level (str): Can only be INFO, DEBUG, WARNING and ERROR.
:param level: Can only be INFO, DEBUG, WARNING and ERROR suffix (str): The suffix string of log's name.
:type level: str
:param suffix: The suffix string of log's name
:type suffix: str
""" """
assert isinstance(path, (str, Path)), \ assert isinstance(path, (str, Path)), \
f'expected argument path to be type str or Path, but got {type(path)}' f'expected argument path to be type str or Path, but got {type(path)}'
@ -131,12 +133,11 @@ class DistributedLogger:
def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log an info message. """Log an info message.
:param message: The message to be logged Args:
:type message: str message (str): The message to be logged.
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode` The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
:param ranks: List of parallel ranks ranks (List): List of parallel ranks.
:type ranks: list
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('info', message_prefix, parallel_mode, ranks) self._log('info', message_prefix, parallel_mode, ranks)
@ -145,12 +146,11 @@ class DistributedLogger:
def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log a warning message. """Log a warning message.
:param message: The message to be logged Args:
:type message: str message (str): The message to be logged.
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode` The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
:param ranks: List of parallel ranks ranks (List): List of parallel ranks.
:type ranks: list
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('warning', message_prefix, parallel_mode, ranks) self._log('warning', message_prefix, parallel_mode, ranks)
@ -159,12 +159,11 @@ class DistributedLogger:
def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log a debug message. """Log a debug message.
:param message: The message to be logged Args:
:type message: str message (str): The message to be logged.
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode` The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
:param ranks: List of parallel ranks ranks (List): List of parallel ranks.
:type ranks: list
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('debug', message_prefix, parallel_mode, ranks) self._log('debug', message_prefix, parallel_mode, ranks)
@ -173,12 +172,11 @@ class DistributedLogger:
def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log an error message. """Log an error message.
:param message: The message to be logged Args:
:type message: str message (str): The message to be logged.
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode` The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
:param ranks: List of parallel ranks ranks (List): List of parallel ranks.
:type ranks: list
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('error', message_prefix, parallel_mode, ranks) self._log('error', message_prefix, parallel_mode, ranks)
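A hedged sketch of typical logger usage; the log path is a placeholder:

>>> from colossalai.logging import get_dist_logger
>>> logger = get_dist_logger()
>>> logger.log_to_file('./logs', mode='a', level='INFO')
>>> logger.info('training started', ranks=[0])   # only global rank 0 emits this message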


@ -6,6 +6,7 @@ import torch.nn as nn
def zeros_(): def zeros_():
"""Return the initializer filling the input Tensor with the scalar zeros"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.zeros_(tensor) return nn.init.zeros_(tensor)
@ -13,6 +14,7 @@ def zeros_():
def ones_(): def ones_():
"""Return the initializer filling the input Tensor with the scalar ones"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.ones_(tensor) return nn.init.ones_(tensor)
@ -20,6 +22,14 @@ def ones_():
def uniform_(a: float = 0., b: float = 1.): def uniform_(a: float = 0., b: float = 1.):
r"""Return the initializer filling the input Tensor with values drawn from the uniform
distribution :math:`\mathcal{U}(a, b)`.
Args:
a (float): the lower bound of the uniform distribution. Defaults to 0.0.
b (float): the upper bound of the uniform distribution. Defaults to 1.0.
"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.uniform_(tensor, a, b) return nn.init.uniform_(tensor, a, b)
@ -27,6 +37,15 @@ def uniform_(a: float = 0., b: float = 1.):
def normal_(mean: float = 0., std: float = 1.): def normal_(mean: float = 0., std: float = 1.):
r"""Return the initializer filling the input Tensor with values drawn from the normal distribution
.. math::
\mathcal{N}(\text{mean}, \text{std}^2)
Args:
mean (float): the mean of the normal distribution. Defaults to 0.0.
std (float): the standard deviation of the normal distribution. Defaults to 1.0.
"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.normal_(tensor, mean, std) return nn.init.normal_(tensor, mean, std)
@ -34,6 +53,19 @@ def normal_(mean: float = 0., std: float = 1.):
def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float = 2.): def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float = 2.):
r"""Return the initializer filling the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \leq \text{mean} \leq b`.
Args:
mean (float): the mean of the normal distribution. Defaults to 0.0.
std (float): the standard deviation of the normal distribution. Defaults to 1.0.
a (float): the minimum cutoff value. Defaults to -2.0.
b (float): the maximum cutoff value. Defaults to 2.0.
"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.trunc_normal_(tensor, mean, std, a, b) return nn.init.trunc_normal_(tensor, mean, std, a, b)
@ -41,6 +73,26 @@ def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float =
def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'): def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification` - He, K. et al. (2015), using a
uniform distribution. The resulting tensor will have values sampled from
:math:`\mathcal{U}(-\text{bound}, \text{bound})` where
.. math::
\text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan_mode}}}
Also known as 'He initialization'.
Args:
a (int): the negative slope of the rectifier used after this layer (only used with ``'leaky_relu'``).
mode (str, optional): either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
preserves the magnitude of the variance of the weights in the
forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
backwards pass.
nonlinearity (str, optional): the non-linear function (`nn.functional` name),
recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).
"""
# adapted from torch.nn.init # adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
if 0 in tensor.shape: if 0 in tensor.shape:
@ -64,6 +116,26 @@ def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'): def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification` - He, K. et al. (2015), using a
normal distribution. The resulting tensor will have values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where
.. math::
\text{std} = \frac{\text{gain}}{\sqrt{\text{fan_mode}}}
Also known as 'He initialization'.
Args:
a (int): the negative slope of the rectifier used after this layer (only used with ``'leaky_relu'``).
mode (str, optional): either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
preserves the magnitude of the variance of the weights in the
forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
backwards pass.
nonlinearity (str, optional): the non-linear function (`nn.functional` name),
recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).
"""
# adapted from torch.nn.init # adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
if 0 in tensor.shape: if 0 in tensor.shape:
@ -86,6 +158,23 @@ def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1.): def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1.):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Understanding the difficulty of training deep feedforward
neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform
distribution. The resulting tensor will have values sampled from
:math:`\mathcal{U}(-a, a)` where
.. math::
a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}}
Also known as 'Glorot initialization'.
Args:
a (float, optional): an optional scaling factor used to calculate uniform
bounds from standard deviation. Defaults to ``math.sqrt(3.)``.
scale (float, optional): an optional scaling factor used to calculate standard deviation. Defaults to 2.0.
gain (float, optional): an optional scaling factor. Defaults to 1.0.
"""
# adapted from torch.nn.init # adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
assert fan_in is not None, 'Fan_in is not provided.' assert fan_in is not None, 'Fan_in is not provided.'
@ -102,6 +191,21 @@ def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1
def xavier_normal_(scale: float = 2., gain: float = 1.): def xavier_normal_(scale: float = 2., gain: float = 1.):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Understanding the difficulty of training deep feedforward
neural networks` - Glorot, X. & Bengio, Y. (2010), using a normal
distribution. The resulting tensor will have values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where
.. math::
\text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan_in} + \text{fan_out}}}
Also known as 'Glorot initialization'.
Args:
scale (float, optional): an optional scaling factor used to calculate standard deviation. Defaults to 2.0.
gain (float, optional): an optional scaling factor. Defaults to 1.0.
"""
# adapted from torch.nn.init # adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None): def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
assert fan_in is not None, 'Fan_in is not provided.' assert fan_in is not None, 'Fan_in is not provided.'
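A hedged sketch of the closure pattern these helpers return; the import path follows this file's location and the fan values are illustrative:

>>> import torch
>>> from colossalai.nn import init as col_init
>>> weight = torch.empty(64, 32)                 # (fan_out, fan_in) for a linear-style weight
>>> init_fn = col_init.kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu')
>>> init_fn(weight, fan_in=32, fan_out=64)       # fan sizes must be supplied by the caller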


@ -6,13 +6,11 @@ from ..utils import get_tensor_parallel_mode
class Dropout(nn.Module): class Dropout(nn.Module):
""" """Dropout layer of colossalai.
Dropout layer of colossalai
:param p: dropout rate, defaults to 0.5 Args:
:type p: float, optional p (float, optional): probability of an element to be zeroed, defaults to 0.5.
:param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False`` inplace (bool, optional): whether to do dropout in-place, defaults to False.
:type inplace: bool, optional
""" """
def __init__(self, p: float = 0.5, inplace: bool = False) -> None: def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
super().__init__() super().__init__()
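A hedged usage sketch; the layer appears to dispatch to a parallel-aware dropout based on the tensor parallel mode (per the import above), so an initialized Colossal-AI context is assumed:

>>> import torch
>>> from colossalai import nn as col_nn
>>> drop = col_nn.Dropout(p=0.1)
>>> x = torch.randn(8, 256)   # placeholder activation
>>> y = drop(x)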


@ -35,21 +35,33 @@ _parallel_patchembedding = {
class Embedding(nn.Module): class Embedding(nn.Module):
""" r"""Embedding for colossalai.
Embedding for colossalai
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` can be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
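Because the extra ``args``/``kwargs`` of the layer above are forwarded to ``torch.nn.functional.embedding``, a minimal plain-PyTorch sketch (not the Colossal-AI layer itself) of what ``padding_idx`` and ``max_norm`` do:

import torch
import torch.nn.functional as F

weight = torch.randn(10, 4)                    # 10 embeddings of dimension 4
tokens = torch.tensor([[1, 2, 0], [3, 0, 0]])  # index 0 plays the role of padding here

# max_norm renormalizes any looked-up vector with norm > 1.0 (modifying weight in place);
# padding_idx keeps the gradient of row 0 at zero during training.
out = F.embedding(tokens, weight, padding_idx=0, max_norm=1.0)
print(out.shape)  # torch.Size([2, 3, 4])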
@ -97,27 +109,24 @@ class Embedding(nn.Module):
class PatchEmbedding(nn.Module): class PatchEmbedding(nn.Module):
""" """2D Image to Patch Embedding.
2D Image to Patch Embedding
:param img_size: image size Args:
:type img_size: int img_size (int): image size.
:param patch_size: patch size patch_size (int): patch size.
:type patch_size: int in_chans (int): number of channels of input image.
:param in_chans: number of channels of input image embed_size (int): size of embedding.
:type in_chans: int dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param embed_size: size of embedding flatten (bool, optional): whether to flatten output tensor, defaults to True.
:type embed_size: int weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param flatten: whether to flatten output tensor, defaults to True The initializer of bias, defaults to xavier uniform initializer.
:type flatten: bool, optional position_embed_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of position embedding, defaults to zeros initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer For more details about ``initializer``, please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
""" """
def __init__( def __init__(
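For orientation, a hedged, non-parallel sketch of what a 2D image-to-patch embedding computes (plain PyTorch; the real ``PatchEmbedding`` additionally handles the position embedding and the initializers listed above):

import torch
import torch.nn as nn

img_size, patch_size, in_chans, embed_size = 224, 16, 3, 768
proj = nn.Conv2d(in_chans, embed_size, kernel_size=patch_size, stride=patch_size)

x = torch.randn(2, in_chans, img_size, img_size)
patches = proj(x)                             # (2, embed_size, 14, 14): one vector per 16x16 tile
tokens = patches.flatten(2).transpose(1, 2)   # flatten=True -> (2, 196, embed_size)
print(tokens.shape)                           # torch.Size([2, 196, 768])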

View File

@ -31,22 +31,35 @@ _vocab_parallel_classifier = {
class Linear(nn.Module): class Linear(nn.Module):
""" """Linear layer of colossalai.
Linear layer of colossalai
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of bias, defaults to xavier uniform initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer Note: ``kwargs`` contains different parameters depending on which parallelism you use.
:type bias_initializer: typing.Callable, optional
:param kwargs: Kwargs used for particular parallelisms The ``kwargs`` should contain parameters below:
::
Linear1D:
gather_output: bool (optional, defaults to False)
skip_bias_add: bool (optional, defaults to False)
Linear2D:
skip_bias_add: bool (optional, defaults to False)
Linear2p5D:
skip_bias_add: bool (optional, defaults to False)
Linear3D:
None
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
@ -88,21 +101,21 @@ class Linear(nn.Module):
class Classifier(nn.Module): class Classifier(nn.Module):
""" """Classifier layer of colossalai.
Classifier layer of colossalai
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of total classes for the dataset num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type bias: bool, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param dtype: The dtype of parameters, defaults to None weight_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of weight, defaults to kaiming uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer bias_initializer (:class:`typing.Callable`, optional):
:type weight_initializer: typing.Callable, optional The initializer of bias, defaults to xavier uniform initializer.
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,

View File

@ -19,18 +19,15 @@ _parallel_layernorm = {
class LayerNorm(nn.Module): class LayerNorm(nn.Module):
r""" r"""Layer Normalization for colossalai.
Layer Normalization for colossalai
:param normalized_shape: input shape from an expected input Args:
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` normalized_shape (int): input shape from an expected input of size.
If a single integer is used, it is treated as a singleton list, and this module will :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
normalize over the last dimension which is expected to be of that specific size. If a single integer is used, it is treated as a singleton list, and this module will
:type normalized_shape: int normalize over the last dimension which is expected to be of that specific size.
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05 eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05
:type eps: float, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
""" """
def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None: def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None:

View File

@ -28,11 +28,10 @@ class Experts(MoeExperts):
moe model parallel group, where E is the number of experts. Every expert moe model parallel group, where E is the number of experts. Every expert
is an instance of the class 'expert' given in the initialization parameters. is an instance of the class 'expert' given in the initialization parameters.
:param expert: The class of all experts Args:
:param num_experts: The number of experts expert_cls (:class:`torch.nn.Module`): The class of all experts.
:param expert_args: Args used to initialize experts num_experts (int): The number of experts.
expert_args: Args used to initialize experts; the args can be found in the corresponding expert class.
:type num_experts: int
""" """
def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args): def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args):

View File

@ -18,19 +18,13 @@ class Top1Router(nn.Module):
for routing usage. A more detailed description can be found in the paper about Switch Transformer for routing usage. A more detailed description can be found in the paper about Switch Transformer
from Google. from Google.
:param capacity_factor_train: Capacity factor in routing during training Args:
:param capacity_factor_eval: Capacity factor in routing during evaluation capacity_factor_train (float, optional): Capacity factor used in routing during training.
:param min_capacity: The minimum number of the capacity of each expert capacity_factor_eval (float, optional): Capacity factor used in routing during evaluation.
:param select_policy: The policy about tokens selection min_capacity (int, optional): The minimum capacity of each expert.
:param noisy_func: Noisy function used in logits select_policy (str, optional): The policy for token selection.
:param drop_tks: Whether drops tokens in evaluation noisy_func (:class:`typing.Callable`, optional): Noise function applied to the logits.
drop_tks (bool, optional): Whether to drop tokens during evaluation.
:type capacity_factor_train: float, optional
:type capacity_factor_eval: float, optional
:type min_capacity: int, optional
:type select_policy: str, optional
:type noisy_func: Callable, optional
:type drop_tks: bool, optional
""" """
def __init__(self, def __init__(self,
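As a hedged illustration of how ``capacity_factor_train``/``capacity_factor_eval`` and ``min_capacity`` above typically interact in Switch-style routing (the library's exact rounding may differ; ``expert_capacity`` is a hypothetical helper):

import math

def expert_capacity(num_tokens: int, num_experts: int,
                    capacity_factor: float, min_capacity: int) -> int:
    # each expert nominally gets num_tokens / num_experts tokens; the factor adds head-room,
    # and min_capacity puts a floor under very small batches
    capacity = math.ceil(capacity_factor * num_tokens / num_experts)
    return max(capacity, min_capacity)

print(expert_capacity(num_tokens=4096, num_experts=8, capacity_factor=1.25, min_capacity=4))  # 640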
@ -119,17 +113,12 @@ class Top2Router(nn.Module):
"""Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c] """Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
for routing usage. A more detailed description can be found in the paper about ViT-MoE. for routing usage. A more detailed description can be found in the paper about ViT-MoE.
:param capacity_factor_train: Capacity factor in routing during training Args:
:param capacity_factor_eval: Capacity factor in routing during evaluation capacity_factor_train (float, optional): Capacity factor used in routing during training.
:param min_capacity: The minimum number of the capacity of each expert capacity_factor_eval (float, optional): Capacity factor used in routing during evaluation.
:param noisy_func: Noisy function used in logits min_capacity (int, optional): The minimum capacity of each expert.
:param drop_tks: Whether drops tokens in evaluation noisy_func (:class:`typing.Callable`, optional): Noise function applied to the logits.
drop_tks (bool, optional): Whether to drop tokens during evaluation.
:type capacity_factor_train: float, optional
:type capacity_factor_eval: float, optional
:type min_capacity: int, optional
:type noisy_func: Callable, optional
:type drop_tks: bool, optional
""" """
def __init__(self, def __init__(self,
@ -239,15 +228,11 @@ class MoeLayer(nn.Module):
the moe tensor group by all-to-all communication. Then it will get the output of all the moe tensor group by all-to-all communication. Then it will get the output of all
experts and exchange the output. Finally, it returns the output of the moe system. experts and exchange the output. Finally, it returns the output of the moe system.
:param dim_model: Dimension of model Args:
:param num_experts: The number of experts dim_model (int): Dimension of model.
:param router: Instance of router used in routing num_experts (int): The number of experts.
:param experts: Instance of experts generated by Expert router (:class:`torch.nn.Module`): Instance of router used in routing.
experts (:class:`torch.nn.Module`): Instance of experts generated by Expert.
:type dim_model: int
:type num_experts: int
:type router: nn.Module
:type experts: nn.Module
""" """
def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts): def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts):

View File

@ -16,8 +16,8 @@ class NormalNoiseGenerator:
All noise is generated from a normal distribution (0, 1 / E^2), where All noise is generated from a normal distribution (0, 1 / E^2), where
E = the number of experts. E = the number of experts.
:param num_experts: The number of experts Args:
:type num_experts: int num_experts (int): The number of experts.
""" """
def __init__(self, num_experts: int): def __init__(self, num_experts: int):
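A hedged sketch of the noise the docstring above describes, i.e. samples from N(0, 1/E^2) added to the routing logits (``add_normal_noise`` is a hypothetical stand-in, not the generator class itself):

import torch

def add_normal_noise(logits: torch.Tensor, num_experts: int) -> torch.Tensor:
    # N(0, 1/E^2) noise, i.e. standard deviation 1 / num_experts
    return logits + torch.randn_like(logits) / num_experts

logits = torch.zeros(4, 8)   # 4 tokens routed over 8 experts
noisy = add_normal_noise(logits, num_experts=8)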
@ -37,8 +37,8 @@ class UniformNoiseGenerator:
Makes models more resilient to rounding errors introduced by bfloat16. Makes models more resilient to rounding errors introduced by bfloat16.
This seems particularly important for logits. This seems particularly important for logits.
:param eps: Epsilon in generator Args:
:type eps: float eps (float, optional): Epsilon in generator, defaults 1e-2.
""" """
def __init__(self, eps: float = 1e-2): def __init__(self, eps: float = 1e-2):

View File

@ -7,17 +7,17 @@ except:
class FusedLayerNormAffineFunction1D(torch.autograd.Function): class FusedLayerNormAffineFunction1D(torch.autograd.Function):
r""" r"""Layernorm
Layernorm
:param input: input maxtrix Args:
:param weight: weight matrix input: input matrix.
:param bias: bias matrix weight: weight matrix.
:param normalized_shape: input shape from an expected input bias: bias matrix.
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` normalized_shape: input shape from an expected input of size.
If a single integer is used, it is treated as a singleton list, and this module will :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
normalize over the last dimension which is expected to be of that specific size. If a single integer is used, it is treated as a singleton list, and this module will
:param eps: a value added to the denominator for numerical stability normalize over the last dimension which is expected to be of that specific size.
eps: a value added to the denominator for numerical stability
""" """
@staticmethod @staticmethod

View File

@ -78,8 +78,9 @@ class _ReduceGrad(torch.autograd.Function):
""" """
Pass the input to the model parallel region. Pass the input to the model parallel region.
:param input_: input matrix Args:
:param parallel_mode: parallel mode input_: input matrix.
parallel_mode: parallel mode.
""" """
@staticmethod @staticmethod
@ -100,8 +101,9 @@ class _ReduceInput(torch.autograd.Function):
""" """
All-reduce the input from the model parallel region. All-reduce the input from the model parallel region.
:param input_: input matrix Args:
:param parallel_mode: parallel mode input_: input matrix.
parallel_mode: parallel mode.
""" """
@staticmethod @staticmethod
@ -121,9 +123,10 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
""" """
Split the input and keep only the chunk corresponding to the rank. Split the input and keep only the chunk corresponding to the rank.
:param input_: input matrix Args:
:param parallel_mode: parallel mode input_: input matrix.
:param dim: dimension parallel_mode: parallel mode.
dim: dimension
""" """
@staticmethod @staticmethod
@ -142,12 +145,12 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
class _GatherForwardSplitBackward(torch.autograd.Function): class _GatherForwardSplitBackward(torch.autograd.Function):
""" """Gather the input from model parallel region and concatenate.
Gather the input from model parallel region and concatinate.
:param input_: input matrix Args:
:param parallel_mode: parallel mode input_: input matrix.
:param dim: dimension parallel_mode: parallel mode.
dim: dimension
""" """
@staticmethod @staticmethod
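A hedged, single-process sketch of the split/gather pattern these autograd functions apply across a tensor-parallel group (real ranks each hold one shard and communicate; here the "ranks" are just slices of one tensor):

import torch

world_size, dim = 4, -1
x = torch.arange(8.0).reshape(2, 4)

shards = torch.chunk(x, world_size, dim=dim)   # "split forward": each rank keeps one chunk
restored = torch.cat(shards, dim=dim)          # "gather forward": concatenate along the same dim
assert torch.equal(x, restored)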

View File

@ -24,24 +24,23 @@ from ._utils import (gather_forward_split_backward, get_parallel_input, reduce_g
@LAYERS.register_module @LAYERS.register_module
class Linear1D(torch.nn.Module): class Linear1D(torch.nn.Module):
""" r"""Linear layer for 1D parallelism.
Linear layer for 1D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional gather_output (bool, optional): Whether to call all-gather on output, defaults to False.
:param dtype: The dtype of parameters, defaults to None skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
:type dtype: torch.dtype, optional which is preserved for kernel fusion, defaults to False
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, weight_initializer (:class:`typing.Callable`, optional):
which is preserved for kernel fusion, defaults to False The initializer of weight, defaults to kaiming uniform initializer.
:type skip_bias_add: bool, optional bias_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of bias, defaults to xavier uniform initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer For more details about ``initializer``, please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
@ -88,23 +87,21 @@ class Linear1D(torch.nn.Module):
@LAYERS.register_module @LAYERS.register_module
class Classifier1D(ParallelLayer): class Classifier1D(ParallelLayer):
"""RowLinear with given weight r"""RowLinear with given weight. Classifier of 1D parallelism.
Classifier of 1D parallelism
:param in_features: size of input features Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes in the dataset num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -171,23 +168,21 @@ class Classifier1D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelClassifier1D(ParallelLayer): class VocabParallelClassifier1D(ParallelLayer):
"""ColLinear with given weight r"""ColLinear with given weight. Classifier of 1D parallelism.
Classifier of 1D parallelism
:param in_features: size of input features Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes in the dataset num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional For more details about ``initializer``, please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -249,30 +244,28 @@ class VocabParallelClassifier1D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Linear1D_Col(ParallelLayer): class Linear1D_Col(ParallelLayer):
"""Linear layer with column parallelism. r"""Linear layer with column parallelism.
The linear layer is defined as :math:`Y = XA + b`. A is parallelized along The linear layer is defined as :math:`Y = XA + b`. A is parallelized along
its second dimension as :math:`A = [A_1, ..., A_p]`. its second dimension as :math:`A = [A_1, ..., A_p]`.
:param in_features: first dimension of matrix A. Args:
:type in_features: int in_features (int): size of each input sample.
:param output_size: second dimension of matrix A. out_features (int): size of each output sample.
:type output_size: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional gather_output (bool, optional): If true, call all-gather on output and make Y available
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param gather_output: If true, call all-gether on output and make Y avaiable
to all GPUs, otherwise, every GPU will have its output to all GPUs, otherwise, every GPU will have its output
which is :math:`Y_i = XA_i`, defaults to False which is :math:`Y_i = XA_i`, defaults to False
:type gather_output: bool, optional skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False.
which is preserved for kernel fusion, defaults to False weight_initializer (:class:`typing.Callable`, optional):
:type skip_bias_add: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer bias_initializer (:class:`typing.Callable`, optional):
:type weight_initializer: typing.Callable, optional The initializer of bias, defaults to xavier uniform initializer.
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
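A hedged, single-process sketch of the column parallelism described above (no distributed setup; the chunks of A stand in for the per-rank shards A_1, ..., A_p):

import torch

X = torch.randn(2, 8)      # (batch, in_features)
A = torch.randn(8, 16)     # (in_features, out_features), split along its second dimension
world_size = 4

partials = [X @ A_i for A_i in torch.chunk(A, world_size, dim=1)]   # each rank computes Y_i = X @ A_i
Y = torch.cat(partials, dim=-1)                                      # what gather_output=True would do
assert torch.allclose(Y, X @ A, atol=1e-5)

With gather_output=False, each rank simply keeps its own partial output Y_i.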
@ -343,25 +336,23 @@ class Linear1D_Col(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Linear1D_Row(ParallelLayer): class Linear1D_Row(ParallelLayer):
""" Linear layer with row parallelism r""" Linear layer with row parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
:param dtype: The dtype of parameters, defaults to None skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
:type dtype: torch.dtype, optional which is preserved for kernel fusion, defaults to False.
:param parallel_input: If set to ``True``, it's assumed that the input is splitted, defaults to False weight_initializer (:class:`typing.Callable`, optional):
:type parallel_input: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, bias_initializer (:class:`typing.Callable`, optional):
which is preserved for kernel fusion, defaults to False The initializer of bias, defaults to xavier uniform initializer.
:type skip_bias_add: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer For more details about ``initializer``, please refer to
:type weight_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
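The row-parallel case is the mirror image: a hedged, single-process sketch in which A is split along its rows, the input along its last dimension, and the partial products are summed (an all-reduce in the real distributed layer):

import torch

X = torch.randn(2, 8)
A = torch.randn(8, 16)
world_size = 4

X_shards = torch.chunk(X, world_size, dim=-1)   # parallel_input: the input arrives already split
A_shards = torch.chunk(A, world_size, dim=0)
Y = sum(x_i @ a_i for x_i, a_i in zip(X_shards, A_shards))   # all-reduce(sum) of the partial products
assert torch.allclose(Y, X @ A, atol=1e-5)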
@ -432,21 +423,33 @@ class Linear1D_Row(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Embedding1D(ParallelLayer): class Embedding1D(ParallelLayer):
""" r"""Embedding for 1D parallelism.
Embedding for 1D parallelism
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` can be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
:param args: Args used in F.embedding For more details about ``initializer``, please refer to
:param kwargs: Kwargs used in F.embedding `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
@ -499,20 +502,33 @@ class Embedding1D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelEmbedding1D(torch.nn.Module): class VocabParallelEmbedding1D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension. r"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` can be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
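A hedged, single-process sketch of the vocabulary-parallel idea behind the class above: each rank owns a contiguous slice of the vocabulary, masks ids outside its slice, and the partial lookups are summed (an all-reduce in the real layer):

import torch
import torch.nn.functional as F

vocab_size, dim, world_size = 16, 4, 2
weight = torch.randn(vocab_size, dim)
tokens = torch.tensor([[1, 9, 3]])

out = torch.zeros(1, 3, dim)
per_rank = vocab_size // world_size
for rank in range(world_size):
    start, end = rank * per_rank, (rank + 1) * per_rank
    mask = (tokens < start) | (tokens >= end)          # ids this rank does not own
    local_ids = (tokens - start).masked_fill(mask, 0)
    partial = F.embedding(local_ids, weight[start:end])
    partial = partial.masked_fill(mask.unsqueeze(-1), 0.0)
    out = out + partial                                # all-reduce(sum) across ranks

assert torch.allclose(out, F.embedding(tokens, weight))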
@ -578,13 +594,11 @@ class VocabParallelEmbedding1D(torch.nn.Module):
@LAYERS.register_module @LAYERS.register_module
class Dropout1D(ParallelLayer): class Dropout1D(ParallelLayer):
""" """Dropout layer of 1D parallelism.
Dropout layer of 1D parallelism
:param p: dropout rate, defaults to 0.5 Args:
:type p: float, optional p (float, optional): probability of an element to be zeroed, defaults 0.5.
:param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False`` inplace (bool, optional): whether to do dropout in-place, default to be False.
:type inplace: bool, optional
""" """
def __init__(self, p: float = 0.5, inplace: bool = False): def __init__(self, p: float = 0.5, inplace: bool = False):

View File

@ -21,27 +21,26 @@ def matmul_2d(
row_parallel_mode=ParallelMode.PARALLEL_2D_ROW, row_parallel_mode=ParallelMode.PARALLEL_2D_ROW,
col_parallel_mode=ParallelMode.PARALLEL_2D_COL, col_parallel_mode=ParallelMode.PARALLEL_2D_COL,
): ):
""" r"""Matrix multiplication for 2D parallelism.
Matrix multiplication for 2D parallelism
:param a: matrix :math:`A` Args:
:type a: torch.tensor a (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` b (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor summa_dim (int): dimension of SUMMA for 2D parallelism.
:param summa_dim: dimension of SUMMA fo 2D parallelism out_shape (:class:`torch.Size`): shape of output tensor.
:type summa_dim: int row_rank (int, optional): the rank of row, defaults to None.
:param out_shape: shape of output tensor col_rank (int, optional): the rank of column, defaults to None.
:type out_shape: tuple row_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
:param row_rank: the rank of row, defaults to None row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW.
:type row_rank: int, optional col_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
:param col_rank: the rank of column, defaults to None column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL.
:type col_rank: int, optional
:param row_parallel_mode: row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW Returns:
:type row_parallel_mode: str, optional :class:`torch.tensor`: :math:`C = AB`.
:param col_parallel_mode: column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL
:type col_parallel_mode: str, optional Note:
:return: :math:`C = AB` The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:rtype: torch.tensor in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
if row_rank is None: if row_rank is None:
row_rank = gpc.get_local_rank(col_parallel_mode) row_rank = gpc.get_local_rank(col_parallel_mode)
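The docstring above describes the SUMMA-style 2D layout; a hedged, single-process sketch of the block decomposition it relies on (the q x q blocks stand in for what each rank of the 2D grid holds):

import torch

q = 2                                       # summa_dim
A, B = torch.randn(4, 6), torch.randn(6, 8)

A_blk = [list(row.chunk(q, dim=1)) for row in A.chunk(q, dim=0)]
B_blk = [list(row.chunk(q, dim=1)) for row in B.chunk(q, dim=0)]

# C[i][j] = sum_k A[i][k] @ B[k][j], accumulated over q broadcast steps in the real kernel
C_blk = [[sum(A_blk[i][k] @ B_blk[k][j] for k in range(q)) for j in range(q)] for i in range(q)]
C = torch.cat([torch.cat(row, dim=1) for row in C_blk], dim=0)
assert torch.allclose(C, A @ B, atol=1e-5)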
@ -135,35 +134,26 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
row_rank: int, col_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, row_rank: int, col_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode,
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int, data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
tensor_parallel_size: int) -> Tensor: tensor_parallel_size: int) -> Tensor:
""" r"""2D parallel classifier.
2D parallel classifier
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor bias (:class:`torch.tensor`, optional): matrix of bias.
:param bias: matrix of bias summa_dim (int): dimension of SUMMA fo 2D parallelism.
:type bias: torch.tensor, optional out_shape (:class:`torch.size`): shape of output tensor.
:param summa_dim: dimension of SUMMA fo 2D parallelism row_rank (int, optional): the rank of row, defaults to None.
:type summa_dim: int col_rank (int, optional): the rank of column, defaults to None.
:param out_shape: shape of output tensor row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type out_shape: tuple col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param row_rank: the rank of row data_parallel_rank (int): data parallel rank.
:type row_rank: int pipeline_parallel_rank (int): pipeline parallel rank
:param col_rank: the rank of column pipeline_parallel_size (int): pipeline parallel size.
:type col_rank: int tensor_parallel_size (int): tensor parallel size.
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
return _Classifier2D.apply(A, B, bias, summa_dim, out_shape, row_rank, col_rank, row_parallel_mode, return _Classifier2D.apply(A, B, bias, summa_dim, out_shape, row_rank, col_rank, row_parallel_mode,
col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size, col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
@ -171,33 +161,25 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
class Matmul_AB_2D(torch.autograd.Function): class Matmul_AB_2D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = AB`.
Matrix multiplication for :math:`C = AB`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor summa_dim (int): dimension of SUMMA for 2D parallelism.
:param summa_dim: dimension of SUMMA fo 2D parallelism out_shape (:class:`torch.Size`): shape of output tensor.
:type summa_dim: int row_rank (int, optional): the rank of row, defaults to None.
:param out_shape: shape of output tensor col_rank (int, optional): the rank of column, defaults to None.
:type out_shape: tuple row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param row_rank: the rank of row col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type row_rank: int data_parallel_rank (int): data parallel rank.
:param col_rank: the rank of column pipeline_parallel_rank (int): pipeline parallel rank
:type col_rank: int pipeline_parallel_size (int): pipeline parallel size.
:param row_parallel_mode: row parallel mode tensor_parallel_size (int): tensor parallel size.
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode Note:
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:param data_parallel_rank: data parallel rank in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
@custom_fwd(cast_inputs=torch.float16) @custom_fwd(cast_inputs=torch.float16)
@ -305,33 +287,26 @@ class Matmul_AB_2D(torch.autograd.Function):
class Matmul_ABT_2D(torch.autograd.Function): class Matmul_ABT_2D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = AB^T`
Matrix multiplication for :math:`C = AB^T`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor summa_dim (int): dimension of SUMMA for 2D parallelism.
:param summa_dim: dimension of SUMMA fo 2D parallelism out_shape (:class:`torch.Size`): shape of output tensor.
:type summa_dim: int row_rank (int, optional): the rank of row, defaults to None.
:param out_shape: shape of output tensor col_rank (int, optional): the rank of column, defaults to None.
:type out_shape: tuple row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param row_rank: the rank of row col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type row_rank: int
:param col_rank: the rank of column data_parallel_rank (int): data parallel rank.
:type col_rank: int pipeline_parallel_rank (int): pipeline parallel rank
:param row_parallel_mode: row parallel mode pipeline_parallel_size (int): pipeline parallel size.
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode tensor_parallel_size (int): tensor parallel size.
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param data_parallel_rank: data parallel rank The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:type data_parallel_rank: int in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
@custom_fwd(cast_inputs=torch.float16) @custom_fwd(cast_inputs=torch.float16)
@ -445,33 +420,25 @@ class Matmul_ABT_2D(torch.autograd.Function):
class Matmul_ATB_2D(torch.autograd.Function): class Matmul_ATB_2D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = A^TB`.
Matrix multiplication for :math:`C = A^TB`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor summa_dim (int): dimension of SUMMA for 2D parallelism.
:param summa_dim: dimension of SUMMA fo 2D parallelism out_shape (:class:`torch.Size`): shape of output tensor.
:type summa_dim: int row_rank (int, optional): the rank of row, defaults to None.
:param out_shape: shape of output tensor col_rank (int, optional): the rank of column, defaults to None.
:type out_shape: tuple row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param row_rank: the rank of row col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type row_rank: int data_parallel_rank (int): data parallel rank.
:param col_rank: the rank of column pipeline_parallel_rank (int): pipeline parallel rank
:type col_rank: int pipeline_parallel_size (int): pipeline parallel size.
:param row_parallel_mode: row parallel mode tensor_parallel_size (int): tensor parallel size.
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode Note:
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:param data_parallel_rank: data parallel rank in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
@custom_fwd(cast_inputs=torch.float16) @custom_fwd(cast_inputs=torch.float16)
@ -639,33 +606,26 @@ def add_bias_2d(input_: Tensor, bias: Tensor, output_size_per_partition: int, ro
row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, skip_bias_add: bool, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, skip_bias_add: bool,
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int, data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
tensor_parallel_size: int) -> Tensor: tensor_parallel_size: int) -> Tensor:
""" r"""Matrix add bias: :math:`C = A + b`.
Matrix add bias: :math:`C = A + b`
:param input_: matrix :math:`A` Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): matrix :math:`A`.
:param bias: matrix :math:`b` bias (:class:`torch.tensor`): matrix of bias :math:`b`.
:type bias: torch.tensor output_size_per_partition (int): size of output per partition.
:param output_size_per_partition: size of ouput per partition row_rank (int, optional): the rank of row, defaults to None.
:type output_size_per_partition: int col_rank (int, optional): the rank of column, defaults to None.
:param row_rank: the rank of row row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type row_rank: int col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param col_rank: the rank of column skip_bias_add (bool):
:type col_rank: int If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion.
:param row_parallel_mode: row parallel mode data_parallel_rank (int): data parallel rank.
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode pipeline_parallel_rank (int): pipeline parallel rank
:param col_parallel_mode: column parallel mode pipeline_parallel_size (int): pipeline parallel size.
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode tensor_parallel_size (int): tensor parallel size.
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion
:type skip_bias_add: bool Note:
:param data_parallel_rank: data parallel rank The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
:type data_parallel_rank: int in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
return _Add_Bias_2D.apply(input_, bias, output_size_per_partition, row_rank, col_rank, row_parallel_mode, return _Add_Bias_2D.apply(input_, bias, output_size_per_partition, row_rank, col_rank, row_parallel_mode,
col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank, col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
@ -711,21 +671,19 @@ class _Layernorm_2D(torch.autograd.Function):
def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode, def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode,
col_parallel_mode: ParallelMode) -> Tensor: col_parallel_mode: ParallelMode) -> Tensor:
""" r"""Layernorm.
Layernorm
:param input_: input maxtrix Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param E_x: mean E_x (:class:`torch.tensor`): mean.
:type E_x: torch.tensor Var_x (:class:`torch.tensor`): variance.
:param Var_x: variance hidden_size (int): hidden size.
:type Var_x: torch.tensor row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param hidden_size: hidden size col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type hidden_size: int
:param row_parallel_mode: row parallel mode Note:
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
:param col_parallel_mode: column parallel mode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
""" """
return _Layernorm_2D.apply(input_, E_x, Var_x, hidden_size, row_parallel_mode, col_parallel_mode) return _Layernorm_2D.apply(input_, E_x, Var_x, hidden_size, row_parallel_mode, col_parallel_mode)
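For orientation, a hedged sketch of the plain normalization that the mean/variance arguments above refer to (the 2D kernel distributes this computation across the row/column groups and may fold eps into Var_x differently):

import torch
import torch.nn.functional as F

x = torch.randn(2, 8)
E_x = x.mean(dim=-1, keepdim=True)
Var_x = x.var(dim=-1, unbiased=False, keepdim=True)
y = (x - E_x) / torch.sqrt(Var_x + 1e-5)

assert torch.allclose(y, F.layer_norm(x, (8,), eps=1e-5), atol=1e-6)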
@ -748,27 +706,29 @@ class _AllGatherTensor2D(torch.autograd.Function):
def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
""" r"""All gather the tensor of 2D parallelism.
All gather the tensor of 2D parallelism
:param inputs: input maxtrix Args:
:type inputs: torch.tensor tensor (:class:`torch.tensor`): Input tensor.
:param dim: dimension to gather dim (int): Dimension to gather.
:type dim: int parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode in which the tensor is used.
:param parallel_mode: parallel mode
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _AllGatherTensor2D.apply(tensor, dim, parallel_mode) return _AllGatherTensor2D.apply(tensor, dim, parallel_mode)
def split_tensor_2d(input_: Tensor, dim: int = 0) -> Tensor: def split_tensor_2d(input_: Tensor, dim: int = 0) -> Tensor:
"""Splits 2D tensor in specified dimension across cols """Splits 2D tensor in specified dimension across cols.
:param input_: Input tensor
:param dim: Specified dimension in which to split Args:
:type input_: torch.Tensor input_ (:class:`torch.tensor`): Input tensor.
:type dim: int, optional dim (int): Specified dimension in which to split.
:return output: Splitted tensor
:rtype output: torch.Tensor Returns:
:class:`torch.tensor`: The split tensor.
""" """
if input_.size(dim) <= 1: if input_.size(dim) <= 1:
return input_ return input_
@ -787,11 +747,15 @@ class _ReduceTensor2D(torch.autograd.Function):
def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor: def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
""" r"""All-reduce the input.
All-reduce the input.
:param input_: input tensor Args:
:param parallel_mode: parallel mode input_ (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode in which the tensor is used.
Note:
The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _ReduceTensor2D.apply(input_, parallel_mode) return _ReduceTensor2D.apply(input_, parallel_mode)
@ -809,12 +773,16 @@ class _ReduceScatterTensor2D(torch.autograd.Function):
def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
""" r"""Reduce-scatter the input.
Reduce-scatter the input.
:param tensor: Input tensor Args:
:param dim: Dimension to scatter tensor (:class:`torch.tensor`): Input tensor.
:param parallel_mode: Parallel mode dim (int): Dimension to reduce.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode in which the tensor is used.
Note:
The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _ReduceScatterTensor2D.apply(tensor, dim, parallel_mode) return _ReduceScatterTensor2D.apply(tensor, dim, parallel_mode)
@ -849,11 +817,11 @@ class _ReduceByBatch2D(torch.autograd.Function):
def reduce_by_batch_2d(input_, reduce_mean: bool = False) -> Tensor: def reduce_by_batch_2d(input_, reduce_mean: bool = False) -> Tensor:
"""All-reduce the input from the model parallel region. r"""All-reduce the input from the model parallel region.
:param input_: input maxtrix Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param reduce_mean: If set to ``True``, it will divide the output by column parallel size, default to False reduce_mean (bool, optional):
:type reduce_mean: bool, optional If set to ``True``, it will divide the output by the column parallel size, defaults to False.
""" """
return _ReduceByBatch2D.apply(input_, reduce_mean) return _ReduceByBatch2D.apply(input_, reduce_mean)

View File

@ -22,23 +22,22 @@ from ._utils import assert_summa_initialization, get_summa_dim_from_env
@LAYERS.register_module @LAYERS.register_module
class Linear2D(ParallelLayer): class Linear2D(ParallelLayer):
""" r"""Linear layer for 2D parallelism
Linear layer for 2D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
:param dtype: The dtype of parameters, defaults to None which is preserved for kernel fusion, defaults to False.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False The initializer of weight, defaults to kaiming uniform initializer.
:type skip_bias_add: bool, optional bias_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of bias, defaults to xavier uniform initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer For more details about ``initializer``, please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,
@ -119,18 +118,16 @@ class Linear2D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class LayerNorm2D(ParallelLayer): class LayerNorm2D(ParallelLayer):
r""" r"""Layer Normalization for 2D parallelism.
Layer Normalization for 2D parallelism
:param normalized_shape: input shape from an expected input Args:
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]` normalized_shape (int): input shape from an expected input of size.
If a single integer is used, it is treated as a singleton list, and this module will :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
normalize over the last dimension which is expected to be of that specific size. \times \ldots \times \text{normalized_shape}[-1]]`
:type normalized_shape: int If a single integer is used, it is treated as a singleton list, and this module will
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05 normalize over the last dimension which is expected to be of that specific size.
:type eps: float, optional eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional
""" """
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None): def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
super().__init__() super().__init__()
@ -189,27 +186,24 @@ class LayerNorm2D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class PatchEmbedding2D(ParallelLayer): class PatchEmbedding2D(ParallelLayer):
""" r"""2D Image to Patch Embedding.
2D Image to Patch Embedding
:param img_size: image size Args:
:type img_size: int img_size (int): image size.
:param patch_size: patch size patch_size (int): patch size.
:type patch_size: int in_chans (int): number of channels of input image.
:param in_chans: number of channels of input image embed_size (int): size of embedding.
:type in_chans: int dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param embed_size: size of embedding flatten (bool, optional): whether to flatten output tensor, defaults to True.
:type embed_size: int weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param flatten: whether to flatten output tensor, defaults to True The initializer of bias, defaults to xavier uniform initializer.
:type flatten: bool, optional position_embed_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of position embedding, defaults to zeros initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer More details about ``initializer`` please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
img_size: int, img_size: int,
@ -291,21 +285,33 @@ class PatchEmbedding2D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Embedding2D(ParallelLayer): class Embedding2D(ParallelLayer):
""" r"""Embedding for 2D parallelism.
Embedding for 2D parallelism
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
""" """
def __init__(self, def __init__(self,
num_embeddings: int, num_embeddings: int,
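The extra ``args``/``kwargs`` are forwarded to ``F.embedding``, so options such as ``max_norm`` or ``scale_grad_by_freq`` can be passed straight through; a hypothetical construction (import path assumed) could look like::

    from colossalai.nn.layer.parallel_2d import Embedding2D  # assumed export path

    # padding_idx freezes the pad vector; max_norm and scale_grad_by_freq are
    # forwarded unchanged to torch.nn.functional.embedding.
    embed = Embedding2D(num_embeddings=30522,
                        embedding_dim=768,
                        padding_idx=0,
                        max_norm=1.0,
                        scale_grad_by_freq=True)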
@ -358,20 +364,33 @@ class Embedding2D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelEmbedding2D(torch.nn.Module): class VocabParallelEmbedding2D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension. r"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
num_embeddings: int, num_embeddings: int,
@ -435,23 +454,21 @@ class VocabParallelEmbedding2D(torch.nn.Module):
@LAYERS.register_module @LAYERS.register_module
class Classifier2D(ParallelLayer): class Classifier2D(ParallelLayer):
""" r"""Classifier for 2D parallelism.
Classifier for 2D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,
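As a hypothetical usage sketch (import path and sizes are assumptions), the classifier head takes partitioned features and produces class logits; ``weight=None`` lets it allocate its own parameter, while an existing ``torch.nn.Parameter`` (e.g. a shared embedding weight) could be handed in instead::

    import torch
    from colossalai.nn.layer.parallel_2d import Classifier2D  # assumed export path

    head = Classifier2D(in_features=768, num_classes=1000, weight=None, bias=True)
    features = torch.randn(8, 768).cuda()   # this rank's partition of the features
    logits = head(features)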
@ -515,23 +532,21 @@ class Classifier2D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelClassifier2D(ParallelLayer): class VocabParallelClassifier2D(ParallelLayer):
""" r"""Vocab parallel classifier layer for 2D parallelism.
Vocab parallel classifier layer for 2D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,

View File

@ -100,35 +100,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
...], row_rank: int, col_rank: int, ...], row_rank: int, col_rank: int,
row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, data_parallel_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, data_parallel_rank: int,
pipeline_parallel_rank: int, pipeline_parallel_size: int, tensor_parallel_size: int) -> Tensor: pipeline_parallel_rank: int, pipeline_parallel_size: int, tensor_parallel_size: int) -> Tensor:
""" r"""Classifier.
Classifier
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor bias (:class:`torch.tensor`): matrix of bias.
:param bias: matrix of bias tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
:type bias: torch.tensor, optional out_shape (:class:`torch.size`): shape of output tensor.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism row_rank (int): the rank of row.
:type tesseract_dim: int col_rank (int): the rank of column.
:param out_shape: shape of output tensor row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type out_shape: tuple col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param row_rank: the rank of row data_parallel_rank (int): data parallel rank.
:type row_rank: int pipeline_parallel_rank (int): pipeline parallel rank.
:param col_rank: the rank of column pipeline_parallel_size (int): pipeline parallel size.
:type col_rank: int tensor_parallel_size (int): tensor parallel size.
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
return _Classifier2p5D.apply(A, B, bias, tesseract_dim, out_shape, row_rank, col_rank, row_parallel_mode, return _Classifier2p5D.apply(A, B, bias, tesseract_dim, out_shape, row_rank, col_rank, row_parallel_mode,
col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size, col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
@ -136,35 +127,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
class Matmul_AB_2p5D(torch.autograd.Function): class Matmul_AB_2p5D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = AB`.
Matrix multiplication for :math:`C = AB`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism out_shape (:class:`torch.size`): shape of output tensor.
:type tesseract_dim: int row_rank (int): the rank of row.
:param out_shape: shape of output tensor col_rank (int): the rank of column.
:type out_shape: tuple dep_rank (int): the rank of depth.
:param row_rank: the rank of row row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type row_rank: int col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param col_rank: the rank of column data_parallel_rank (int): data parallel rank.
:type col_rank: int pipeline_parallel_rank (int): pipeline parallel rank.
:param dep_rank: the rank of depth pipeline_parallel_size (int): pipeline parallel size.
:type dep_rank: int tensor_parallel_size (int): tensor parallel size.
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
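For intuition only, the single-process snippet below reproduces the block decomposition that ``Matmul_AB_2p5D`` distributes over the tesseract: every output block :math:`C_{ij}` accumulates :math:`\sum_k A_{ik} B_{kj}`. The distributed version performs the same accumulation with broadcasts inside the row and column process groups; nothing here is part of the commit itself::

    import torch

    q = 2                                            # tesseract dimension (blocks per side)
    A, B = torch.randn(8, 8), torch.randn(8, 8)
    A_blk = [list(r.chunk(q, dim=1)) for r in A.chunk(q, dim=0)]   # A_ik blocks
    B_blk = [list(r.chunk(q, dim=1)) for r in B.chunk(q, dim=0)]   # B_kj blocks

    # C_ij = sum_k A_ik @ B_kj -- the per-rank accumulation of the parallel kernel.
    C_blk = [[sum(A_blk[i][k] @ B_blk[k][j] for k in range(q)) for j in range(q)]
             for i in range(q)]
    C = torch.cat([torch.cat(row, dim=1) for row in C_blk], dim=0)
    assert torch.allclose(C, A @ B, atol=1e-5)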
@ -270,35 +252,26 @@ class Matmul_AB_2p5D(torch.autograd.Function):
class Matmul_ABT_2p5D(torch.autograd.Function): class Matmul_ABT_2p5D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = AB^T`.
Matrix multiplication for :math:`C = AB^T`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism out_shape (:class:`torch.size`): shape of output tensor.
:type tesseract_dim: int row_rank (int): the rank of row.
:param out_shape: shape of output tensor col_rank (int): the rank of column.
:type out_shape: tuple dep_rank (int): the rank of depth.
:param row_rank: the rank of row row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type row_rank: int col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param col_rank: the rank of column data_parallel_rank (int): data parallel rank.
:type col_rank: int pipeline_parallel_rank (int): pipeline parallel rank.
:param dep_rank: the rank of depth pipeline_parallel_size (int): pipeline parallel size.
:type dep_rank: int tensor_parallel_size (int): tensor parallel size.
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
@ -409,35 +382,26 @@ class Matmul_ABT_2p5D(torch.autograd.Function):
class Matmul_ATB_2p5D(torch.autograd.Function): class Matmul_ATB_2p5D(torch.autograd.Function):
""" r"""Matrix multiplication for :math:`C = A^TB`
Matrix multiplication for :math:`C = A^TB`
:param a: matrix :math:`A` Args:
:type a: torch.tensor A (:class:`torch.tensor`): matrix :math:`A`.
:param b: matrix :math:`B` B (:class:`torch.tensor`): matrix :math:`B`.
:type b: torch.tensor tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism out_shape (:class:`torch.size`): shape of output tensor.
:type tesseract_dim: int row_rank (int): the rank of row.
:param out_shape: shape of output tensor col_rank (int): the rank of column.
:type out_shape: tuple dep_rank (int): the rank of depth.
:param row_rank: the rank of row row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:type row_rank: int col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param col_rank: the rank of column data_parallel_rank (int): data parallel rank.
:type col_rank: int pipeline_parallel_rank (int): pipeline parallel rank.
:param dep_rank: the rank of depth pipeline_parallel_size (int): pipeline parallel size.
:type dep_rank: int tensor_parallel_size (int): tensor parallel size.
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
@staticmethod @staticmethod
@ -629,36 +593,27 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
col_rank: int, dep_rank: int, col_parallel_mode: ParallelMode, skip_bias_add: bool, col_rank: int, dep_rank: int, col_parallel_mode: ParallelMode, skip_bias_add: bool,
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int, data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
tensor_parallel_size: int) -> Tensor: tensor_parallel_size: int) -> Tensor:
""" r"""Matrix add bias: :math:`C = A + b`.
Matrix add bias: :math:`C = A + b`
:param input: matrix :math:`A` Args:
:type input: torch.tensor input (:class:`torch.tensor`): matrix :math:`A`.
:param bias: matrix :math:`b` bias (:class:`torch.tensor`): matrix :math:`b`.
:type bias: torch.tensor tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
:param output_size_per_partition: output size in each partition output_size_per_partition (int): output size in each partition.
:type output_size_per_partition: int row_rank (int): the rank of row.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism col_rank (int): the rank of column.
:type tesseract_dim: int dep_rank (int): the rank of depth.
:param row_rank: the rank of row col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type row_rank: int skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
:param col_rank: the rank of column which is preserved for kernel fusion.
:type col_rank: int data_parallel_rank (int): data parallel rank.
:param row_parallel_mode: row parallel mode pipeline_parallel_rank (int): pipeline parallel rank.
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode pipeline_parallel_size (int): pipeline parallel size.
:param col_parallel_mode: column parallel mode tensor_parallel_size (int): tensor parallel size.
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, Note:
which is preserved for kernel fusion The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type skip_bias_add: bool in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
""" """
return _Add_Bias_2p5D.apply(input, bias, output_size_per_partition, tesseract_dim, row_rank, col_rank, dep_rank, return _Add_Bias_2p5D.apply(input, bias, output_size_per_partition, tesseract_dim, row_rank, col_rank, dep_rank,
col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank, col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
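Ignoring the partitioning and rank bookkeeping, the ``skip_bias_add`` contract can be pictured with a plain single-process stand-in (illustrative only, not the parallel implementation)::

    import torch

    def add_bias_reference(x: torch.Tensor, bias: torch.Tensor, skip_bias_add: bool):
        # When skip_bias_add is True the bias is handed back separately so that a
        # later fused kernel can apply it; otherwise it is added here.
        if skip_bias_add:
            return x, bias
        return x + bias

    out = add_bias_reference(torch.randn(4, 16), torch.zeros(16), skip_bias_add=False)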
@ -666,19 +621,18 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
class _Layernorm2p5D(torch.autograd.Function): class _Layernorm2p5D(torch.autograd.Function):
""" r"""Layernorm.
Layernorm
:param input: input maxtrix Args:
:type input: torch.tensor input (:class:`torch.tensor`): input matrix.
:param E_x: mean E_x (:class:`torch.tensor`): mean.
:type E_x: torch.tensor Var_x (:class:`torch.tensor`): variance.
:param Var_x: variance hidden_size (int): hidden size.
:type Var_x: torch.tensor row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param hidden_size: hidden size
:type hidden_size: int Note:
:param row_parallel_mode: row parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
@staticmethod @staticmethod
@ -718,19 +672,18 @@ class _Layernorm2p5D(torch.autograd.Function):
def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int,
row_parallel_mode: ParallelMode) -> Tensor: row_parallel_mode: ParallelMode) -> Tensor:
""" r"""Layernorm.
Layernorm
:param input: input maxtrix Args:
:type input: torch.tensor input (:class:`torch.tensor`): input matrix.
:param E_x: mean E_x (:class:`torch.tensor`): mean.
:type E_x: torch.tensor Var_x (:class:`torch.tensor`): variance.
:param Var_x: variance hidden_size (int): hidden size.
:type Var_x: torch.tensor row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
:param hidden_size: hidden size
:type hidden_size: int Note:
:param row_parallel_mode: row parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _Layernorm2p5D.apply(input, E_x, Var_x, hidden_size, row_parallel_mode) return _Layernorm2p5D.apply(input, E_x, Var_x, hidden_size, row_parallel_mode)
@ -753,29 +706,31 @@ class _AllGatherTensor2p5D(torch.autograd.Function):
def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: ParallelMode) -> Tensor: def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: ParallelMode) -> Tensor:
""" r"""all gather the weight of 2.5D parallelism.
all gather the weight of 2.5D parallelism
:param inputs: input maxtrix Args:
:type inputs: torch.tensor inputs (:class:`torch.tensor`): input tensor.
:param dim: dimension of all gather dim (int): dimension of all-gather.
:type dim: int col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int Note:
:param col_parallel_mode: column parallel mode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _AllGatherTensor2p5D.apply(inputs, dim, col_parallel_mode) return _AllGatherTensor2p5D.apply(inputs, dim, col_parallel_mode)
class SplitFirst(torch.autograd.Function): class SplitFirst(torch.autograd.Function):
""" r"""
:param inputs: input maxtrix
:type inputs: torch.tensor Args:
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism inputs (:class:`torch.tensor`): input tensor.
:type tesseract_dim: int tesseract_dim (int): dimension of TESSERACT fo 2.5D parallelism
:param col_parallel_mode: column parallel mode col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
@staticmethod @staticmethod
@ -801,16 +756,14 @@ class SplitFirst(torch.autograd.Function):
def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor: def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor:
"""Splits 2P5D tensor in specified dimension across cols """Splits 2P5D tensor in specified dimension across cols.
:param input_: Input tensor Args:
:param dim: Specified dimension in which to split input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
:type input_: torch.Tensor Returns:
:type dim: int, optional :class:`torch.tensor`: The split tensor.
:return output: Splitted tensor
:rtype output: torch.Tensor
""" """
if input_.size(dim) <= 1: if input_.size(dim) <= 1:
return input_ return input_
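Conceptually, each rank keeps only its own chunk along ``dim``; a single-process approximation with ``torch.chunk`` (the real function indexes by the column-parallel rank of the tesseract) is sketched below under those assumptions::

    import torch

    def split_reference(x: torch.Tensor, dim: int, world_size: int, rank: int) -> torch.Tensor:
        # Mirrors the guard above: nothing to split when the dimension has size <= 1.
        if x.size(dim) <= 1:
            return x
        return torch.chunk(x, world_size, dim=dim)[rank].contiguous()

    local = split_reference(torch.randn(8, 16), dim=0, world_size=2, rank=1)  # shape (4, 16)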
@ -829,11 +782,15 @@ class _ReduceTensor2p5D(torch.autograd.Function):
def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor: def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
""" r"""All-reduce the input.
All-reduce the input.
:param input_: input tensor Args:
:param parallel_mode: parallel mode input_ (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
return _ReduceTensor2p5D.apply(input_, parallel_mode) return _ReduceTensor2p5D.apply(input_, parallel_mode)
@ -851,11 +808,16 @@ class _ReduceScatterTensor2p5D(torch.autograd.Function):
def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
""" r"""Reduce-scatter the input.
Reduce-scatter the input.
:param input_: input tensor Args:
:param parallel_mode: parallel mode input_ (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to reduce.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode used for the reduce-scatter.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
return _ReduceScatterTensor2p5D.apply(input_, dim, parallel_mode) return _ReduceScatterTensor2p5D.apply(input_, dim, parallel_mode)
@ -890,12 +852,11 @@ class _RreduceByBatch2p5D(torch.autograd.Function):
def reduce_by_batch_2p5d(input_, reduce_mean: bool = False) -> Tensor: def reduce_by_batch_2p5d(input_, reduce_mean: bool = False) -> Tensor:
""" r"""All-reduce the input from the model parallel region.
All-reduce the input from the model parallel region.
:param input_: input maxtrix Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param reduce_mean: If set to ``True``, it will divide the output by column parallel size, default to False reduce_mean (bool, optional):
:type reduce_mean: bool, optional If set to ``True``, it will divide the output by column parallel size, defaults to ``False``.
""" """
return _RreduceByBatch2p5D.apply(input_, reduce_mean) return _RreduceByBatch2p5D.apply(input_, reduce_mean)

View File

@ -23,21 +23,22 @@ from ._utils import assert_tesseract_initialization, get_tesseract_dim_dep_from_
@LAYERS.register_module @LAYERS.register_module
class Linear2p5D(ParallelLayer): class Linear2p5D(ParallelLayer):
""" r"""Linear layer for 2.5D parallelism.
Linear layer for 2.5D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
:param dtype: The dtype of parameters, defaults to None which is preserved for kernel fusion, defaults to False.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of weight, defaults to kaiming uniform initializer.
:type weight_initializer: typing.Callable, optional bias_initializer (:class:`typing.Callable`, optional):
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer The initializer of bias, defaults to xavier uniform initializer.
:type bias_initializer: typing.Callable, optional
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,
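A hypothetical construction (import path and the 2.5D launch configuration are assumptions) mirrors the 2D case; setting ``skip_bias_add=True`` would defer the bias so it can be fused into a later kernel::

    import torch
    from colossalai.nn.layer.parallel_2p5d import Linear2p5D  # assumed export path

    # Assumes colossalai.launch(...) with a 2.5D setting, e.g.
    # tensor=dict(size=8, mode='2.5d', depth=2).
    proj = Linear2p5D(in_features=1024, out_features=1024, bias=True, skip_bias_add=False)
    x = torch.randn(4, 1024).cuda()   # this rank's partition of the input
    y = proj(x)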
@ -131,19 +132,16 @@ class Linear2p5D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class LayerNorm2p5D(ParallelLayer): class LayerNorm2p5D(ParallelLayer):
r""" r"""Layer Normalization for 2.5D parallelism.
Layer Normalization for 2.5D parallelism
:param normalized_shape: input shape from an expected input of size. Args:
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] normalized_shape (int): input shape from an expected input of size.
\times \ldots \times \text{normalized_shape}[-1]]` :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
If a single integer is used, it is treated as a singleton list, and this module will \times \ldots \times \text{normalized_shape}[-1]]`
normalize over the last dimension which is expected to be of that specific size. If a single integer is used, it is treated as a singleton list, and this module will
:type normalized_shape: int normalize over the last dimension which is expected to be of that specific size.
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05 eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
:type eps: float, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
""" """
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None): def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
super().__init__() super().__init__()
@ -204,27 +202,24 @@ class LayerNorm2p5D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class PatchEmbedding2p5D(ParallelLayer): class PatchEmbedding2p5D(ParallelLayer):
""" r"""2D Image to Patch Embedding.
2D Image to Patch Embedding
:param img_size: image size Args:
:type img_size: int img_size (int): image size.
:param patch_size: patch size patch_size (int): patch size.
:type patch_size: int in_chans (int): number of channels of input image.
:param in_chans: number of channels of input image embed_size (int): size of embedding.
:type in_chans: int dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param embed_size: size of embedding flatten (bool, optional): whether to flatten output tensor, defaults to True.
:type embed_size: int weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param flatten: whether to flatten output tensor, defaults to True The initializer of bias, defaults to xavier uniform initializer.
:type flatten: bool, optional position_embed_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of position embedding, defaults to zeros initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer More details about ``initializer`` please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
img_size: int, img_size: int,
@ -306,21 +301,33 @@ class PatchEmbedding2p5D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Embedding2p5D(ParallelLayer): class Embedding2p5D(ParallelLayer):
""" r"""Embedding for 2.5D parallelism.
Embedding for 2.5D parallelism
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
""" """
def __init__(self, def __init__(self,
num_embeddings: int, num_embeddings: int,
@ -376,18 +383,31 @@ class Embedding2p5D(ParallelLayer):
class VocabParallelEmbedding2p5D(torch.nn.Module): class VocabParallelEmbedding2p5D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension. """Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
num_embeddings: int, num_embeddings: int,
@ -455,23 +475,21 @@ class VocabParallelEmbedding2p5D(torch.nn.Module):
@LAYERS.register_module @LAYERS.register_module
class Classifier2p5D(ParallelLayer): class Classifier2p5D(ParallelLayer):
""" r"""Classifier for 2.5D parallelism.
Classifier for 2.5D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,
@ -537,23 +555,21 @@ class Classifier2p5D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelClassifier2p5D(ParallelLayer): class VocabParallelClassifier2p5D(ParallelLayer):
""" r"""Vocab parallel classifier layer for 2.5D parallelism.
Vocab parallel classifier layer for 2.5D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
in_features: int, in_features: int,

View File

@ -88,27 +88,22 @@ def linear_3d(input_: Tensor,
input_dim: int = 0, input_dim: int = 0,
weight_dim: int = -1, weight_dim: int = -1,
output_dim: int = 0) -> Tensor: output_dim: int = 0) -> Tensor:
""" r"""Linear layer for 3D parallelism.
Linear layer for 3D parallelism
:param input_: matrix of input Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param weight: matrix of weight weight (:class:`torch.tensor`): matrix of weight.
:type weight: torch.tensor bias (:class:`torch.tensor`): matrix of bias.
:param bias: matrix of bias input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
:type bias: torch.tensor, optional weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
:param input_parallel_mode: input parallel mode output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode input_dim (int, optional): dimension of input, defaults to 0.
:param weight_parallel_mode: weight parallel mode weight_dim (int, optional): dimension of weight, defaults to -1.
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode output_dim (int, optional): dimension of output, defaults to 0.
:param output_parallel_mode: output parallel mode
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode Note:
:param input_dim: dimension of input, defaults to 0 The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type input_dim: int, optional in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:param weight_dim: dimension of weight, defaults to -1
:type weight_dim: int, optional
:param output_dim: dimension of output, defaults to 0
:type output_dim: int, optional
""" """
return _Linear3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode, return _Linear3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode,
input_dim, weight_dim, output_dim) input_dim, weight_dim, output_dim)
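A hedged sketch of calling the functional form (the import path is an assumption; ``PARALLEL_3D_INPUT`` and ``PARALLEL_3D_WEIGHT`` follow the 3D defaults used later in this file, and ``PARALLEL_3D_OUTPUT`` is assumed to exist alongside them; all tensors are per-rank partitions)::

    import torch
    from colossalai.context import ParallelMode
    from colossalai.nn.layer.parallel_3d import linear_3d  # assumed export path

    x = torch.randn(4, 256).cuda()         # partition of the input
    weight = torch.randn(256, 512).cuda()  # partition of the weight
    bias = torch.zeros(512).cuda()         # partition of the bias
    out = linear_3d(x, weight, bias,
                    ParallelMode.PARALLEL_3D_INPUT,
                    ParallelMode.PARALLEL_3D_WEIGHT,
                    ParallelMode.PARALLEL_3D_OUTPUT)  # assumed member name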
@ -174,21 +169,19 @@ class _Classifier3D(torch.autograd.Function):
def classifier_3d(input_: Tensor, weight: Tensor, bias: Optional[Tensor], input_parallel_mode: ParallelMode, def classifier_3d(input_: Tensor, weight: Tensor, bias: Optional[Tensor], input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor: weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor:
""" r"""3D parallel classifier.
3D parallel classifier
:param input_: matrix of input Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param weight: matrix of weight weight (:class:`torch.tensor`): matrix of weight.
:type weight: torch.tensor bias (:class:`torch.tensor`): matrix of bias.
:param bias: matrix of bias input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
:type bias: torch.tensor, optional weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
:param input_parallel_mode: input parallel mode output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: weight parallel mode Note:
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:param output_parallel_mode: output parallel mode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
""" """
return _Classifier3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode) return _Classifier3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode)
@ -244,48 +237,44 @@ class _Layernorm3D(torch.autograd.Function):
def layernorm_3d(input_: Tensor, weight: Tensor, bias: Tensor, normalized_shape: int, eps: float, def layernorm_3d(input_: Tensor, weight: Tensor, bias: Tensor, normalized_shape: int, eps: float,
input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode, input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode,
output_parallel_mode: ParallelMode) -> Tensor: output_parallel_mode: ParallelMode) -> Tensor:
r""" r"""3D parallel Layernorm.
3D parallel Layernorm
:param input_: input maxtrix Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param weight: matrix of weight weight (:class:`torch.tensor`): matrix of weight.
:type weight: torch.tensor bias (:class:`torch.tensor`): matrix of bias.
:param bias: matrix of bias normalized_shape (int): input shape from an expected input of size.
:type bias: torch.tensor :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
:param normalized_shape: input shape from an expected input of size. \times \ldots \times \text{normalized_shape}[-1]]`
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] If a single integer is used, it is treated as a singleton list, and this module will
\times \ldots \times \text{normalized_shape}[-1]]` normalize over the last dimension which is expected to be of that specific size.
If a single integer is used, it is treated as a singleton list, and this module will eps (float): a value added to the denominator for numerical stability
normalize over the last dimension which is expected to be of that specific size. input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
:type normalized_shape: int weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
:param eps: a value added to the denominator for numerical stability eps (float): a value added to the denominator for numerical stability.
:type eps: float
:param input_parallel_mode: input parallel mode Note:
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
:param weight_parallel_mode: weight parallel mode in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param output_parallel_mode: output parallel mode
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
""" """
return _Layernorm3D.apply(input_, weight, bias, normalized_shape, eps, input_parallel_mode, weight_parallel_mode, return _Layernorm3D.apply(input_, weight, bias, normalized_shape, eps, input_parallel_mode, weight_parallel_mode,
output_parallel_mode) output_parallel_mode)
def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""Splits 3D parallel tensor in specified dimension r"""Splits 3D parallel tensor in specified dimension.
:param tensor: Input tensor Args:
:param dim: Specified dimension in which to split tensor (:class:`torch.tensor`): Input tensor.
:param parallel_mode: Parallel mode dim (int): Specified dimension in which to split.
:param weight_parallel_mode: Weight parallel mode parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode.
:type tensor: torch.Tensor Returns:
:type dim: int :class:`torch.tensor`: The split tensor.
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode
:return output: Splitted tensor Note:
:rtype output: torch.Tensor The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
if tensor.size(dim) <= 1: if tensor.size(dim) <= 1:
return tensor return tensor
@ -298,17 +287,20 @@ def split_batch_3d(input_: Tensor,
dim: int = 0, dim: int = 0,
input_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_INPUT, input_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_INPUT,
weight_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_WEIGHT) -> Tensor: weight_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_WEIGHT) -> Tensor:
"""Splits 3D tensor in batch r"""Splits 3D tensor in batch.
:param input_: Input tensor
:param dim: Specified dimension in which to split Args:
:param input_parallel_mode: Input parallel mode input_ (:class:`torch.tensor`): Input tensor.
:param weight_parallel_mode: Weight parallel mode dim (int): Specified dimension in which to split.
:type input_: torch.Tensor input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): input parallel mode.
:type dim: int, optional weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): weight parallel mode.
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional Returns:
:return output: Splitted tensor :class:`torch.tensor`: The tensor has been split.
:rtype output: torch.Tensor
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
if input_.size(dim) <= 1: if input_.size(dim) <= 1:
return input_ return input_
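As an illustrative sketch (import path assumed), the batch split is typically applied to the globally loaded batch before it enters the model, using the default input/weight parallel modes from the signature above::

    import torch
    from colossalai.nn.layer.parallel_3d import split_batch_3d  # assumed export path

    global_images = torch.randn(64, 3, 224, 224).cuda()
    # Each rank keeps only its share of the batch along dim 0.
    local_images = split_batch_3d(global_images, dim=0)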
@ -333,11 +325,15 @@ class _ReduceTensor3D(torch.autograd.Function):
def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor: def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor:
""" r"""All-reduce the input
All-reduce the input
:param tensor: Input tensor Args:
:param parallel_mode: Parallel mode tensor (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _ReduceTensor3D.apply(tensor, parallel_mode) return _ReduceTensor3D.apply(tensor, parallel_mode)
@ -358,11 +354,16 @@ class _AllGatherTensor3D(torch.autograd.Function):
def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
""" r"""All-reduce the gradient in backward pass.
All-reduce the gradient in backward pass.
:param tensor: Input tensor Args:
:param parallel_mode: Parallel mode tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to gather.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
""" """
return _AllGatherTensor3D.apply(tensor, dim, parallel_mode) return _AllGatherTensor3D.apply(tensor, dim, parallel_mode)
@ -382,12 +383,16 @@ class _ReduceScatterTensor3D(torch.autograd.Function):
def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor: def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
""" r"""Reduce-scatter the input.
Reduce-scatter the input.
:param tensor: Input tensor Args:
:param dim: Dimension to scatter tensor (:class:`torch.tensor`): Input tensor.
:param parallel_mode: Parallel mode dim (int): Dimension to scatter.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
return _ReduceScatterTensor3D.apply(tensor, dim, parallel_mode) return _ReduceScatterTensor3D.apply(tensor, dim, parallel_mode)
@ -423,34 +428,33 @@ def reduce_by_batch_3d(tensor: Tensor,
input_parallel_mode: ParallelMode, input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode,
reduce_mean: bool = False) -> Tensor: reduce_mean: bool = False) -> Tensor:
""" r"""All-reduce the input from the model parallel region.
All-reduce the input from the model parallel region.
:param input_: input maxtrix Args:
:type input_: torch.tensor input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
:param input_parallel_mode: input parallel mode weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode reduce_mean (bool, optional): If set to ``True``, it will divide the output by
:param weight_parallel_mode: weight parallel mode (input parallel size * weight parallel size), default to False.
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param reduce_mean: If set to ``True``, it will divide the output by (input parallel size * weight parallel size), Note:
default to False The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
:type reduce_mean: int, optional in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
return _ReduceByBatch3D.apply(tensor, input_parallel_mode, weight_parallel_mode, reduce_mean) return _ReduceByBatch3D.apply(tensor, input_parallel_mode, weight_parallel_mode, reduce_mean)
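The collective helpers above mirror the usual all-reduce / all-gather / reduce-scatter primitives, restricted to one of the 3D process groups. A hedged sketch, assuming an initialized 3D context and the same approximate import path as ``split_batch_3d``:

    import torch
    from colossalai.context import ParallelMode
    # approximate import path for the 3D collective helpers
    from colossalai.nn.layer.parallel_3d._operation import (
        all_gather_tensor_3d, reduce_by_batch_3d, reduce_scatter_tensor_3d, reduce_tensor_3d)

    shard = torch.randn(4, 256).cuda()                                          # per-rank shard
    full = all_gather_tensor_3d(shard, -1, ParallelMode.PARALLEL_3D_WEIGHT)     # concat shards on the last dim
    back = reduce_scatter_tensor_3d(full, -1, ParallelMode.PARALLEL_3D_WEIGHT)  # sum, then re-shard
    summed = reduce_tensor_3d(shard, ParallelMode.PARALLEL_3D_OUTPUT)           # plain all-reduce

    correct = torch.tensor([32.0]).cuda()          # e.g. per-rank count of correct predictions
    mean_correct = reduce_by_batch_3d(correct, ParallelMode.PARALLEL_3D_INPUT,
                                      ParallelMode.PARALLEL_3D_WEIGHT, reduce_mean=True)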
class _BroadcastWeight3D_FromDiagonal(torch.autograd.Function): class _BroadcastWeight3D_FromDiagonal(torch.autograd.Function):
""" r"""broadcast weight from diagonal.
broadcast weight from diagonal
:param input_: input maxtrix Args:
:type input_: torch.tensor input_ (:class:`torch.tensor`): input matrix.
:param input_parallel_mode: input parallel mode input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
:param weight_parallel_mode: weight parallel mode output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: output parallel mode Note:
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
@staticmethod @staticmethod

View File

@ -24,19 +24,16 @@ from ._utils import get_depth_from_env, get_last_group, get_parallel_mode_from_e
@LAYERS.register_module @LAYERS.register_module
class LayerNorm3D(ParallelLayer): class LayerNorm3D(ParallelLayer):
r""" r"""Layer Normalization for 3D parallelism.
Layer Normalization for 3D parallelism
:param normalized_shape: input shape from an expected input of size. Args:
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] normalized_shape (int): input shape from an expected input of size.
\times \ldots \times \text{normalized_shape}[-1]]` :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
If a single integer is used, it is treated as a singleton list, and this module will \times \ldots \times \text{normalized_shape}[-1]]`
normalize over the last dimension which is expected to be of that specific size. If a single integer is used, it is treated as a singleton list, and this module will
:type normalized_shape: int normalize over the last dimension which is expected to be of that specific size.
:param eps: a value added to the denominator for numerical stability, defaults to 1e-12 eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-12.
:type eps: float, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
""" """
def __init__(self, normalized_shape: int, eps: float = 1e-12, dtype=None): def __init__(self, normalized_shape: int, eps: float = 1e-12, dtype=None):
@ -71,21 +68,20 @@ class LayerNorm3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Linear3D(ParallelLayer): class Linear3D(ParallelLayer):
""" r"""Linear layer for 3D parallelism.
Linear layer for 3D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param out_features: size of each output sample out_features (int): size of each output sample.
:type out_features: int bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type bias: bool, optional weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of bias, defaults to xavier uniform initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer More details about ``initializer`` please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,
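Constructing the 3D layers looks just like their ``torch.nn`` counterparts; a sketch, assuming an initialized 3D parallel context and that the classes are re-exported from ``colossalai.nn``:

    import torch
    import colossalai.nn as col_nn   # assumed re-export location of the 3D layers

    norm = col_nn.LayerNorm3D(1024, eps=1e-12, dtype=torch.float16)
    fc = col_nn.Linear3D(1024, 4096, bias=True, dtype=torch.float16)
    # the weight and bias of both layers are sharded over the input/weight/output
    # process groups, so each rank stores only a fraction of the 1024 x 4096 matrix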
@ -146,23 +142,21 @@ class Linear3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Classifier3D(ParallelLayer): class Classifier3D(ParallelLayer):
""" r"""Classifier for 3D parallelism.
Classifier for 3D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -225,23 +219,21 @@ class Classifier3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelClassifier3D(ParallelLayer): class VocabParallelClassifier3D(ParallelLayer):
""" r"""Vocab parallel classifier layer for 3D parallelism.
Vocab parallel classifier layer for 2D parallelism
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
:type weight: torch.nn.Parameter, optional dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True`` weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about ``initializer`` please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -311,27 +303,24 @@ class VocabParallelClassifier3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class PatchEmbedding3D(ParallelLayer): class PatchEmbedding3D(ParallelLayer):
""" r"""2D Image to Patch Embedding.
2D Image to Patch Embedding
:param img_size: image size Args:
:type img_size: int img_size (int): image size.
:param patch_size: patch size patch_size (int): patch size.
:type patch_size: int in_chans (int): number of channels of input image.
:param in_chans: number of channels of input image embed_size (int): size of embedding.
:type in_chans: int dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param embed_size: size of embedding flatten (bool, optional): whether to flatten output tensor, defaults to True.
:type embed_size: int weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param flatten: whether to flatten output tensor, defaults to True The initializer of bias, defaults to xavier uniform initializer.
:type flatten: bool, optional position_embed_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of position embedding, defaults to zeros initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer More details about ``initializer`` please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -419,21 +408,33 @@ class PatchEmbedding3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class Embedding3D(ParallelLayer): class Embedding3D(ParallelLayer):
""" r"""Embedding for 3D parallelism.
Embedding for 3D parallelism
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
""" """
def __init__(self, def __init__(self,
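The extra keyword arguments listed above are forwarded to ``torch.nn.functional.embedding``, so they can be passed straight to the constructor; a hedged sketch (re-export from ``colossalai.nn`` is assumed):

    import torch
    import colossalai.nn as col_nn   # assumed re-export location

    emb = col_nn.Embedding3D(num_embeddings=50304,
                             embedding_dim=1024,
                             padding_idx=0,
                             dtype=torch.float16,
                             max_norm=1.0,             # forwarded to F.embedding
                             scale_grad_by_freq=True)  # forwarded to F.embedding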
@ -491,20 +492,33 @@ class Embedding3D(ParallelLayer):
@LAYERS.register_module @LAYERS.register_module
class VocabParallelEmbedding3D(torch.nn.Module): class VocabParallelEmbedding3D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension. r"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings Args:
:type num_embeddings: int num_embeddings (int): number of embeddings.
:param embedding_dim: dimension of embedding embedding_dim (int): dimension of embedding.
:type embedding_dim: int padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
:param padding_idx: index of padding, defaults to None therefore, the embedding vector at padding_idx is not updated during training,
:type padding_idx: int, optional i.e. it remains as a fixed pad, defaults to None.
:param dtype: The dtype of parameters, defaults to None dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type dtype: torch.dtype, optional weight_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to normal initializer The initializer of weight, defaults to normal initializer.
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
:param kwargs: Kwargs used in F.embedding ::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
""" """
def __init__(self, def __init__(self,

View File

@ -24,14 +24,13 @@ class TransformerSelfAttentionRing(nn.Module):
Self-attention layer takes input with size [b, s, h] Self-attention layer takes input with size [b, s, h]
and returns output of the same size. and returns output of the same size.
:param hidden_size: hidden size Args:
:type hidden_size: int hidden_size (int): hidden size.
:param kv_channels: channels of key/value tensor num_attention_heads (int): number of attention heads.
:type kv_channels: int attention_dropout (float): dropout probability for attention layer.
:param num_attention_heads: number of attention heads attention_mask_func (:class:`typing.Callable`): Mask function to be applied.
:type num_attention_heads: int layer_number (int): number of layers.
:param attention_dropout: dropout probability for attention layer
:type attention_dropout: float
""" """
def __init__(self, def __init__(self,

View File

@ -38,11 +38,16 @@ class CheckpointModule(nn.Module):
def divide(numerator, denominator): def divide(numerator, denominator):
"""Only allow exact division """Only allow exact division.
:param numerator: Numerator of the division Args:
:param denominator: Denominator of the division numerator (int): Numerator of the division.
denominator (int): Denominator of the division.
Returns:
int: the result of exact division.
""" """
assert denominator != 0, 'denominator can not be zero'
assert numerator % denominator == 0, \ assert numerator % denominator == 0, \
'{} is not divisible by {}'.format(numerator, denominator) '{} is not divisible by {}'.format(numerator, denominator)
return numerator // denominator return numerator // denominator
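A quick sketch of the contract ``divide`` enforces (import path approximate):

    # approximate import path -- divide is defined alongside CheckpointModule above
    from colossalai.nn.layer.utils import divide

    assert divide(12, 4) == 3
    # divide(10, 4) raises AssertionError: "10 is not divisible by 4"
    # divide(10, 0) raises AssertionError: "denominator can not be zero"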

View File

@ -15,11 +15,16 @@ from ..utils import to_2tuple
def drop_path(x, drop_prob: float = 0., training: bool = False): def drop_path(x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument. 'survival rate' as the argument.
Args:
        drop_prob (float, optional): probability of dropping path, defaults to 0.0.
        training (bool, optional): whether the model is in training mode, defaults to False.
""" """
if drop_prob == 0. or not training: if drop_prob == 0. or not training:
return x return x
@ -35,6 +40,9 @@ class DropPath(nn.Module):
""" """
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
Args:
        drop_prob (float, optional): probability of dropping path, defaults to None.
""" """
def __init__(self, drop_prob=None): def __init__(self, drop_prob=None):
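Both the functional and module forms drop whole samples; in the standard stochastic-depth formulation the surviving samples are rescaled by ``1 / (1 - drop_prob)``. A sketch with an approximate import path:

    import torch
    # approximate import path for the helpers defined in this file
    from colossalai.nn.layer.vanilla.layers import DropPath, drop_path

    x = torch.ones(4, 3)                              # 4 samples in the batch
    y = drop_path(x, drop_prob=0.5, training=True)    # ~half the rows zeroed, survivors scaled by 2
    layer = DropPath(drop_prob=0.2)
    layer.train()
    z = layer(x)                                      # same behaviour, wrapped as a module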
@ -46,7 +54,19 @@ class DropPath(nn.Module):
class WrappedDropout(nn.Module): class WrappedDropout(nn.Module):
"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager. r"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager. During training, randomly zeroes
some elements of the input tensor with probability p using samples from a Bernoulli distribution. Each
channel will be zeroed out independently on every forward call. Furthermore, the outputs are scaled by a factor of
1/(1-p) during training. This means that during evaluation the module simply computes an identity function.
Args:
        p (float, optional): probability of an element to be zeroed, defaults to 0.5.
        inplace (bool, optional): whether to do dropout in-place, defaults to False.
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
def __init__(self, p: float = 0.5, inplace: bool = False, mode=None): def __init__(self, p: float = 0.5, inplace: bool = False, mode=None):
@ -74,8 +94,16 @@ class WrappedDropout(nn.Module):
class WrappedDropPath(nn.Module): class WrappedDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). r"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Here, it is wrapped with the context of seed manager. Here, it is wrapped with the context of seed manager.
Args:
        p (float, optional): probability of dropping path, defaults to 0.0.
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
def __init__(self, p: float = 0., mode=None): def __init__(self, p: float = 0., mode=None):
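A sketch of plugging the seed-managed wrappers into a block; the ``TENSOR`` mode and the import path are assumptions, and an initialized parallel context is required so the corresponding seeds exist:

    import torch.nn as nn
    from colossalai.context import ParallelMode
    # approximate import path for the wrapped layers defined in this file
    from colossalai.nn.layer.vanilla.layers import WrappedDropout, WrappedDropPath

    block = nn.Sequential(
        nn.Linear(1024, 4096),
        WrappedDropout(p=0.1, mode=ParallelMode.TENSOR),   # mask drawn under the TENSOR-mode seed
        WrappedDropPath(p=0.1, mode=ParallelMode.TENSOR),  # stochastic depth under the same seed context
    )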
@ -101,27 +129,25 @@ class WrappedDropPath(nn.Module):
@LAYERS.register_module @LAYERS.register_module
class VanillaPatchEmbedding(nn.Module): class VanillaPatchEmbedding(nn.Module):
""" r"""
2D Image to Patch Embedding 2D Image to Patch Embedding
:param img_size: image size Args:
:type img_size: int img_size (int): image size.
:param patch_size: patch size patch_size (int): patch size.
:type patch_size: int in_chans (int): number of channels of input image.
:param in_chans: number of channels of input image embed_size (int): size of embedding.
:type in_chans: int dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:param embed_size: size of embedding flatten (bool, optional): whether to flatten output tensor, defaults to True.
:type embed_size: int weight_initializer (:class:`typing.Callable`, optional):
:param dtype: The dtype of parameters, defaults to None The initializer of weight, defaults to kaiming uniform initializer.
:type dtype: torch.dtype, optional bias_initializer (:class:`typing.Callable`, optional):
:param flatten: whether to flatten output tensor, defaults to True The initializer of bias, defaults to xavier uniform initializer.
:type flatten: bool, optional position_embed_initializer (:class:`typing.Callable`, optional):
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer The initializer of position embedding, defaults to zeros initializer.
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer More details about initializer please refer to
:type bias_initializer: typing.Callable, optional `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
@ -174,23 +200,21 @@ class VanillaPatchEmbedding(nn.Module):
@LAYERS.register_module @LAYERS.register_module
class VanillaClassifier(nn.Module): class VanillaClassifier(nn.Module):
""" r"""Dense linear classifier.
Dense linear classifier
:param in_features: size of each input sample Args:
:type in_features: int in_features (int): size of each input sample.
:param num_classes: number of classes num_classes (int): number of classes.
:type num_classes: int weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
:param weight: weight of the classifier, defaults to True dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
:type weight: torch.nn.Parameter, optional flatten (bool, optional): whether to flatten output tensor, defaults to True.
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True weight_initializer (:class:`typing.Callable`, optional):
:type bias: bool, optional The initializer of weight, defaults to kaiming uniform initializer.
:param dtype: The dtype of parameters, defaults to None bias_initializer (:class:`typing.Callable`, optional):
:type dtype: torch.dtype, optional The initializer of bias, defaults to xavier uniform initializer.
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional More details about initializer please refer to
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
:type bias_initializer: typing.Callable, optional
""" """
def __init__(self, def __init__(self,
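A shape sketch for ``VanillaPatchEmbedding`` (import path approximate):

    import torch
    from colossalai.nn.layer.vanilla.layers import VanillaPatchEmbedding  # approximate path

    patch_embed = VanillaPatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_size=768)
    imgs = torch.randn(2, 3, 224, 224)
    tokens = patch_embed(imgs)
    # 224 / 16 = 14 patches per side -> 196 patch tokens of dimension 768 per image
    # (plus a class token if the layer prepends one; it also owns the position embedding)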

View File

@ -9,12 +9,11 @@ from colossalai.registry import LAYERS
@LAYERS.register_module @LAYERS.register_module
class LambdaWrapper(nn.Module): class LambdaWrapper(nn.Module):
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them """Wrap a function to nn.Module, which takes a config of layers and can fully access them.
:param func: User customed function Args:
:type func: Callable func (``Callable``): User customed function.
:param layers_cfg: Config of layers, defaults to None layers_cfg (dict, optional): Config of layers, defaults to None.
:type layers_cfg: dict, optional
""" """
def __init__(self, func, layers_cfg: dict = None): def __init__(self, func, layers_cfg: dict = None):

View File

@ -86,12 +86,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
@LOSSES.register_module @LOSSES.register_module
class VocabParallelCrossEntropyLoss1D(_Loss): class VocabParallelCrossEntropyLoss1D(_Loss):
""" """Vocab parallel cross entropy loss for 1D parallelism.
Vocab parallel cross entropy loss for 1D parallelism
:param reduction: whether to average the loss, defaults to True Args:
reduction (bool, optional): whether to average the loss, defaults to True.
:type reduction: bool, optional
""" """
def __init__(self, reduction=True): def __init__(self, reduction=True):
@ -99,10 +97,11 @@ class VocabParallelCrossEntropyLoss1D(_Loss):
self.reduction_mean = reduction self.reduction_mean = reduction
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
loss = _VocabParallelCrossEntropy1D.apply(logits, targets) loss = _VocabParallelCrossEntropy1D.apply(logits, targets)
if self.reduction_mean: if self.reduction_mean:

View File

@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@LOSSES.register_module @LOSSES.register_module
class CrossEntropyLoss2D(_Loss): class CrossEntropyLoss2D(_Loss):
""" r"""Cross entropy loss for 2D parallelism
Cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True Args:
:param args: Args for loss function reduction (bool, optional): whether to average the loss, defaults to True.
:param kwargs: Kwargs for loss function
:type reduction: bool, optional The ``args`` and ``kwargs`` should include parameters below:
::
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
""" """
def __init__(self, reduction=True, *args, **kwargs): def __init__(self, reduction=True, *args, **kwargs):
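Anything beyond ``reduction`` is forwarded to ``torch.nn.functional.cross_entropy``, so the kwargs listed above are passed at construction time; a hedged sketch (export location assumed, and the forward call needs an initialized 2D tensor-parallel mesh):

    from colossalai.nn.loss import CrossEntropyLoss2D   # assumed export location

    criterion = CrossEntropyLoss2D(reduction=True,
                                   ignore_index=-100,
                                   label_smoothing=0.1)
    # loss = criterion(logits, targets)   # targets are split across the 2D mesh internally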
@ -31,10 +39,14 @@ class CrossEntropyLoss2D(_Loss):
self.loss_kwargs = kwargs self.loss_kwargs = kwargs
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
Returns:
float: the loss between logits and targets.
""" """
targets = split_tensor_2d(targets) targets = split_tensor_2d(targets)
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs) loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
@ -116,12 +128,10 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
@LOSSES.register_module @LOSSES.register_module
class VocabParallelCrossEntropyLoss2D(_Loss): class VocabParallelCrossEntropyLoss2D(_Loss):
""" """Vocab parallel cross entropy loss for 2D parallelism.
Vocab parallel cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True Args:
reduction (bool, optional): whether to average the loss, defaults to True.
:type reduction: bool, optional
""" """
def __init__(self, reduction=True): def __init__(self, reduction=True):
@ -129,10 +139,11 @@ class VocabParallelCrossEntropyLoss2D(_Loss):
self.reduction_mean = reduction self.reduction_mean = reduction
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
targets = split_tensor_2d(targets) targets = split_tensor_2d(targets)
loss = _VocabParallelCrossEntropy2D.apply( loss = _VocabParallelCrossEntropy2D.apply(

View File

@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@LOSSES.register_module @LOSSES.register_module
class CrossEntropyLoss2p5D(_Loss): class CrossEntropyLoss2p5D(_Loss):
""" r"""Cross entropy loss for 2.5D parallelism
Cross entropy loss for 2.5D parallelism
:param reduction: whether to average the loss, defaults to True Args:
:param args: Args for loss function reduction (bool, optional): whether to average the loss, defaults to True.
:param kwargs: Kwargs for loss function
:type reduction: bool, optional The ``args`` and ``kwargs`` should include parameters below:
::
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
""" """
def __init__(self, reduction=True, *args, **kwargs): def __init__(self, reduction=True, *args, **kwargs):
super().__init__() super().__init__()
@ -30,10 +38,11 @@ class CrossEntropyLoss2p5D(_Loss):
self.loss_kwargs = kwargs self.loss_kwargs = kwargs
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
targets = split_tensor_2p5d(targets) targets = split_tensor_2p5d(targets)
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs) loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
@ -115,19 +124,19 @@ class VocabParallelCrossEntropyLoss2p5D(_Loss):
""" """
Vocab parallel cross entropy loss for 2.5D parallelism Vocab parallel cross entropy loss for 2.5D parallelism
:param reduction: whether to average the loss, defaults to True Args:
reduction (bool, optional): whether to average the loss, defaults to True.
:type reduction: bool, optional
""" """
def __init__(self, reduction=True): def __init__(self, reduction=True):
super().__init__() super().__init__()
self.reduction_mean = reduction self.reduction_mean = reduction
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
targets = split_tensor_2p5d(targets) targets = split_tensor_2p5d(targets)
loss = _VocabParallelCrossEntropy2p5D.apply(logits, targets) loss = _VocabParallelCrossEntropy2p5D.apply(logits, targets)

View File

@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@LOSSES.register_module @LOSSES.register_module
class CrossEntropyLoss3D(_Loss): class CrossEntropyLoss3D(_Loss):
""" r"""Cross entropy loss for 3D parallelism.
Cross entropy loss for 3D parallelism
:param reduction: whether to average the loss, defaults to True Args:
:param args: Args for loss function reduction (bool, optional): whether to average the loss, defaults to True.
:param kwargs: Kwargs for loss function
:type reduction: bool, optional The ``args`` and ``kwargs`` should include parameters below:
::
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
""" """
def __init__(self, reduction=True, *args, **kwargs): def __init__(self, reduction=True, *args, **kwargs):
@ -32,10 +40,11 @@ class CrossEntropyLoss3D(_Loss):
self.loss_kwargs = kwargs self.loss_kwargs = kwargs
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
targets = split_tensor_3d(targets, 0, self.input_parallel_mode) targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
@ -109,12 +118,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
@LOSSES.register_module @LOSSES.register_module
class VocabParallelCrossEntropyLoss3D(_Loss): class VocabParallelCrossEntropyLoss3D(_Loss):
""" """Vocab parallel cross entropy loss for 2D parallelism.
Vocab parallel cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True Args:
reduction (bool, optional): whether to average the loss, defaults to True.
:type reduction: bool, optional
""" """
def __init__(self, reduction=True): def __init__(self, reduction=True):
@ -125,10 +132,11 @@ class VocabParallelCrossEntropyLoss3D(_Loss):
self.reduction_mean = reduction self.reduction_mean = reduction
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate loss between logits and targets """Calculate loss between logits and targets.
:param logits: Output logits of model Args:
:param targets: True targets from data logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
""" """
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
targets = split_tensor_3d(targets, 0, self.input_parallel_mode) targets = split_tensor_3d(targets, 0, self.input_parallel_mode)

View File

@ -6,13 +6,25 @@ from colossalai.context.moe_context import MOE_CONTEXT
@LOSSES.register_module @LOSSES.register_module
class MoeCrossEntropyLoss(_Loss): class MoeCrossEntropyLoss(_Loss):
"""torch.nn.CrossEntropyLoss added with auxiliary loss. r"""torch.nn.CrossEntropyLoss added with auxiliary loss.
:param aux_weight: Weight of auxiliary loss in total loss Args:
:param args: Args in CrossEntropyLoss input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
:param kwargs: Kwargs in CrossEntropyLoss target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
        aux_weight (float, optional): Weight of auxiliary loss in total loss, defaults to 0.01.
:type aux_weight: float, optional The ``args`` and ``kwargs`` should include parameters below:
::
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
reduction (str, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
""" """
def __init__(self, aux_weight: float = 0.01, *args, **kwargs): def __init__(self, aux_weight: float = 0.01, *args, **kwargs):
@ -21,6 +33,16 @@ class MoeCrossEntropyLoss(_Loss):
self.aux_weight = aux_weight self.aux_weight = aux_weight
def forward(self, *args): def forward(self, *args):
"""
The ``args`` should at least include parameters below:
::
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
main_loss = self.loss(*args) main_loss = self.loss(*args)
aux_loss = MOE_CONTEXT.get_loss() aux_loss = MOE_CONTEXT.get_loss()
return main_loss + self.aux_weight * aux_loss return main_loss + self.aux_weight * aux_loss
@ -30,13 +52,11 @@ class MoeCrossEntropyLoss(_Loss):
class MoeLoss(_Loss): class MoeLoss(_Loss):
"""A wrapper class for any loss module to add with auxiliary loss. """A wrapper class for any loss module to add with auxiliary loss.
:param aux_weight: Weight of auxiliary loss in total loss Args:
:param loss_fn: Loss function aux_weight (float): Weight of auxiliary loss in total loss.
:param args: Args in loss function loss_fn (``Callable``): Loss function.
:param kwargs: Kwargs in loss function args (list): Args in loss function.
kwargs (dict): Kwargs in loss function
:type aux_weight: float
:type loss_fn: Callable
""" """
def __init__(self, aux_weight: float, loss_fn, *args, **kwargs): def __init__(self, aux_weight: float, loss_fn, *args, **kwargs):
@ -45,6 +65,16 @@ class MoeLoss(_Loss):
self.aux_weight = aux_weight self.aux_weight = aux_weight
def forward(self, *args, **kwargs): def forward(self, *args, **kwargs):
"""
The ``args`` and ``kwargs`` should at least include parameters below:
::
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
Note:
The ``args`` and ``kwargs`` may include different parameters varying with different loss function.
"""
main_loss = self.loss_fn(*args, **kwargs) main_loss = self.loss_fn(*args, **kwargs)
aux_loss = MOE_CONTEXT.get_loss() aux_loss = MOE_CONTEXT.get_loss()
return main_loss + self.aux_weight * aux_loss return main_loss + self.aux_weight * aux_loss
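A sketch of wrapping an ordinary loss with the MoE auxiliary term; it assumes ``MoeLoss`` instantiates ``loss_fn`` with the extra args/kwargs, as the docstring above suggests, and that both classes are exported from ``colossalai.nn.loss``:

    import torch.nn as nn
    from colossalai.nn.loss import MoeCrossEntropyLoss, MoeLoss   # assumed export location

    criterion = MoeLoss(aux_weight=0.01, loss_fn=nn.CrossEntropyLoss, label_smoothing=0.1)
    # total = criterion(logits, targets)
    # internally: loss_fn(logits, targets) + aux_weight * MOE_CONTEXT.get_loss()

    ce_with_aux = MoeCrossEntropyLoss(aux_weight=0.01, ignore_index=-100)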

View File

@ -36,14 +36,12 @@ class CosineAnnealingLR(_CosineAnnealingLR):
.. _SGDR\: Stochastic Gradient Descent with Warm Restarts: .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
https://arxiv.org/abs/1608.03983 https://arxiv.org/abs/1608.03983
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int eta_min (int, optional): Minimum learning rate, defaults to 0.
:param eta_min: Minimum learning rate, defaults to 0 last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type eta_min: int, optional the schedule is started from the beginning and the initial lr is set to lr.
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, eta_min: int = 0, last_epoch: int = -1, **kwargs): def __init__(self, optimizer, total_steps: int, eta_min: int = 0, last_epoch: int = -1, **kwargs):
@ -54,16 +52,13 @@ class CosineAnnealingLR(_CosineAnnealingLR):
class CosineAnnealingWarmupLR(WarmupScheduler): class CosineAnnealingWarmupLR(WarmupScheduler):
"""Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied. """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int warmup_steps (int, optional): Number of warmup steps, defaults to 0.
:param warmup_steps: Number of warmup steps, defaults to 0 eta_min (int, optional): Minimum learning rate, defaults to 0.
:type warmup_steps: int, optional last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:param eta_min: Minimum learning rate, defaults to 0 the schedule is started from the beginning and the initial lr is set to lr.
:type eta_min: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: float = 0., last_epoch: int = -1): def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: float = 0., last_epoch: int = -1):
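A runnable-style sketch of the warmup-plus-cosine schedule (export location assumed):

    import torch
    from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR   # assumed export location

    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    total_steps = 1000
    scheduler = CosineAnnealingWarmupLR(optimizer, total_steps=total_steps,
                                        warmup_steps=100, eta_min=1e-5)
    for step in range(total_steps):
        optimizer.step()        # ... forward/backward elided ...
        scheduler.step()        # linear ramp for 100 steps, then cosine decay down to eta_min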
@ -76,14 +71,12 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
class FlatAnnealingLR(DelayerScheduler): class FlatAnnealingLR(DelayerScheduler):
"""Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay. """Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
:param pct_start: Percent of steps before starting learning rate decay last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type pct_start: float the schedule is started from the beginning and the initial lr is set to lr.
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, pct_start: float = 0.72, last_epoch: int = -1, **kwargs): def __init__(self, optimizer, total_steps: int, pct_start: float = 0.72, last_epoch: int = -1, **kwargs):
@ -102,18 +95,14 @@ class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
applied, and then the learning rate will be a fixed value before starting decay. applied, and then the learning rate will be a fixed value before starting decay.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int warmup_steps (int, optional): Number of warmup steps, defaults to 0.
:param warmup_steps: Number of warmup steps, defaults to 0 pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
:type warmup_steps: int, optional eta_min (int, optional): Minimum learning rate, defaults to 0.
:param pct_start: Percent of steps before starting learning rate decay last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type pct_start: float the schedule is started from the beginning and the initial lr is set to lr.
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, pct_start: float = 0.72, eta_min: int = 0, def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, pct_start: float = 0.72, eta_min: int = 0,

View File

@ -14,16 +14,15 @@ class _enable_get_lr_call:
class DelayerScheduler(_LRScheduler): class DelayerScheduler(_LRScheduler):
""" Starts with a flat lr schedule until it reaches N epochs the applies a scheduler """Starts with a flat lr schedule until it reaches N epochs then applies
    the specific scheduler (For example: ReduceLROnPlateau).
:param optimizer: Wrapped optimizer. Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
:type delay_epochs: int after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau) last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type after_scheduler: torch.optim.lr_scheduler the schedule is started from the beginning and the initial lr is set to lr.
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, delay_epochs, after_scheduler, last_epoch=-1): def __init__(self, optimizer, delay_epochs, after_scheduler, last_epoch=-1):
@ -57,16 +56,15 @@ class DelayerScheduler(_LRScheduler):
class WarmupScheduler(_LRScheduler): class WarmupScheduler(_LRScheduler):
""" Starts with a linear warmup lr schedule until it reaches N epochs the applies a scheduler """Starts with a linear warmup lr schedule until it reaches N epochs then applies
the specific scheduler (For example: ReduceLROnPlateau).
:param optimizer: Wrapped optimizer. Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
:type warmup_epochs: int after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau) last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type after_scheduler: torch.optim.lr_scheduler the schedule is started from the beginning and the initial lr is set to lr.
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1): def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1):
@ -97,18 +95,16 @@ class WarmupScheduler(_LRScheduler):
class WarmupDelayerScheduler(_LRScheduler): class WarmupDelayerScheduler(_LRScheduler):
""" Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule until it reaches M epochs the applies a scheduler """Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule
until it reaches M epochs then applies the specific scheduler (For example: ReduceLROnPlateau).
:param optimizer: Wrapped optimizer. Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
:type warmup_epochs: int delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
:type delay_epochs: int last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau) the schedule is started from the beginning and the initial lr is set to lr.
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, warmup_epochs, delay_epochs, after_scheduler, last_epoch=-1): def __init__(self, optimizer, warmup_epochs, delay_epochs, after_scheduler, last_epoch=-1):
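The wrapper schedulers compose with a plain ``torch.optim.lr_scheduler`` instance built on the same optimizer; a sketch (module path assumed):

    import torch
    from torch.optim.lr_scheduler import CosineAnnealingLR
    from colossalai.nn.lr_scheduler.delayed import WarmupScheduler   # assumed module path

    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    after = CosineAnnealingLR(optimizer, T_max=95)                   # takes over once warmup ends
    scheduler = WarmupScheduler(optimizer, warmup_epochs=5, after_scheduler=after)
    for epoch in range(100):
        optimizer.step()        # ... one epoch of training elided ...
        scheduler.step()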

View File

@ -5,16 +5,14 @@ from colossalai.registry import LR_SCHEDULERS
@LR_SCHEDULERS.register_module @LR_SCHEDULERS.register_module
class LinearWarmupLR(_LRScheduler): class LinearWarmupLR(_LRScheduler):
"""Linearly warmup learning rate and then linearly decay """Linearly warmup learning rate and then linearly decay.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int warmup_steps (int, optional): Number of warmup steps, defaults to 0
:param warmup_steps: Number of warmup steps, defaults to 0 last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type warmup_steps: int, optional the schedule is started from the beginning and the initial lr is set to lr.
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, last_epoch: int = -1, **kwargs): def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, last_epoch: int = -1, **kwargs):

View File

@ -13,18 +13,13 @@ class MultiStepLR(_MultiStepLR):
happen simultaneously with other changes to the learning rate from outside happen simultaneously with other changes to the learning rate from outside
this scheduler. When last_epoch=-1, sets initial lr as lr. this scheduler. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
:param milestones: List of epoch indices. Must be increasing, defaults to None gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
:type milestones: List[int], optional last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1 the schedule is started from the beginning and the initial lr is set to lr.
:type gamma: float, optional
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1, last_epoch: int = -1, **kwargs): def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1, last_epoch: int = -1, **kwargs):
@ -33,22 +28,17 @@ class MultiStepLR(_MultiStepLR):
@LR_SCHEDULERS.register_module @LR_SCHEDULERS.register_module
class MultiStepWarmupLR(WarmupScheduler): class MultiStepWarmupLR(WarmupScheduler):
"""Multi-step laerning rate scheduler with warmup. """Multistep learning rate scheduler with warmup.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int warmup_steps (int, optional): Number of warmup steps, defaults to 0.
:param warmup_steps: Number of warmup steps, defaults to 0 milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
:type warmup_steps: int, optional gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
:param milestones: List of epoch indices. Must be increasing, defaults to None num_steps_per_epoch (int, optional): Number of steps per epoch, defaults to -1.
:type milestones: List[int], optional last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1 the schedule is started from the beginning and the initial lr is set to lr.
:type gamma: float, optional
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, milestones: List[int] = None, def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, milestones: List[int] = None,

View File

@ -28,43 +28,41 @@ class OneCycleLR(_OneCycleLR):
claims that "unpublished work has shown even better results by using only two phases". To claims that "unpublished work has shown even better results by using only two phases". To
mimic the behaviour of the original paper instead, set ``three_phase=True``. mimic the behaviour of the original paper instead, set ``three_phase=True``.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int pct_start (float, optional):
:param pct_start: The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3 The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3.
:type pct_start: float, optional anneal_strategy (str, optional): {'cos', 'linear'}, Specifies the annealing strategy:
:param anneal_strategy: {'cos', 'linear'} "cos" for cosine annealing, "linear" for linear annealing, defaults to 'cos'.
Specifies the annealing strategy: "cos" for cosine annealing, "linear" for cycle_momentum (bool, optional): If ``True``, momentum is cycled inversely
linear annealing, defaults to 'cos' to learning rate between 'base_momentum' and 'max_momentum', defaults to True.
:type anneal_strategy: str, optional base_momentum (float, optional): Lower momentum boundaries in the cycle for each parameter group.
:param cycle_momentum: If ``True``, momentum is cycled inversely Note that momentum is cycled inversely to learning rate; at the peak of a cycle, momentum is
to learning rate between 'base_momentum' and 'max_momentum', defaults to True 'base_momentum' and learning rate is 'max_lr', defaults to 0.85.
:type cycle_momentum: bool, optional max_momentum (float, optional): Upper momentum boundaries in the cycle for each parameter group.
:param base_momentum: Lower momentum boundaries in the cycle Functionally, it defines the cycle amplitude (max_momentum - base_momentum).
for each parameter group. Note that momentum is cycled inversely Note that momentum is cycled inversely to learning rate; at the start of a cycle, momentum is 'max_momentum'
to learning rate; at the peak of a cycle, momentum is and learning rate is 'base_lr', defaults to 0.95.
'base_momentum' and learning rate is 'max_lr', defaults to 0.85 div_factor (float, optional): Determines the initial learning rate via
:type base_momentum: float, optional initial_lr = max_lr/div_factor, defaults to 25.0.
:param max_momentum: Upper momentum boundaries in the cycle final_div_factor (float, optional): Determines the minimum learning rate via
for each parameter group. Functionally, min_lr = initial_lr/final_div_factor, defaults to 10000.0.
it defines the cycle amplitude (max_momentum - base_momentum). last_epoch (int, optional): The index of the last batch. This parameter is used when resuming a training job.
Note that momentum is cycled inversely Since `step()` should be invoked after each batch instead of after each epoch, this number represents
to learning rate; at the start of a cycle, momentum is 'max_momentum' the total number of *batches* computed, not the total number of epochs computed.
and learning rate is 'base_lr', defaults to 0.95 When last_epoch=-1, the schedule is started from the beginning, defaults to -1
:type max_momentum: float, optional
:param div_factor: Determines the initial learning rate via The ``kwargs`` for initializing torch.optim.lr_scheduler.OneCycleLR should include parameters below:
initial_lr = max_lr/div_factor, defaults to 25.0 ::
:type div_factor: float, optional
:param final_div_factor: Determines the minimum learning rate via epochs (int, optional, default=None)
min_lr = initial_lr/final_div_factor, defaults to 10000.0 steps_per_epoch (int, optional, default=None)
:type final_div_factor: float, optional three_phase (bool, optional, default=False)
:param last_epoch: The index of the last batch. This parameter is used when verbose (bool, optional, default=False)
resuming a training job. Since `step()` should be invoked after each
batch instead of after each epoch, this number represents the total More details about kwargs could be found in
number of *batches* computed, not the total number of epochs computed. `OneCycleLR <https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.OneCycleLR.html#torch.optim.lr_scheduler.OneCycleLR>`_.
When last_epoch=-1, the schedule is started from the beginning, defaults to -1
:type last_epoch: int, optional
.. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates: .. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
https://arxiv.org/abs/1708.07120 https://arxiv.org/abs/1708.07120
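A minimal sketch of driving this scheduler per batch, using only the parameters documented above; the import path and the optimizer are assumptions, and the extra ``kwargs`` (epochs, steps_per_epoch, three_phase, verbose) are left at their defaults.
>>> import torch
>>> from colossalai.nn.lr_scheduler import OneCycleLR   # assumed import path
>>> model = torch.nn.Linear(16, 16)
>>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
>>> scheduler = OneCycleLR(optimizer, total_steps=1000, pct_start=0.3,
...                        anneal_strategy='cos', div_factor=25.0, final_div_factor=10000.0)
>>> for _ in range(1000):
...     optimizer.step()
...     scheduler.step()      # called once per batch, not once per epoch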
@ -8,16 +8,13 @@ from .delayed import WarmupScheduler
class PolynomialLR(_LRScheduler): class PolynomialLR(_LRScheduler):
"""Polynomial learning rate scheduler. """Polynomial learning rate scheduler.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
:param end_lr: Minimum learning rate, defaults to 0.0001 power (float, optional): The power of polynomial, defaults to 1.0.
:type end_lr: float, optional last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:param power: The power of polynomial, defaults to 1.0 the schedule is started from the beginning and the initial lr is set to lr.
:type power: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, end_lr: float = 0.0001, power: float = 1.0, last_epoch: int = -1, def __init__(self, optimizer, total_steps: int, end_lr: float = 0.0001, power: float = 1.0, last_epoch: int = -1,
@ -44,18 +41,14 @@ class PolynomialLR(_LRScheduler):
class PolynomialWarmupLR(WarmupScheduler): class PolynomialWarmupLR(WarmupScheduler):
"""Polynomial learning rate scheduler with warmup. """Polynomial learning rate scheduler with warmup.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int warmup_steps (int, optional): Number of warmup steps, defaults to 0.
:param warmup_steps: Number of warmup steps, defaults to 0 end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
:type warmup_steps: int, optional power (float, optional): The power of polynomial, defaults to 1.0.
:param end_lr: Minimum learning rate, defaults to 0.0001 last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
:type end_lr: float, optional the schedule is started from the beginning and the initial lr is set to lr.
:param power: The power of polynomial, defaults to 1.0
:type power: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, end_lr: float = 0.0001, power: float = 1.0, def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, end_lr: float = 0.0001, power: float = 1.0,
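A brief construction sketch reusing the optimizer from the scheduler examples above (the class is assumed to be importable from ``colossalai.nn.lr_scheduler``):
>>> scheduler = PolynomialWarmupLR(optimizer, total_steps=1000, warmup_steps=100,
...                                end_lr=1e-5, power=2.0)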
@ -11,16 +11,13 @@ class LambdaLR(_LambdaLR):
"""Sets the learning rate of each parameter group to the initial lr """Sets the learning rate of each parameter group to the initial lr
times a given function. When last_epoch=-1, sets initial lr as lr. times a given function. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
:param lr_lambda: A function which computes a multiplicative factor given an integer parameter epoch, or a list of such functions,
factor given an integer parameter epoch, or a list of such one for each group in optimizer.param_groups, defaults to None.
functions, one for each group in optimizer.param_groups, defaults to None last_epoch (int, optional): The index of last epoch, defaults to -1.
:type lr_lambda: function or list, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None: def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
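For instance, a hypothetical sketch reusing the optimizer from the earlier scheduler examples; the lambda below simply halves the factor at each step.
>>> scheduler = LambdaLR(optimizer, total_steps=1000,
...                      lr_lambda=lambda epoch: 0.5 ** epoch)
>>> scheduler.step()   # lr becomes base_lr * 0.5 ** 1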
@ -30,18 +27,15 @@ class LambdaLR(_LambdaLR):
@LR_SCHEDULERS.register_module @LR_SCHEDULERS.register_module
class MultiplicativeLR(_MultiplicativeLR): class MultiplicativeLR(_MultiplicativeLR):
"""Multiply the learning rate of each parameter group by the factor given """Multiply the learning rate of each parameter group by the factor given
in the specified function. When last_epoch=-1, sets initial lr as lr in the specified function. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
:param lr_lambda: A function which computes a multiplicative factor given an integer parameter epoch, or a list of such functions,
factor given an integer parameter epoch, or a list of such one for each group in optimizer.param_groups, defaults to None.
functions, one for each group in optimizer.param_groups, defaults to None last_epoch (int, optional): The index of last epoch, defaults to -1.
:type lr_lambda: function or list, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None: def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
@ -53,18 +47,14 @@ class StepLR(_StepLR):
"""Decays the learning rate of each parameter group by gamma every """Decays the learning rate of each parameter group by gamma every
step_size epochs. Notice that such decay can happen simultaneously with step_size epochs. Notice that such decay can happen simultaneously with
other changes to the learning rate from outside this scheduler. When other changes to the learning rate from outside this scheduler. When
last_epoch=-1, sets initial lr as lr last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int step_size (int, optional): Period of learning rate decay, defaults to 1.
:param step_size: Period of learning rate decay, defaults to 1 gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
:type step_size: int, optional last_epoch (int, optional): The index of last epoch, defaults to -1.
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, last_epoch: int = -1) -> None: def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, last_epoch: int = -1) -> None:
@ -77,14 +67,11 @@ class ExponentialLR(_ExponentialLR):
"""Decays the learning rate of each parameter group by gamma every epoch. """Decays the learning rate of each parameter group by gamma every epoch.
When last_epoch=-1, sets initial lr as lr When last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer Args:
:type optimizer: torch.optim.Optimizer optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Wrapped optimizer.
:param total_steps: Number of total training steps total_steps (int): Number of total training steps.
:type total_steps: int gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 1.0.
:param gamma: Multiplicative factor of learning rate decay, defaults to 1.0 last_epoch (int, optional): The index of last epoch, defaults to -1.
:type gamma: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
""" """
def __init__(self, optimizer, total_steps, gamma: float = 1.0, def __init__(self, optimizer, total_steps, gamma: float = 1.0,
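As a quick sketch (reusing the optimizer from the earlier scheduler examples; a comparable call works for ``ExponentialLR`` with just ``gamma``):
>>> scheduler = StepLR(optimizer, total_steps=1000, step_size=30, gamma=0.1)
>>> scheduler.step()   # after 30 such steps the lr is multiplied by 0.1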
@ -14,8 +14,12 @@ class Accuracy2D(nn.Module):
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels. """Calculate the accuracy of predicted labels.
:param logits: Predicted labels Args:
:param targets: True labels from data logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
""" """
with torch.no_grad(): with torch.no_grad():
targets = split_tensor_2d(targets) targets = split_tensor_2d(targets)
@ -14,8 +14,12 @@ class Accuracy2p5D(nn.Module):
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels. """Calculate the accuracy of predicted labels.
:param logits: Predicted labels Args:
:param targets: True labels from data logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
""" """
with torch.no_grad(): with torch.no_grad():
targets = split_tensor_2p5d(targets) targets = split_tensor_2p5d(targets)
@ -18,8 +18,12 @@ class Accuracy3D(nn.Module):
def forward(self, logits, targets): def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels. """Calculate the accuracy of predicted labels.
:param logits: Predicted labels Args:
:param targets: True labels from data logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
""" """
with torch.no_grad(): with torch.no_grad():
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
@ -9,11 +9,10 @@ class Registry:
"""This is a registry class used to register classes and modules so that a universal """This is a registry class used to register classes and modules so that a universal
object builder can be enabled. object builder can be enabled.
:param name: The name of the registry Args:
:type name: str name (str): The name of the registry.
:param third_party_library: List of third party libraries which are used in the third_party_library (list, optional):
initialization of the register module List of third party libraries which are used in the initialization of the register module.
:type third_party_library: list, optional
""" """
def __init__(self, name: str, third_party_library: List[ModuleType] = None): def __init__(self, name: str, third_party_library: List[ModuleType] = None):
@ -28,12 +27,12 @@ class Registry:
def register_module(self, module_class): def register_module(self, module_class):
"""Registers a module represented in `module_class`. """Registers a module represented in `module_class`.
:param module_class: The module to be registered Args:
:type module_class: class module_class (class): The module to be registered.
:raises AssertionError: Raises an AssertionError if the module has already been Returns:
registered before class: The module to be registered, so that it can be used normally via importing.
:return: The module to be registered, so as to use it normally if via importing Raises:
:rtype: class AssertionError: Raises an AssertionError if the module has already been registered before.
""" """
module_name = module_class.__name__ module_name = module_class.__name__
assert module_name not in self._registry assert module_name not in self._registry
@ -46,12 +45,13 @@ class Registry:
"""Retrieves a module with name `module_name` and returns the module if it has """Retrieves a module with name `module_name` and returns the module if it has
already been registered before. already been registered before.
:param module_name: The name of the module to be retrieved Args:
:type module_name: str module_name (str): The name of the module to be retrieved.
:raises NameError: Raises a NameError if the module to be retrieved has neither been Returns:
registered directly nor as third party modules before :class:`object`: The retrieved module or None.
:return: The retrieved module or None Raises:
:rtype: :class:`object` NameError: Raises a NameError if the module to be retrieved has neither been
registered directly nor as third party modules before.
""" """
if module_name in self._registry: if module_name in self._registry:
return self._registry[module_name] return self._registry[module_name]
@ -65,11 +65,11 @@ class Registry:
"""Searches for a module with name `module_name` and returns a boolean value indicating """Searches for a module with name `module_name` and returns a boolean value indicating
whether the module has been registered directly or as third party modules before. whether the module has been registered directly or as third party modules before.
:param module_name: The name of the module to be searched for Args:
:type module_name: str module_name (str): The name of the module to be searched for.
:return: A boolean value indicating whether the module has been registered directly or Returns:
as third party modules before bool: A boolean value indicating whether the module has been registered directly or
:rtype: bool as third party modules before.
""" """
found_flag = module_name in self._registry found_flag = module_name in self._registry
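A hedged illustration of the register/retrieve cycle described above. The registry name and the ``TinyNet`` class are made up, and the retrieval method is assumed to be ``get_module`` (only ``register_module`` is named explicitly in this diff).
>>> from colossalai.registry import Registry    # assumed import path
>>> MY_MODELS = Registry('my_models')
>>> @MY_MODELS.register_module
... class TinyNet:
...     pass
>>> MY_MODELS.get_module('TinyNet') is TinyNet  # assumed retrieval method name
True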
@ -17,18 +17,46 @@ from colossalai.trainer.hooks import BaseHook
class Trainer: class Trainer:
"""This a class tending for easy deployments of users' training and evaluation instead of r"""This is a class tending for easy deployments of users' training and evaluation instead of
writing their own scripts. It is similar with ``ignite.engine`` and ``keras.engine``, but is writing their own scripts. It is similar with ``ignite.engine`` and ``keras.engine``, but is
called `Trainer`. called `Trainer`.
:param engine: Engine responsible for the process function Args:
:type engine: :class:`Engine` engine (:class:`Engine`): Engine responsible for the process function.
:param schedule: Schedule responsible for forward and backward steps schedule (:class:`BaseSchedule`, optional): Schedule responsible for forward and backward steps.
:type schedule: :class:`BaseSchedule`, optional timer (:class:`MultiTimer`, optional): Timer used to monitor the whole training.
:param timer: Timer used to monitor the whole training logger (:class:`colossalai.logging.DistributedLogger`, optional): Logger used to record the whole training log.
:type timer: :class:`MultiTimer`, optional
:param logger: Logger used to record the whole training Note:
:type logger: :class:`colossalai.logging.DistributedLogger`, optional when `schedule` is None, the ``NonPipelineSchedule`` will be used. If you would like to use pipeline,
you should choose ``PipelineSchedule`` or ``InterleavedPipelineSchedule`` for the `schedule`.
Examples:
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
>>> model = ...
>>> criterion = ...
>>> optimizer = ...
>>> train_dataloader = ...
>>> # Initialize your engine, train_dataloader, test_dataloader, lr_scheduler
>>> engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> # Beginning training progress
>>> timer = ...
>>> logger = ...
>>> trainer = Trainer(engine=engine, logger=logger, schedule=schedule, timer=timer)
>>> # add hooks you would like to use here.
>>> hook_list = []
>>> trainer.fit(
>>> train_dataloader=train_dataloader,
>>> epochs=gpc.config.NUM_EPOCHS,
>>> test_interval=1,
>>> hooks=hook_list,
>>> display_progress=True,
>>> return_output_label=False
>>> )
More examples and details could be found in
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_
and `ColossalAI-Examples <https://github.com/hpcaitech/ColossalAI-Examples/tree/main>`_.
""" """
def __init__( def __init__(
self, self,
@ -108,20 +136,19 @@ class Trainer:
def _set_current_step(self, epoch: int): def _set_current_step(self, epoch: int):
"""Sets current step number. """Sets current step number.
:param epoch: Step number to be set Args:
:type epoch: int epoch (int): Step number to be set.
""" """
self._cur_step = epoch * self._steps_per_epoch self._cur_step = epoch * self._steps_per_epoch
def _call_timer(self, action: str, item: str, *args, **kwargs) -> None: def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
"""Call timer funciton with a given timer name. """Call timer funciton with a given timer name.
:param action: Function to be called on timer Args:
:type action: str action (str): Function to be called on timer.
:param item: Name of the timer item (str): Name of the timer.
:type item: str args (list): args used for action function.
:param args: args used for action function kwargs (dict): kwargs used for action function.
:param kwargs: kwargs used for action function
""" """
if self._timer is not None: if self._timer is not None:
@ -134,10 +161,9 @@ class Trainer:
def _call_hooks(self, func, output=None): def _call_hooks(self, func, output=None):
"""Calls specific hooks in the current time point. """Calls specific hooks in the current time point.
:param func: A string represents the time point Args:
:param output: Output of the model after running a iteration or None in any other time points func (str): A string representing the time point.
:type func: str output (Any, optional): Output of the model after running an iteration or None at any other time point.
:type output: optional
""" """
# Only after iter hook will receive output # Only after iter hook will receive output
for hook in self.hooks: for hook in self.hooks:
@ -273,25 +299,17 @@ class Trainer:
display_progress: bool = False, display_progress: bool = False,
return_output_label: bool = True, return_output_label: bool = True,
): ):
"""Trains the model to fit training data. r"""Trains the model to fit training data.
:param train_dataloader: DataLoader in training Args:
:param epochs: Maximum number of epoches train_dataloader (:class:`torch.utils.data.DataLoader`): DataLoader for training.
:param max_steps: Maximum number of running iterations epochs (int): Maximum number of epochs.
:param test_dataloader: DataLoader in testing max_steps (int, optional): Maximum number of running iterations.
:param test_interval: Interval of testing test_dataloader (:class:`torch.utils.data.DataLoader`, optional): DataLoader for validation.
:param hooks: A list of hooks used in training test_interval (int, optional): Interval of validation.
:param display_progress: If True, the training progress will be printed hooks (list[`BaseHook <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/trainer/hooks>`_],
:param return_output_label: If True, the output of model and the label will be returned optional): A list of hooks used in training.
display_progress (bool, optional): If True, a progress bar will be displayed.
return_output_label (bool, optional): If True, the output of model and the label will be returned, defaults to True.
:type train_dataloader: DataLoader
:type epochs: int
:type max_steps: int, optional
:type test_dataloader: DataLoader, optional
:type test_interval: int, optional
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool, optional
""" """
# set epochs and steps, consider gradient accumulation # set epochs and steps, consider gradient accumulation
@ -374,15 +392,12 @@ class Trainer:
): ):
"""Evaluates the model with testing data. """Evaluates the model with testing data.
:param test_dataloader: DataLoader in testing Args:
:param hooks: A list of hooks used in evaluation test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
:param display_progress: If True, the evaluation progress will be printed hooks (list, optional): A list of hooks used in evaluation. Defaults to None.
:param return_output_label: If True, the output of model and the label will be returned display_progress (bool, optional): If True, the evaluation progress will be printed. Defaults to False.
return_output_label (bool, optional): If True, the output of model and the label
:type test_dataloader: DataLoader will be returned. Defaults to True.
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool
""" """
# set display # set display
display_progress = self._should_display_progress(display_progress) display_progress = self._should_display_progress(display_progress)
@ -418,10 +433,11 @@ class Trainer:
def predict(self, data: Union[Tensor, List[Tensor]]): def predict(self, data: Union[Tensor, List[Tensor]]):
"""Uses trained model to make a prediction for a tensor or a tensor list. """Uses trained model to make a prediction for a tensor or a tensor list.
:param data: Data as the input Args:
:type data: Union[Tensor, List[Tensor] data (Union[:class:`torch.tensor`, List[:class:`torch.tensor`]]): Data as the input.
:return: The output of model as the prediction
:rtype: Tensor Returns:
:class:`torch.tensor`: The output of model as the prediction
""" """
# predict without labels # predict without labels
if isinstance(data, (list, tuple)): if isinstance(data, (list, tuple)):
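A hypothetical continuation of the ``Examples`` block in the docstring above, showing evaluation and single-sample prediction (the tensor shape is made up):
>>> trainer.evaluate(test_dataloader=test_dataloader,
...                  hooks=hook_list,
...                  display_progress=True)
>>> sample = torch.rand(1, 3, 224, 224)
>>> prediction = trainer.predict(sample)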
@ -40,14 +40,11 @@ class BaseHook(ABC):
def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor): def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
"""Actions after running a training iteration. """Actions after running a training iteration.
:param trainer: Trainer which is using this hook Args:
:type trainer: :class:`Trainer` trainer (:class:`Trainer`): Trainer which is using this hook.
:param output: Output of the model output (:class:`torch.Tensor`): Output of the model.
:type output: torch.Tensor label (:class:`torch.Tensor`): Labels of the input data.
:param label: Labels of the input data loss (:class:`torch.Tensor`): Loss between the output and input data.
:type label: torch.Tensor
:param loss: Loss between the output and input data
:type loss: torch.Tensor
""" """
pass pass
@ -89,24 +86,21 @@ class BaseHook(ABC):
def after_test_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor): def after_test_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
"""Actions after running a testing iteration. """Actions after running a testing iteration.
:param trainer: Trainer which is using this hook Args:
:type trainer: :class:`Trainer` trainer (:class:`Trainer`): Trainer which is using this hook.
:param output: Output of the model output (:class:`torch.Tensor`): Output of the model.
:type output: Tensor label (:class:`torch.Tensor`): Labels of the input data.
:param label: Labels of the input data loss (:class:`torch.Tensor`): Loss between the output and input data.
:type label: Tensor
:param loss: Loss between the output and input data
:type loss: Tensor
""" """
pass pass
def init_runner_states(self, trainer, key, val): def init_runner_states(self, trainer, key, val):
"""Initializes trainer's state. """Initializes trainer's state.
:param trainer: Trainer which is using this hook Args:
:type trainer: :class:`Trainer` trainer (:class:`Trainer`): Trainer which is using this hook
:param key: Key of reseting state key: Key of state to be reset
:param val: Value of reseting state val: Value of state to be reset
""" """
if key not in trainer.states: if key not in trainer.states:
trainer.states[key] = val trainer.states[key] = val
@ -16,14 +16,13 @@ from ._lr_scheduler_hook import LRSchedulerHook
class SaveCheckpointHook(BaseHook): class SaveCheckpointHook(BaseHook):
"""Saves the model by interval in training process. """Saves the model by interval in training process.
:param interval: Saving interval, defaults to 1 Args:
:type interval: int, optional interval (int, optional): Saving interval, defaults to 1.
:param checkpoint_dir: Directory of saving checkpoint, defaults to None checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
:type checkpoint_dir: str, optional suffix (str, optional): Saving suffix of the file, defaults to ''.
:param suffix: Saving suffix of the file, defaults to '' priority (int, optional): Priority in the printing, hooks with small priority will be printed in front
:type suffix: str, optional defaults to 10. If different hooks share same priority, the order of printing would
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 depend on the hooks order in the hook list.
:type priority: int, optional
""" """
def __init__(self, def __init__(self,
@ -71,18 +70,17 @@ class SaveCheckpointHook(BaseHook):
class LoadCheckpointHook(BaseHook): class LoadCheckpointHook(BaseHook):
"""Loads the model before training process. """Loads the model before training process.
:param checkpoint_dir: Directory of saving checkpoint, defaults to None Args:
:type checkpoint_dir: str, optional checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
:param epoch: Epoch number to be set, defaults to -1 epoch (str, optional): Loading checkpoint of setting epoch numbers, defaults to -1.
:type epoch: str, optional Epoch equals to -1 means choosing the latest checkpoint.
:param finetune: Whether allows to load a part of the model, defaults to False finetune (bool, optional): Whether allows to load a part of the model, defaults to False.
:type finetune: bool, optional strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint
:param strict: Whether loads a model that has the same shape of parameters, defaults to False match the names of parameters and buffers in model, defaults to False.
:type strict: bool, optional suffix (str, optional): Suffix of checkpoint file path, defaults to ''.
:param suffix: Suffic, defaults to '' priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:type suffix: str, optional defaults to 0. If different hooks share same priority, the order of printing would
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0 depend on the hooks order in the hook list.
:type priority: int, optional
""" """
def __init__(self, def __init__(self,
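A sketch of how these two hooks might be combined in a hook list; the ``colossalai.trainer.hooks`` namespace import, the directory, and the interval values are assumptions.
>>> from colossalai.trainer import hooks   # assumed namespace import
>>> hook_list = [
...     hooks.LoadCheckpointHook(checkpoint_dir='./ckpt', epoch=-1, strict=False),
...     hooks.SaveCheckpointHook(interval=1, checkpoint_dir='./ckpt'),
... ]
>>> trainer.fit(train_dataloader=train_dataloader, epochs=10, hooks=hook_list)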
@ -25,13 +25,14 @@ def _format_number(val, prec=5):
class LogByEpochHook(BaseHook): class LogByEpochHook(BaseHook):
"""Hook to log by epoch """Hook to log by epoch.
:param logger: Logger for the log Args:
:param interval: Recording interval, defaults to 1 logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
:type interval: int, optional interval (int, optional): Interval of printing log information, defaults to 1.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1 priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:type priority: int, optional defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, def __init__(self,
@ -48,10 +49,12 @@ class LogByEpochHook(BaseHook):
@HOOKS.register_module @HOOKS.register_module
class LogMetricByStepHook(BaseHook): class LogMetricByStepHook(BaseHook):
"""Hook to log metric by step """Hook to log metric by step.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 Args:
:type priority: int, optional priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, priority: int = 10): def __init__(self, priority: int = 10):
@ -74,11 +77,12 @@ class LogMetricByStepHook(BaseHook):
class LogMetricByEpochHook(LogByEpochHook): class LogMetricByEpochHook(LogByEpochHook):
"""Specialized hook to record the metric to log. """Specialized hook to record the metric to log.
:param logger: Logger for the log Args:
:param interval: Recording interval, defaults to 1 logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
:type interval: int, optional interval (int, optional): Interval of printing log information, defaults to 1.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:type priority: int, optional defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, def __init__(self,
@ -116,14 +120,14 @@ class LogMetricByEpochHook(LogByEpochHook):
class TensorboardHook(BaseHook): class TensorboardHook(BaseHook):
"""Specialized hook to record the metric to Tensorboard. """Specialized hook to record the metric to Tensorboard.
:param log_dir: Directory of log Args:
:type log_dir: str log_dir (str): Directory of log.
:param ranks: Ranks of processors ranks (list): Ranks of processors.
:type ranks: typing.List parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer,
:param parallel_mode: Parallel mode, defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL.
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`, optional priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 defaults to 10. If different hooks share same priority, the order of printing would
:type priority: int, optional depend on the hooks order in the hook list.
""" """
def __init__(self, def __init__(self,
@ -200,18 +204,15 @@ class TensorboardHook(BaseHook):
class LogTimingByEpochHook(LogByEpochHook): class LogTimingByEpochHook(LogByEpochHook):
"""Specialized hook to write timing record to log. """Specialized hook to write timing record to log.
:param timer: Timer for the hook Args:
:type timer: :class:`colossalai.utils.MultiTimer` timer (:class:`colossalai.utils.MultiTimer`): Timer for the hook.
:param logger: Logger for the log logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
:type logger: :class:`colossalai.logging.DistributedLogger` interval (int, optional): Interval of printing log information, defaults to 1.
:param interval: Recording interval, defaults to 1 priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:type interval: int, optional defaults to 10. If different hooks share same priority, the order of printing would
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 depend on the hooks order in the hook list.
:type priority: int, optional log_eval (bool, optional): Whether writes in evaluation, defaults to True.
:param log_eval: Whether writes in evaluation, defaults to True ignore_num_train_steps (int, optional): Number of training steps to ignore, defaults to 0.
:type log_eval: bool, optional
:param ignore_num_train_steps: Number of training steps to ignore, defaults to 0
:type ignore_num_train_steps: int, optional
""" """
def __init__(self, def __init__(self,
@ -270,14 +271,13 @@ class LogTimingByEpochHook(LogByEpochHook):
class LogMemoryByEpochHook(LogByEpochHook): class LogMemoryByEpochHook(LogByEpochHook):
"""Specialized Hook to write memory usage record to log. """Specialized Hook to write memory usage record to log.
:param logger: Logger for the log Args:
:type logger: colossalai.logging.DistributedLogger logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
:param interval: Recording interval, defaults to 1 interval (int, optional): Interval of printing log information, defaults to 1.
:type interval: int, optional priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10 defaults to 1. If different hooks share same priority, the order of printing would
:type priority: int, optional depend on the hooks order in the hook list.
:param log_eval: Whether writes in evaluation, defaults to True log_eval (bool, optional): Whether writes in evaluation, defaults to True.
:type log_eval: bool, optional
""" """
def __init__(self, def __init__(self,
@ -6,15 +6,17 @@ from ._metric_hook import LearningRateMetric, MetricHook
@HOOKS.register_module @HOOKS.register_module
class LRSchedulerHook(MetricHook): class LRSchedulerHook(MetricHook):
"""Build LR scheduler r"""Build LR scheduler for trainer.
:param lr_scheduler: LR scheduler Args:
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch lr_scheduler (:class:`colossalai.nn.lr_scheduler`): The specific LR scheduler
:type by_epoch: bool in range of ``colossalai.nn.lr_scheduler``, more details about ``lr_scheduler`` could be found in
:param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True` `lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_.
:type store_lr_in_state: bool, optional by_epoch (bool): If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1 store_lr_in_state (bool, optional): If `True`, store the learning rate in each state, defaults to `True`.
:type priority: int, optional priority (int, optional): Priority in the printing, hooks with small priority will be printed in front
defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__( def __init__(
self, self,
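For instance, a hedged sketch pairing this hook with one of the schedulers documented earlier in this commit; the import paths and the optimizer/hook_list names carried over from the Trainer example are assumptions.
>>> from colossalai.nn.lr_scheduler import MultiStepLR   # assumed import path
>>> from colossalai.trainer import hooks                 # assumed namespace import
>>> lr_scheduler = MultiStepLR(optimizer, total_steps=1000, milestones=[400, 800])
>>> hook_list.append(hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False))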
@ -17,13 +17,13 @@ from ._base_hook import BaseHook
class Metric(ABC): class Metric(ABC):
"""A basic class of metric collectors. It collects a specific """A basic class of metric collectors. It collects a specific
metric during training or evaluation and it's always used with metric during training or evaluation and would always be used with
:class:`MetricHook` to help it update its states and show the :class:`MetricHook` to help it update its states and show the
metric. So please use corresponding hook class to make the metric metric. So please use the corresponding hook class to make the metric
collector works. collector work.
:param epoch_only: Whether the metric only read for the full epoch Args:
:type epoch_only: bool epoch_only (bool): Whether the metric is only read for the full epoch.
""" """
def __init__(self, epoch_only: bool): def __init__(self, epoch_only: bool):
@ -80,8 +80,8 @@ class Metric(ABC):
class LossMetric(Metric): class LossMetric(Metric):
"""A metric collector for loss. """A metric collector for loss.
:param epoch_only: Whether the metric only read for the full epoch Args:
:type epoch_only: bool epoch_only (bool): Whether the metric is only read for the full epoch.
""" """
def __init__(self, epoch_only): def __init__(self, epoch_only):
@ -101,7 +101,8 @@ class LossMetric(Metric):
"""Updates :attr:`last_step_loss` and :attr:`accum_loss` with current loss. """Updates :attr:`last_step_loss` and :attr:`accum_loss` with current loss.
It expects the output has loss. It expects the output has loss.
:param loss: Current loss of the output Args:
loss (:class:`torch.tensor`): Current loss of the output.
""" """
# expect output to be logits, label and loss # expect output to be logits, label and loss
loss_ = loss.detach() loss_ = loss.detach()
@ -132,10 +133,9 @@ class LossMetric(Metric):
class LearningRateMetric(Metric): class LearningRateMetric(Metric):
"""A metric collector for learning rate. """A metric collector for learning rate.
:param epoch_only: Whether the metric only read for the full epoch Args:
:type epoch_only: bool epoch_only (bool): Whether the metric is only read for the full epoch.
:param initial_lr: Initial learning rate, defaults to 0.0 initial_lr (float, optional): Initial learning rate, defaults to 0.0.
:type initial_lr: float, optional
""" """
def __init__(self, epoch_only: bool, initial_lr: float = 0.): def __init__(self, epoch_only: bool, initial_lr: float = 0.):
@ -163,10 +163,9 @@ class AccuracyMetric(Metric):
"""A metric collector for accuracy. It only works for classification """A metric collector for accuracy. It only works for classification
tasks. tasks.
:param epoch_only: Whether the metric only read for the full epoch Args:
:type epoch_only: bool epoch_only (bool): Whether the metric is only read for the full epoch.
:param accuracy_func: Accuracy function for the classification task accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
:type accuracy_func: :class:`typing.Callable`
""" """
def __init__(self, epoch_only: bool, accuracy_func: Callable): def __init__(self, epoch_only: bool, accuracy_func: Callable):
@ -187,9 +186,10 @@ class AccuracyMetric(Metric):
"""Updates last step accuracy and accumulated accuracy with current logits """Updates last step accuracy and accumulated accuracy with current logits
and labels. It expects the output has logits and labels. and labels. It expects the output has logits and labels.
:param logits: The logits output of the model Args:
:param targets: Real labels of the dataset logits (:class:`torch.tensor`): The logits output of the model.
:param batch_size: Batch size of the task targets (:class:`torch.tensor`): Real labels of the dataset.
batch_size (int): Batch size of the task.
""" """
if isinstance(logits, (list, tuple)): if isinstance(logits, (list, tuple)):
logits = logits[0] logits = logits[0]
@ -224,8 +224,10 @@ class MetricHook(BaseHook):
update their states. Others are used to display and update their states. Others are used to display and
record the metric. record the metric.
:param priority: Priority in the printing, hooks with small priority will be printed in front Args:
:type priority: int priority (int): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__( def __init__(
@ -244,8 +246,10 @@ class MetricHook(BaseHook):
class LossHook(MetricHook): class LossHook(MetricHook):
"""Specialized hook class for :class:`Loss`. """Specialized hook class for :class:`Loss`.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0 Args:
:type priority: int, optional priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, priority: int = 0): def __init__(self, priority: int = 0):
@ -283,10 +287,11 @@ class LossHook(MetricHook):
class AccuracyHook(MetricHook): class AccuracyHook(MetricHook):
"""Specialized hook class for :class:`Accuracy`. """Specialized hook class for :class:`Accuracy`.
:param accuracy_func: Priority in the printing, hooks with small priority will be printed in front Args:
:type accuracy_func: typing.Callable accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0 priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
:type priority: int, optional defaults to 0. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, accuracy_func: Callable, priority: int = 0): def __init__(self, accuracy_func: Callable, priority: int = 0):
@ -314,8 +319,8 @@ class AccuracyHook(MetricHook):
class ThroughputMetric(Metric): class ThroughputMetric(Metric):
"""Metric for :class:`Throughput`. """Metric for :class:`Throughput`.
:param epoch_only: epoch only Args:
:type epoch_only: bool epoch_only (bool): Whether the metric is only read for the full epoch.
""" """
def __init__(self, epoch_only: bool, ignored_steps: int = 0): def __init__(self, epoch_only: bool, ignored_steps: int = 0):
super().__init__(epoch_only=epoch_only) super().__init__(epoch_only=epoch_only)
@ -360,10 +365,13 @@ class ThroughputMetric(Metric):
@HOOKS.register_module @HOOKS.register_module
class ThroughputHook(MetricHook): class ThroughputHook(MetricHook):
"""Specialized hook class for :class:`Throughput`. """Specialized hook class for :class:`Throughput`. Hook to measure execution throughput (samples/sec).
:param priority: priority of throughput hook, defaults to 10 Args:
:type priority: int, optional ignored_steps (int, optional): the number of initial training steps to ignore.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
""" """
def __init__(self, ignored_steps: int = 0, priority: int = 10): def __init__(self, ignored_steps: int = 0, priority: int = 10):
super().__init__(priority) super().__init__(priority)
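A rough sketch of a metric-oriented hook list built from the hooks above; ``my_accuracy`` stands for any user-provided accuracy callable and ``logger`` for a previously created DistributedLogger, and both are placeholders.
>>> hook_list = [
...     hooks.LossHook(),
...     hooks.AccuracyHook(accuracy_func=my_accuracy),   # my_accuracy is a placeholder callable
...     hooks.ThroughputHook(ignored_steps=5),
...     hooks.LogMetricByEpochHook(logger=logger),
... ]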
@ -114,10 +114,13 @@ class CheckpointFunction(torch.autograd.Function):
def checkpoint(function, activation_offload ,*args): def checkpoint(function, activation_offload ,*args):
"""Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint """Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint.
:param function: Describe the forward pass function. It should know how to handle the input tuples. Args:
:param args: Tuple containing the parameters of the function function: Describe the forward pass function. It should know how to handle the input tuples.
:return: Output of running function with provided args args (list): Tuple containing the parameters of the function
Returns:
Output of running function with provided args.
""" """
return CheckpointFunction.apply(function, activation_offload, *args) return CheckpointFunction.apply(function, activation_offload, *args)
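A minimal sketch of wrapping a sub-module with this helper, assuming it is importable from ``colossalai.utils``; ``activation_offload=False`` keeps activations on the device, and they are recomputed during the backward pass.
>>> import torch
>>> from colossalai.utils import checkpoint        # assumed import path
>>> block = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU())
>>> x = torch.rand(8, 64, requires_grad=True)
>>> out = checkpoint(block, False, x)              # forward without storing intermediate activations
>>> out.sum().backward()                           # block is re-run here to rebuild activations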
@ -50,17 +50,17 @@ def _get_standard_checkpoint_filename(epoch: int, suffix: str = ''):
def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''): def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''):
"""This is a function to generate the checkpoint path from the (checkpoint_dir, epoch, suffix, gpu_parallel_rank) tuple. """This is a function to generate the checkpoint path from the tuple
(checkpoint_dir, epoch, suffix, gpu_parallel_rank).
This is useful during generation and recuperation of the checkpoint. This is useful during generation and recuperation of the checkpoint.
:param checkpoint_dir: Set up a directory for saving checkpoints Args:
:type checkpoint_dir: str checkpoint_dir (str): Set up a directory for saving checkpoints.
:param epoch: Epoch number (indicate how many epochs have you trained this model) epoch (int): Epoch number (indicate how many epochs have you trained this model).
:type epoch: int suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
:type suffix: str, optional Returns:
:return: Checkpoint path to be generated str: The checkpoint path to be generated.
:rtype: path
""" """
ckpt_filename = _get_standard_checkpoint_filename(epoch, suffix) ckpt_filename = _get_standard_checkpoint_filename(epoch, suffix)
return os.path.join(checkpoint_dir, ckpt_filename) return os.path.join(checkpoint_dir, ckpt_filename)
@ -74,12 +74,13 @@ def _ensure_directory_exists(filename: str):
def get_latest_checkpoint_pattern(suffix: str = ''): def get_latest_checkpoint_pattern(suffix: str = ''):
"""Generate Regular expression of latest checkpoint's pattern """Generate Regular expression of the latest checkpoint's pattern.
:param suffix: Additional notation to specify the model or checkpoint, defaults to '' Args:
:type suffix: str, optional suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
:return: Checkpoint pattern
:rtype: regular expression Returns:
str: The regular expression of checkpoint pattern.
""" """
ranks_name = _get_ranks_name() ranks_name = _get_ranks_name()
pattern = r'epoch(\d+)-{}{}\.pt'.format(ranks_name, suffix) pattern = r'epoch(\d+)-{}{}\.pt'.format(ranks_name, suffix)
@ -88,16 +89,19 @@ def get_latest_checkpoint_pattern(suffix: str = ''):
def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''): def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''):
"""This is a function to retrieve the latest checkpoint path from the (checkpoint_dir, suffix, gpu_parallel_rank) tuple. """This is a function to retrieve the latest checkpoint path from the tuple
(checkpoint_dir, suffix, gpu_parallel_rank).
This is useful during recuperation of the checkpoint, especially when you do not know the epoch number. This is useful during recuperation of the checkpoint, especially when you do not know the epoch number.
:param checkpoint_dir: Directory for saving checkpoints Args:
:type checkpoint_dir: str checkpoint_dir (str): Directory for saving checkpoints
:param suffix: Additional notation to specify the model or checkpoint, defaults to '' suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''
:type suffix: str, optional
:raises FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given Returns:
:return: The latest checkpoint path to be retrieved str: The latest retrieved checkpoint path.
:rtype: path
Raises:
FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given.
""" """
CKPT_NAME_PAT = get_latest_checkpoint_pattern(suffix=suffix) CKPT_NAME_PAT = get_latest_checkpoint_pattern(suffix=suffix)
@ -126,22 +130,19 @@ def save_checkpoint(checkpoint_path: str,
optimizer: torch.optim.Optimizer, optimizer: torch.optim.Optimizer,
lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None, lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
**kwargs): **kwargs):
"""Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as model, """Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as
optimizer, lr_scheduler and etc. into a checkpoint dictionary. model, optimizer, lr_scheduler etc. into a checkpoint dictionary.
This method can be used for both colosalai nn.BaseModel and normal pytorch nn.Module. This method can be used for both :class:`colossalai.nn.BaseModel` and normal :class:`torch.nn.Module`.
Args:
:param checkpoint_path: Set up a directory for saving checkpoints checkpoint_path (str): Set up a directory for saving checkpoints.
:type checkpoint_path: str epoch (int): Epoch number (indicate how many epochs have you trained this model).
:param epoch: Epoch number (indicate how many epochs have you trained this model) model (:class:`torch.nn.Module`): Model to be registered.
:type epoch: int optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to be registered.
:param model: Model to be registered lr_scheduler (Union[:class:`torch.optim.lr_scheduler`,
:type model: torch.nn.Module :class:`colossalai.nn.lr_scheduler`], optional): lr_scheduler to be registered, defaults to None.
:param optimizer: Optimizer to be registered kwargs (dict): additional parameters to be saved.
:type optimizer: torch.optim.Optimizer
:param lr_scheduler: lr_scheduler to be registered, defaults to None
:type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
""" """
# for compatibility with normal pytorch nn.Module # for compatibility with normal pytorch nn.Module
if hasattr(model, 'state_dict_for_save_checkpoint'): if hasattr(model, 'state_dict_for_save_checkpoint'):
@ -165,31 +166,31 @@ def load_checkpoint(checkpoint_path: str,
finetune: bool = False, finetune: bool = False,
strict: bool = True) -> Tuple: strict: bool = True) -> Tuple:
"""Loads the checkpoint file. """Loads the checkpoint file.
If finetune is False, then we intend to continue/resume the training process from the checkpoint given. If finetune is False, then we intend to continue/resume the training process from the checkpoint given.
So we copy parameters and buffers from state_dict into these modules(model, optimizer,lr_scheduler) So we copy parameters and buffers from state_dict into these modules (model, optimizer, lr_scheduler)
and its descendants. and its descendants.
If finetune is True, then only the weights and buffers of model should be reload.
If strict is True, then the keys of state_dict must exactly match the keys returned by this modules
state_dict() function.
:param checkpoint_path: The exact and matched checkpoint_path directory to retrieve appropriate state_dict If finetune is True, then only the weights and buffers of model should be reloaded.
:type checkpoint_path: str If strict is True, then the keys of state_dict must exactly match the keys returned
:param model: Model to reload parameters and buffers by this modules state_dict() function.
:type model: torch.nn.Module
:param optimizer: Optimizer to recuperate
:type optimizer: torch.optim.Optimizer
:param lr_scheduler: lr_scheduler to recuperate, defaults to None
:type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
:param finetune: Whether to finetune the model with new dataset or continue the pre-training, defaults to False
:type finetune: bool, optional
:param strict: Whether to strictly enforce that the keys in
:attr:`state_dict` of the checkpoint match the names of
parameters and buffers in model., defaults to True
:type strict: bool, optional
:raises ValueError: Raise error if the model/optimizer cannot successfully be recuperated
:return: (the epoch number of the checkpoint retrieved, the checkpoint retrieved)
:rtype: Tuple
Args:
checkpoint_path (str): The exact and matched checkpoint_path directory to retrieve appropriate state_dict.
model (:class:`torch.nn.Module`): Model to reload parameters and buffers.
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional):
lr_scheduler to recuperate, defaults to None.
finetune (bool, optional): Whether to finetune the model with new dataset or
continue the pre-training, defaults to False.
strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict`
of the checkpoint match the names of parameters and buffers in model, defaults to True.
Returns:
Tuple(int, ``checkpoint``): The tuple (the epoch number of the checkpoint retrieved, the checkpoint retrieved).
Raises:
ValueError: Raise error if the model/optimizer cannot successfully be recuperated
""" """
# Load the checkpoint. # Load the checkpoint.
checkpoint = torch.load(checkpoint_path, map_location='cpu') checkpoint = torch.load(checkpoint_path, map_location='cpu')
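A hedged round-trip sketch of the two helpers above; the import paths are assumptions, and ``get_checkpoint_path`` presumes the distributed context is already initialized so the rank name can be resolved.
>>> import torch
>>> from colossalai.utils import save_checkpoint, load_checkpoint, get_checkpoint_path  # assumed import path
>>> model = torch.nn.Linear(16, 4)
>>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
>>> path = get_checkpoint_path('./checkpoints', epoch=10)
>>> save_checkpoint(path, 10, model, optimizer)
>>> last_epoch, ckpt = load_checkpoint(path, model, optimizer, strict=True)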
@ -27,10 +27,10 @@ from .multi_tensor_apply import multi_tensor_applier
def print_rank_0(msg: str, logger=None): def print_rank_0(msg: str, logger=None):
"""Print messages and save logs(optional). This is executed only if you are the rank-0 gpu. """Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.
:param msg: A string message to output Args:
:type msg: str msg (str): A string message to output.
:param logger: Python logger object, defaults to None logger (:class:`colossalai.logging.DistributedLogger`, optional):
:type logger: optional The logger to record the message, defaults to None.
""" """
if gpc.get_global_rank() == 0: if gpc.get_global_rank() == 0:
if logger is None: if logger is None:
@ -53,12 +53,15 @@ def free_port():
def sync_model_param(model, parallel_mode): def sync_model_param(model, parallel_mode):
"""Make sure data parameters are consistent during Data Parallel Mode r"""Make sure data parameters are consistent during Data Parallel Mode.
:param model: A pyTorch nn.model on whose parameters you check the consistency Args:
:param parallel_mode: Parallel mode to be checked model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
:type model: torch.nn.Module parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked.
:type parallel_mode: colossalai.context.ParallelMode
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
""" """
if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
for param in model.parameters(): for param in model.parameters():
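A short call-pattern sketch for sync_model_param, under the stated precondition that a distributed context has already been launched; the colossalai.utils import and the toy model are assumptions.

# Hypothetical sketch: make all data-parallel ranks start from identical weights.
import torch.nn as nn
from colossalai.context import ParallelMode       # ParallelMode is referenced in the docstring above
from colossalai.utils import sync_model_param     # assumed re-export

model = nn.Linear(16, 4)
# Effectively a no-op unless ParallelMode.DATA is initialized with world size > 1.
sync_model_param(model, ParallelMode.DATA)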
@ -146,18 +149,19 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
"""Clips gradient norm of an iterable of parameters whose gradients are in fp32. """Clips gradient norm of an iterable of parameters whose gradients are in fp32.
This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and
added functionality to handle model parallel parameters. Note that added functionality to handle model parallel parameters.
the gradients are modified in place.
:param parameters: An iterable of Tensors or a single Tensor that will have gradients normalized Note:
:type parameters: (Iterable[Tensor] or Tensor) the gradients are modified in place.
:param max_norm: Max norm of the gradients
:type max_norm: float or int
:param norm_type: Type of the used p-norm. Can be ``'inf'`` for infinity norm.
:type norm_type: float or int
:return: Total norm of the parameters (viewed as a single vector). Args:
:rtype: float parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`):
An iterable of Tensors or a single Tensor that will have gradients normalized.
max_norm (Union[float, int]): Max norm of the gradients.
norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm.
Returns:
float: Total norm of the parameters.
""" """
if isinstance(parameters, torch.Tensor): if isinstance(parameters, torch.Tensor):
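A sketch of where the call sits in a training step; the colossalai.utils import is an assumption, and a launched parallel context is assumed since the function also handles model-parallel parameters.

# Hypothetical sketch: clip fp32 gradients right before the optimizer step.
import torch
import torch.nn as nn
from colossalai.utils import clip_grad_norm_fp32  # assumed re-export

model = nn.Linear(32, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

loss = model(torch.randn(8, 32)).sum()
loss.backward()

# Returns the total norm of the parameters viewed as a single vector.
total_norm = clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=2)
optimizer.step()
optimizer.zero_grad()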

View File

@ -19,18 +19,15 @@ T_co = TypeVar('T_co', covariant=True)
@DATA_SAMPLERS.register_module @DATA_SAMPLERS.register_module
class DataParallelSampler(Sampler): class DataParallelSampler(Sampler):
"""A data sampler for distributed data parallelism """A data sampler for distributed data parallelism.
:param dataset: A Dataset instance Args:
:type dataset: torch.utils.data.Dataset dataset (:class:`torch.utils.data.Dataset`): The Dataset for sampling.
:param shuffle: Whether to shuffle data, defaults to False shuffle (bool, optional): Whether to shuffle data, defaults to False.
:type shuffle: bool, optional seed (int, optional): The random seed used for sampling, defaults to 0.
:param seed: The random seed, defaults to 0 drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
:type seed: int, optional is not divisible by the batch size. If False and the size of dataset is not divisible by
:param drop_last: Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch the batch size, then the last batch will be smaller, defaults to False.
size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller,
defaults to False
:type drop_last: bool, optional
""" """
def __init__(self, def __init__(self,
@ -104,8 +101,8 @@ class DataParallelSampler(Sampler):
use a different random ordering for each epoch. Otherwise, the next iteration of this use a different random ordering for each epoch. Otherwise, the next iteration of this
sampler will yield the same ordering. sampler will yield the same ordering.
:param epoch: Epoch number. Args:
:type epoch: int epoch (int): Epoch number.
""" """
self.epoch = epoch self.epoch = epoch
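A sketch of the per-epoch call pattern implied by the note above; the sampler import path, the launched data-parallel context it needs, and the toy dataset are assumptions.

# Hypothetical sketch: call set_epoch() so each epoch gets a fresh shuffle order.
import torch
from torch.utils.data import DataLoader, TensorDataset
from colossalai.utils import DataParallelSampler  # assumed re-export; needs an initialized data-parallel group

dataset = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))
sampler = DataParallelSampler(dataset, shuffle=True, seed=0, drop_last=False)
loader = DataLoader(dataset, batch_size=16, sampler=sampler)

for epoch in range(3):
    sampler.set_epoch(epoch)    # otherwise every epoch yields the same ordering
    for data, label in loader:
        pass                    # training step goes here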
@ -118,29 +115,27 @@ def get_dataloader(dataset,
pin_memory=False, pin_memory=False,
num_workers=0, num_workers=0,
**kwargs): **kwargs):
"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not) r"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
.. note:: When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data Note:
on the 1st stage and label on the last stage When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
on the 1st stage and label on the last stage.
:param dataset: A :class:`torch.utils.data.Dataset` object Args:
:param shuffle: Whether to shuffle the dataset dataset (:class:`torch.utils.data.Dataset`): The dataset to be loaded.
:param seed: Random worker seed, defaults to 1024 shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
:param add_sampler: Add DistributedDataParallelSampelr to the dataset seed (int, optional): Random worker seed for sampling, defaults to 1024.
:param drop_last: Drop the last incomplete batch of data add_sampler (bool, optional): Whether to add ``DataParallelSampler`` to the dataset. Defaults to True.
:param pin_memory: Whether to pin memory address in CPU memory drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
:param num_workers: Number of worker threads for this dataloader is not divisible by the batch size. If False and the size of dataset is not divisible by
the batch size, then the last batch will be smaller, defaults to False.
pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
`DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
:type dataset: :class:`torch.utils.data.Dataset` Returns:
:type shuffle: bool, optional. Default is False :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
:type seed: int, optional. Default is 1024
:type add_sampler: bool, optional. Default is True
:type drop_last: bool, optional. Default is False
:type pin_memory: bool, optional. Default is False
:type num_workers: int, optional. Default is 0
:return: A object of :class:`torch.utils.data.DataLoader`
:rtype: :class:`torch.utils.data.DataLoader`
""" """
_kwargs = kwargs.copy() _kwargs = kwargs.copy()
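A sketch of the documented parameters in one call; the colossalai.utils import is an assumption, batch_size is forwarded through **kwargs, and add_sampler=True presumes a launched data-parallel context.

# Hypothetical sketch of get_dataloader with the arguments documented above.
import torch
from torch.utils.data import TensorDataset
from colossalai.utils import get_dataloader   # assumed re-export

train_set = TensorDataset(torch.randn(256, 32), torch.randint(0, 10, (256,)))

train_loader = get_dataloader(train_set,
                              shuffle=True,       # keep False when pipeline parallel is enabled
                              seed=1024,
                              add_sampler=True,   # attaches the data-parallel sampler
                              drop_last=True,
                              pin_memory=True,
                              num_workers=2,
                              batch_size=64)      # forwarded to torch.utils.data.DataLoader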

View File

@ -13,19 +13,25 @@ def accumulate_gradient(model: nn.Module,
accumulate_size: int, accumulate_size: int,
gradient_handlers: List[BaseGradientHandler] = None, gradient_handlers: List[BaseGradientHandler] = None,
lr_scheduler: _LRScheduler = None): lr_scheduler: _LRScheduler = None):
""" r"""Turning model, optimizer, dataloader into corresponding object for gradient accumulation.
:param model: your model object
:type model: :class:`torch.nn.Module` Args:
:param optimizer: your optimizer object model (:class:`torch.nn.Module`): your model object for gradient accumulation.
:type optimizer: :class:`torch.optim.Optimizer` optimizer (:class:`torch.optim.Optimizer`): your optimizer object for gradient accumulation.
:param dataloader: your dataloader object dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
:type dataloader: Iterable your dataloader object, which will be consumed via iter(dataloader).
:param accumulate_size: the number of steps to accumulate gradients accumulate_size (int): the number of steps to accumulate gradients
:type accumulate_size: int gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
:param gradient_handlers: list of gradient handler objects. Default is None list of gradient handler objects. Default is None.
:type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`] lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
:param lr_scheduler: your lr scheduler object. Default is None your ``lr_scheduler`` object for gradient accumulation. Defaults to None.
:type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
More details about `gradient_handlers` could be found in
`Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
More details about `lr_scheduler` could be found in
`lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_ and
`how to adjust learning rate <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
""" """
optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model) optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size) dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
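A sketch that mirrors the two wrapping calls shown above in a toy loop; the import path and the exact deferral behaviour of step()/zero_grad() are taken from the docstrings here and should be read as assumptions, not a verified trace.

# Hypothetical sketch: wrap optimizer and dataloader so real updates happen every 4 steps.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from colossalai.utils.gradient_accumulation import GradAccumOptimizer, GradAccumDataloader  # assumed path

model = nn.Linear(32, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
dataset = TensorDataset(torch.randn(40, 32), torch.randn(40, 4))
dataloader = DataLoader(dataset, batch_size=4)

optimizer = GradAccumOptimizer(optimizer, accumulate_size=4, model=model)
dataloader = GradAccumDataloader(dataloader, accumulate_size=4)

for data, label in dataloader:
    loss = nn.functional.mse_loss(model(data), label)
    loss.backward()
    optimizer.step()        # intended to be a real update only at every 4th call
    optimizer.zero_grad()   # likewise deferred until the accumulation boundary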

View File

@ -15,15 +15,13 @@ from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer): class GradAccumOptimizer(ColossalaiOptimizer):
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps """A wrapper for the optimizer to enable gradient accumulation by skipping the steps
before accumulation size is reached before accumulation size is reached.
:param optim: Your optimizer object
:type optim: :class:`torch.optim.Optimizer`
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
:param model: Your model object to check if it is DDP for special handling of no_sync() context
:type model: :class:`torch.nn.Module`
Args:
optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
model (:class:`torch.nn.Module`):
Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
""" """
def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None): def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
@ -76,18 +74,18 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumDataloader: class GradAccumDataloader:
"""A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps. """A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.
For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will Note:
be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle. The dataloader would drop the last incomplete steps for gradient accumulation.
Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader, For example, if a dataloader has 10 batches of data and accumulate size is 4, the model parameters will
(e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches. be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Thus, they will be automatically skipped by this class. If the dataloader is not a standard PyTorch dataloader,
:param dataloader: Your dataloader object (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
:type dataloader: Iterable
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
Args:
dataloader (``Iterable``): Your dataloader object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
""" """
def __init__(self, dataloader: Iterable, accumulate_size: int) -> None: def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
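The arithmetic in the note above, spelled out as a plain-Python check (no ColossalAI objects involved): with 10 batches and an accumulate size of 4, only 8 batches contribute and the parameters are updated twice.

num_batches = 10
accumulate_size = 4

batches_used = num_batches // accumulate_size * accumulate_size        # 8
update_steps = [step for step in range(1, batches_used + 1)
                if step % accumulate_size == 0]                        # [4, 8]

print(batches_used)   # 8  -> the last 2 batches are dropped or consumed without an update
print(update_steps)   # [4, 8]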
@ -125,13 +123,12 @@ class GradAccumDataloader:
class GradAccumLrSchedulerByStep(_LRScheduler): class GradAccumLrSchedulerByStep(_LRScheduler):
"""A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
before accumulation size is reached before accumulation size is reached.
:param lr_scheduler: Your lr scheduler object
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
Args:
lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
Your ``lr_scheduler`` object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
""" """
def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None: def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
@ -171,13 +168,16 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class GradAccumGradientHandler: class GradAccumGradientHandler:
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps r"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached before accumulation size is reached.
:param grad_handler: Your gradient handler object Args:
:type grad_handler: :class:`colossalai.engine.BaseGradientHandler` grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
:param accumulate_size: The number of steps to accumulate gradients Your ``gradient_handler`` object for gradient accumulation, which will be called once `accumulate_size` is reached.
:type accumulate_size: int accumulate_size (int): The number of steps to accumulate gradients.
More details about ``gradient_handlers`` could be found in
`Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
""" """

View File

@ -14,12 +14,13 @@ from typing import Optional
def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int: def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
""" """Get the free memory info of device.
Get the free memory info of device.
:param device: a torch device instance or None Args:
:type device: Optional[torch.device] device (Optional[``torch.device``]): a torch device instance or None. Defaults to None.
:return: current memory usage, sized by Byte
:rtype: int Returns:
int: current memory usage, sized by Byte.
""" """
if device: if device:
assert device.type == 'cuda' assert device.type == 'cuda'
@ -34,7 +35,7 @@ def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
def bytes_to_GB(val, decimal=2): def bytes_to_GB(val, decimal=2):
"""A byte-to-Gigabyte converter, defaultly using binary notation. """A byte-to-Gigabyte converter, default using binary notation.
:param val: X bytes to convert :param val: X bytes to convert
:return: X' GB :return: X' GB
@ -43,7 +44,7 @@ def bytes_to_GB(val, decimal=2):
def bytes_to_MB(val, decimal=2): def bytes_to_MB(val, decimal=2):
"""A byte-to-Megabyte converter, defaultly using binary notation. """A byte-to-Megabyte converter, default using binary notation.
:param val: X bytes to convert :param val: X bytes to convert
:return: X' MB :return: X' MB
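The binary notation these converters rely on, as a small worked example in plain Python (mirroring, not calling, the helpers above): 1 GB = 2**30 bytes and 1 MB = 2**20 bytes.

val = 3 * 2 ** 30 + 512 * 2 ** 20      # 3.5 GiB expressed in bytes
print(round(val / 2 ** 30, 2))         # 3.5    -> the GB figure bytes_to_GB(val) should report under binary notation
print(round(val / 2 ** 20, 2))         # 3584.0 -> the MB figure bytes_to_MB(val) should report under binary notation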
@ -54,13 +55,13 @@ def bytes_to_MB(val, decimal=2):
def report_memory_usage(message, logger=None, report_cpu=False): def report_memory_usage(message, logger=None, report_cpu=False):
"""Calculate and print RAM usage (in GB) """Calculate and print RAM usage (in GB)
:param message: A prefix message to add in the log Args:
:type message: str message (str): A prefix message to add in the log.
:param logger: An instance of :class:`colossalai.logging.DistributedLogger` logger (:class:`colossalai.logging.DistributedLogger`): The logger used to record memory information.
:type logger: :class:`colossalai.logging.DistributedLogger`, optional report_cpu (bool, optional): Whether to report CPU memory.
:param report_cpu: Whether to report CPU memory
:type report_cpu: bool, optional Raises:
:raises EnvironmentError: Raise error if no distributed environment has been initialized EnvironmentError: Raised if no distributed environment has been initialized.
""" """
if not gpc.is_initialized(ParallelMode.GLOBAL): if not gpc.is_initialized(ParallelMode.GLOBAL):
raise EnvironmentError("No distributed environment is initialized") raise EnvironmentError("No distributed environment is initialized")

View File

@ -12,8 +12,8 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
size of every parameter. Since the parameters in data parallelism are replicated size of every parameter. Since the parameters in data parallelism are replicated
in each GPU, we set their ep_size to 1. in each GPU, we set their ep_size to 1.
:param model: A pyTorch nn.model from which we get dict Args:
:type model: torch.nn.Module model (:class:`torch.nn.Module`): A pyTorch `nn.Module` from which we get dict.
""" """
epsize_param_dict = dict() epsize_param_dict = dict()
for param in model.parameters(): for param in model.parameters():
@ -29,10 +29,10 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
def sync_moe_model_param(model: nn.Module): def sync_moe_model_param(model: nn.Module):
"""Make sure model parameters are consistent in MoE parallel context """Make sure model parameters are consistent in MoE parallel context.
:param model: A pyTorch nn.model on whose parameters you check the consistency Args:
:type model: torch.nn.Module model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
""" """
if is_using_ddp(): if is_using_ddp():
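A brief sketch of how the two MoE helpers are meant to be used together; the import path, the need for a launched MoE/DDP context, and the placeholder model are all assumptions.

# Hypothetical sketch: inspect parameters grouped by expert-parallel size, then sync them.
import torch.nn as nn
from colossalai.utils.moe import get_moe_epsize_param_dict, sync_moe_model_param  # assumed path

model = nn.Linear(8, 8)   # placeholder; a real MoE model carries expert-parallel info on its params
for ep_size, params in get_moe_epsize_param_dict(model).items():
    print(ep_size, len(params))   # plain data-parallel params are expected under ep_size == 1

sync_moe_model_param(model)       # no-op unless DDP / MoE parallel groups are initialized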

View File

@ -3,10 +3,10 @@
class MultiTensorApply(object): class MultiTensorApply(object):
""" """
Apply an operation to a list of tensors efficiently Apply an operation to a list of tensors efficiently.
:param chunk_size: Size of a chunk Args:
:type chunk_size: int chunk_size (int): Size of a chunk.
""" """
available = False available = False

View File

@ -9,6 +9,7 @@ from collections import defaultdict
LINE_WIDTH = 108 LINE_WIDTH = 108
LINE = '-' * LINE_WIDTH + '\n' LINE = '-' * LINE_WIDTH + '\n'
class TensorDetector(): class TensorDetector():
def __init__(self, def __init__(self,
show_info: bool = True, show_info: bool = True,
@ -16,17 +17,14 @@ class TensorDetector():
include_cpu: bool = False, include_cpu: bool = False,
module: Optional[nn.Module] = None module: Optional[nn.Module] = None
): ):
"""This class is an detector to detect tensor on different devices. """This class is a detector to detect tensor on different devices.
:param show_info: whether to print the info on screen, default True
:type show_info: bool
:param log: the file name to save the log
:type log: str
:param include_cpu: whether to detect tensor on cpu, default False
:type include_cpu: bool
:param module: when sending an `nn.Module` it, the detector can name the tensors detected better
:type module: Optional[nn.Module]
Args:
show_info (bool, optional): whether to print the info on screen, default True.
log (str, optional): the file name to save the log. Defaults to None.
include_cpu (bool, optional): whether to detect tensor on cpu, default False.
module (Optional[:class:`nn.Module`]): when sending an ``nn.Module`` object,
the detector can name the tensors detected better.
""" """
self.show_info = show_info self.show_info = show_info
self.log = log self.log = log
@ -49,7 +47,6 @@ class TensorDetector():
self.tensor_info[id(param)].append(param.dtype) self.tensor_info[id(param)].append(param.dtype)
self.tensor_info[id(param)].append(self.get_tensor_mem(param)) self.tensor_info[id(param)].append(self.get_tensor_mem(param))
def get_tensor_mem(self, tensor): def get_tensor_mem(self, tensor):
# calculate the memory occupied by a tensor # calculate the memory occupied by a tensor
memory_size = tensor.element_size() * tensor.storage().size() memory_size = tensor.element_size() * tensor.storage().size()
@ -58,7 +55,6 @@ class TensorDetector():
memory_size += grad_memory_size memory_size += grad_memory_size
return self.mem_format(memory_size) return self.mem_format(memory_size)
def mem_format(self, real_memory_size): def mem_format(self, real_memory_size):
# format the tensor memory into a reasonable magnitude # format the tensor memory into a reasonable magnitude
if real_memory_size >= 2 ** 30: if real_memory_size >= 2 ** 30:
@ -69,7 +65,6 @@ class TensorDetector():
return str(real_memory_size / (2 ** 10)) + ' KB' return str(real_memory_size / (2 ** 10)) + ' KB'
return str(real_memory_size) + ' B' return str(real_memory_size) + ' B'
def collect_tensors_state(self): def collect_tensors_state(self):
for obj in gc.get_objects(): for obj in gc.get_objects():
if torch.is_tensor(obj): if torch.is_tensor(obj):
@ -116,7 +111,6 @@ class TensorDetector():
if obj.device not in self.devices: if obj.device not in self.devices:
self.devices.append(obj.device) self.devices.append(obj.device)
def print_tensors_state(self): def print_tensors_state(self):
template_format = '{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}' template_format = '{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}'
self.info += LINE self.info += LINE
@ -174,7 +168,6 @@ class TensorDetector():
with open(self.log + '.log', 'a') as f: with open(self.log + '.log', 'a') as f:
f.write(self.info) f.write(self.info)
def detect(self, include_cpu = False): def detect(self, include_cpu = False):
self.include_cpu = include_cpu self.include_cpu = include_cpu
self.collect_tensors_state() self.collect_tensors_state()
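A usage sketch based on the constructor and detect() shown in this file; the colossalai.utils import is an assumption, and the device guard is only there so the snippet also runs on a CPU-only box.

# Hypothetical sketch: track which tensors are alive before and after a forward pass.
import torch
import torch.nn as nn
from colossalai.utils import TensorDetector   # assumed re-export

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = nn.Linear(64, 64).to(device)

detector = TensorDetector(show_info=True, log='tensor_log', include_cpu=True, module=model)
detector.detect()                              # snapshot before the forward pass

activation = model(torch.randn(4, 64, device=device))
detector.detect(include_cpu=True)              # newly created tensors show up in this report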

View File

@ -25,7 +25,7 @@ class Timer:
return time.time() return time.time()
def start(self): def start(self):
"""Fisrtly synchronize cuda, reset the clock and then start the timer. """Firstly synchronize cuda, reset the clock and then start the timer.
""" """
self._elapsed = 0 self._elapsed = 0
synchronize() synchronize()
@ -40,10 +40,11 @@ class Timer:
def stop(self, keep_in_history: bool = False): def stop(self, keep_in_history: bool = False):
"""Stop the timer and record the start-stop time interval. """Stop the timer and record the start-stop time interval.
:param keep_in_history: Whether does it record into history each start-stop interval, defaults to False Args:
:type keep_in_history: bool, optional keep_in_history (bool, optional): Whether to record this start-stop interval
:return: Start-stop interval in the history, defaults to False.
:rtype: int Returns:
int: Start-stop interval.
""" """
synchronize() synchronize()
end_time = time.time() end_time = time.time()
@ -57,26 +58,27 @@ class Timer:
def get_history_mean(self): def get_history_mean(self):
"""Mean of all history start-stop time intervals. """Mean of all history start-stop time intervals.
:return: Mean of time intervals Returns:
:rtype: int int: Mean of time intervals
""" """
return sum(self._history) / len(self._history) return sum(self._history) / len(self._history)
def get_history_sum(self): def get_history_sum(self):
"""Add up all the start-stop time intervals. """Add up all the start-stop time intervals.
:return: Sum of time intervals Returns:
:rtype: int int: Sum of time intervals.
""" """
return sum(self._history) return sum(self._history)
def get_elapsed_time(self): def get_elapsed_time(self):
"""Return the last start-stop time interval. """Return the last start-stop time interval.
.. note:: Use it only when timer is not in progress Returns:
int: The last time interval.
:return: The last time interval Note:
:rtype: int Use it only when the timer is not in progress.
""" """
assert not self._started, 'Timer is still in progress' assert not self._started, 'Timer is still in progress'
return self._elapsed return self._elapsed
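A usage sketch of the Timer methods documented above; the class is referred to as colossalai.utils.Timer elsewhere in this diff, so that import is the assumption made here, and time.sleep stands in for a real training step.

# Hypothetical sketch of the Timer API: start/stop, history, and elapsed time.
import time
from colossalai.utils import Timer   # the class is referenced as colossalai.utils.Timer in this diff

timer = Timer()
for _ in range(3):
    timer.start()
    time.sleep(0.01)                     # stand-in for one training step
    timer.stop(keep_in_history=True)     # record this interval in the history

print(timer.get_history_mean())          # mean of the three recorded intervals
print(timer.get_history_sum())           # total time across the three intervals
print(timer.get_elapsed_time())          # last interval; valid only while the timer is stopped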
@ -90,10 +92,10 @@ class Timer:
class MultiTimer: class MultiTimer:
"""An object contains multiple timers """An object contains multiple timers.
:param on: Whether the timer is enabled. Default is True Args:
:type on: bool, optional on (bool, optional): Whether the timer is enabled. Default is True.
""" """
def __init__(self, on: bool = True): def __init__(self, on: bool = True):
@ -101,10 +103,10 @@ class MultiTimer:
self._timers = dict() self._timers = dict()
def start(self, name: str): def start(self, name: str):
"""Start namely one of the timers """Start namely one of the timers.
:param name: Timer's key Args:
:type name: str name (str): Timer's key.
""" """
if self._on: if self._on:
if name not in self._timers: if name not in self._timers:
@ -114,10 +116,9 @@ class MultiTimer:
def stop(self, name: str, keep_in_history: bool): def stop(self, name: str, keep_in_history: bool):
"""Stop namely one of the timers. """Stop namely one of the timers.
:param name: Timer's key Args:
:type name: str name (str): Timer's key.
:param keep_in_history: Whether does it record into history each start-stop interval keep_in_history (bool): Whether to record this start-stop interval in the history.
:type keep_in_history: bool
""" """
if self._on: if self._on:
return self._timers[name].stop(keep_in_history) return self._timers[name].stop(keep_in_history)
@ -127,17 +128,19 @@ class MultiTimer:
def get_timer(self, name): def get_timer(self, name):
"""Get timer by its name (from multitimer) """Get timer by its name (from multitimer)
:param name: Timer's key Args:
:return: Timer with the name you give correctly name (str): Timer's key.
:rtype: Timer Returns:
:class:`colossalai.utils.Timer`: The timer registered under the given name.
""" """
return self._timers[name] return self._timers[name]
def reset(self, name=None): def reset(self, name=None):
"""Reset timers. """Reset timers.
:param name: If name is designated, the named timer will be reset and others will not, defaults to None Args:
:type name: optional name (str, optional): If name is designated, the named timer will be reset
and others will not, defaults to None.
""" """
if self._on: if self._on:
if name is not None: if name is not None:
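Finally, a usage sketch of MultiTimer tying the methods above together; the import is assumed to sit next to Timer in colossalai.utils, and the timer names are placeholders.

# Hypothetical sketch: keep independent named timers for different phases of a step.
import time
from colossalai.utils import MultiTimer   # assumed re-export next to Timer

timers = MultiTimer(on=True)

for phase in ('forward', 'backward'):
    timers.start(phase)
    time.sleep(0.005)                         # stand-in for the actual work
    timers.stop(phase, keep_in_history=True)

print(timers.get_timer('forward').get_history_sum())
timers.reset('forward')   # resets only the named timer; reset() with no name clears all of them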