ColossalAI (mirror of https://github.com/hpcaitech/ColossalAI.git)

Commit ec5086c49c: "Refactored docstring to google style" (parent: 53b1b6e340)
Files changed in this commit:

colossalai/
    amp/
    builder/
    communication/
    context/
        config.py, parallel_context.py
        process_group_initializer/
            initializer_1d.py, initializer_2d.py, initializer_2p5d.py, initializer_3d.py,
            initializer_data.py, initializer_model.py, initializer_pipeline.py,
            initializer_sequence.py, initializer_tensor.py, process_group_initializer.py
        random/
    engine/
    initialize.py
    logging/
    nn/
        init.py
        layer/
            colossalai_layer/, moe/, parallel_1d/, parallel_2d/, parallel_2p5d/,
            parallel_3d/, parallel_sequence/, utils/, vanilla/, wrapper/
        loss/
        lr_scheduler/
        metric/
    registry/
    trainer/
    utils/
        activation_checkpoint.py, checkpointing.py, common.py
        data_sampler/, gradient_accumulation/, memory_utils/
        moe.py, multi_tensor_apply/, tensor_detector/
        timer.py
tests/test_moe/
@@ -12,21 +12,27 @@ from .naive_amp import convert_to_naive_amp


 def convert_to_amp(model: nn.Module, optimizer: Optimizer, criterion: _Loss, mode: AMP_TYPE, amp_config: Config = None):
-    """A helper function to wrap training components with Torch AMP modules
+    """A helper function to wrap training components with Torch AMP modules.

-    :param model: your model object
-    :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer object
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :param criterion: your loss function object
-    :type criterion: :class:`torch.nn.modules.loss._Loss`
-    :param mode: amp mode
-    :type mode: :class:`colossalai.amp.AMP_TYPE`
-    :param amp_config: configuration for different amp modes
-    :type amp_config: :class:`colossalai.context.Config` or dict
+    Args:
+        model (:class:`torch.nn.Module`): your model object.
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
+        criterion (:class:`torch.nn.modules.loss._Loss`): your loss function object.
+        mode (:class:`colossalai.amp.AMP_TYPE`): AMP mode.
+        amp_config (:class:`colossalai.context.Config` or dict): configuration for different AMP modes.

-    :return: (model, optimizer, criterion)
-    :rtype: Tuple
+    Returns:
+        A tuple (model, optimizer, criterion).
+
+    Note:
+        ``amp_config`` may vary depending on the mode you choose. You should check the corresponding AMP mode
+        for more details about ``amp_config``.
+        For ``apex_amp``, please check
+        `apex_amp config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
+        For ``naive_amp``, please check
+        `naive_amp config <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/amp/naive_amp/_fp16_optimizer.py#L42>`_.
+        For ``torch_amp``, please check
+        `torch_amp config <https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py#L97>`_.
     """
     assert isinstance(mode, AMP_TYPE), \
         f'expected the argument mode be AMP_TYPE, but got {type(mode)}'
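For readers of this diff, a minimal usage sketch of the helper documented above (hedged: it assumes the distributed context has already been set up via ``colossalai.launch``; the model, optimizer and config values are illustrative only):

    import torch
    import torch.nn as nn
    from colossalai.amp import AMP_TYPE, convert_to_amp

    model = nn.Linear(16, 4).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
    criterion = nn.CrossEntropyLoss()

    # Wrap the components with the Torch AMP backend; amp_config is optional
    # and is forwarded to the underlying gradient scaler.
    model, optimizer, criterion = convert_to_amp(model, optimizer, criterion,
                                                 mode=AMP_TYPE.TORCH,
                                                 amp_config=dict(init_scale=2.**16))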
@@ -4,17 +4,33 @@ from torch.optim import Optimizer


 def convert_to_apex_amp(model: nn.Module, optimizer: Optimizer, amp_config):
-    """A helper function to wrap training components with Apex AMP modules
+    r"""A helper function to wrap training components with Apex AMP modules

-    :param model: your model object
-    :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer object
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :param amp_config: configuration for nvidia apex
-    :type amp_config: :class:`colossalai.context.Config` or dict
+    Args:
+        model (:class:`torch.nn.Module`): your model object.
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
+        amp_config (:class:`colossalai.context.Config` or dict): configuration for initializing apex_amp.

-    :return: (model, optimizer)
-    :rtype: Tuple
+    The ``amp_config`` should include parameters below:
+    ::
+
+        enabled (bool, optional, default=True)
+        opt_level (str, optional, default="O1")
+        cast_model_type (``torch.dtype``, optional, default=None)
+        patch_torch_functions (bool, optional, default=None)
+        keep_batchnorm_fp32 (bool or str, optional, default=None)
+        master_weights (bool, optional, default=None)
+        loss_scale (float or str, optional, default=None)
+        cast_model_outputs (torch.dtype, optional, default=None)
+        num_losses (int, optional, default=1)
+        verbosity (int, default=1)
+        min_loss_scale (float, default=None)
+        max_loss_scale (float, default=2.**24)
+
+    Returns:
+        A tuple (model, optimizer).
+
+    For more details about ``amp_config``, refer to `amp_config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
     """
     import apex.amp as apex_amp
     model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)
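A hedged sketch of an Apex ``amp_config``, expressed as the ``fp16`` section of a ColossalAI config file (the keys mirror the list above and the values are illustrative; the ``fp16`` section name follows common ColossalAI usage and is an assumption here):

    from colossalai.amp import AMP_TYPE

    # O2 keeps batchnorm and master weights in fp32 while casting most ops to fp16.
    fp16 = dict(
        mode=AMP_TYPE.APEX,
        opt_level='O2',
        keep_batchnorm_fp32=True,
        loss_scale='dynamic',
    )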
@@ -21,8 +21,8 @@ class ApexAMPOptimizer(ColossalaiOptimizer):
     def backward(self, loss: Tensor):
         """Backward pass to get all gradients

-        :param loss: Loss computed by a loss function
-        :type loss: torch.Tensor
+        Args:
+            loss (torch.Tensor): Loss computed by a loss function
         """
         with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
             scaled_loss.backward()

@@ -30,10 +30,9 @@ class ApexAMPOptimizer(ColossalaiOptimizer):
     def clip_grad_norm(self, model: nn.Module, max_norm: float):
         """Clip gradients' norm

-        :param model: Your model object
-        :type model: torch.nn.Module
-        :param max_norm: The max norm value for gradient clipping
-        :type max_norm: float
+        Args:
+            model (torch.nn.Module): Your model object
+            max_norm (float): The max norm value for gradient clipping
         """
         if max_norm > 0:
             clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)
@@ -4,20 +4,30 @@ from torch.optim import Optimizer
 from colossalai.utils import is_no_pp_or_last_stage
 from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
 from .grad_scaler import DynamicGradScaler, ConstantGradScaler
 from ._fp16_optimizer import FP16Optimizer


 def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
-    """A helper function to wrap training components with naive AMP modules
+    """A helper function to wrap training components with naive AMP modules. In this mode,
+    we forcibly cast the model weights and inputs to FP16, and cast the model outputs to FP32 to calculate loss,
+    which is equivalent to Apex O3.

-    :param model: your model object
-    :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer object
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :param amp_config: configuration for naive mode amp
-    :type amp_config: :class:`colossalai.context.Config` or dict
+    Args:
+        model (:class:`torch.nn.Module`): your model object
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object
+        amp_config (:class:`colossalai.context.Config` or dict): configuration for naive mode amp.

-    :return: (model, optimizer)
-    :rtype: Tuple
+    The ``amp_config`` should contain parameters below:
+    ::
+
+        verbose (bool, optional): if set to `True`, will print debug info (Default: False).
+        clip_grad_norm (float, optional): clip gradients with this global L2 norm (Default 0).
+            Note that clipping is ignored if clip_grad == 0.
+        dynamic_grad_scale (bool): whether to use dynamic grad scaler.
+
+    Returns:
+        A tuple (model, optimizer)
     """
     if isinstance(model, nn.ModuleList):
         # interleaved pipeline

@@ -46,4 +56,4 @@ def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
     return model, optimizer


-__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer']
+__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer', 'FP16Optimizer']
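A similar hedged sketch for the naive backend, using the three parameters documented above (the ``fp16`` config-section convention is an assumption borrowed from common ColossalAI usage):

    from colossalai.amp import AMP_TYPE

    fp16 = dict(
        mode=AMP_TYPE.NAIVE,
        clip_grad_norm=1.0,        # 0 disables clipping
        dynamic_grad_scale=True,   # use the dynamic grad scaler
        verbose=False,
    )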
@@ -41,25 +41,14 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):

 class FP16Optimizer(Optimizer):
     """Float16 optimizer for fp16 and bf16 data types.

-    :param optimizer: base optimizer such as Adam or SGD
-    :type optimizer: torch.optim.Optimizer
-    :param clip_grad: clip gradeints with this global L2 norm. Note that clipping is ignored if clip_grad == 0
-    :type param clip_grad: float
-    :param log_num_zeros_in_grad: return number of zeros in the gradients.
-    :type log_num_zeros_in_grad: bool
-    :param initial_scale: initial scale of gradient scaler
-    :type initial_scale: int
-    :param growth_factor: the growth rate of loss scale
-    :type growth_factor: int
-    :param backoff_factor: the decrease rate of loss scale
-    :type backoff_factor: float
-    :param hysterisis: delay shift in dynamic loss scaling
-    :type hysterisis: int
-    :param max_scale: maximum loss scale allowed
-    :type max_scale: int
-    :param verbose: if set to `True`, will print debug info
-    :type verbose: bool
-
+    Args:
+        optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD
+        grad_scaler (BaseGradScaler): gradient scaler, chosen from ``constant_grad_scaler`` or
+            ``dynamic_grad_scaler``.
+        clip_grad_norm (float, optional): clip gradients with this global L2 norm. Default 0.
+            Note that clipping is ignored if clip_grad == 0
+        verbose (bool, optional): if set to `True`, will print debug info. Default False.
     """

     def __init__(self,
@@ -18,11 +18,15 @@ from ._fp16_optimizer import FP16Optimizer
 class NaiveAMPOptimizer(ColossalaiOptimizer):
     """A wrapper class for optimizer to cast all parameters to fp16

-    :param optim: A normal optimizer like Adam or SGD
-    :param args: Args used to initialize FP16 optimizer
-    :param kwargs: Kwargs used to initialize FP16 optimizer
+    Args:
+        optim (torch.optim.Optimizer): A normal optimizer like Adam or SGD.
+        grad_scaler (BaseGradScaler): gradient scaler, chosen from ``constant_grad_scaler`` or
+            ``dynamic_grad_scaler``.
+        clip_grad_norm (float, optional): clip gradients with this global L2 norm. Default 0.
+        verbose (bool, optional): if set to `True`, will print debug info. Default False.

-    :type optim: torch.optim.Optimizer
+    Note:
+        clipping is ignored if ``clip_grad_norm`` equals 0.
     """

     def __init__(self, optim: Optimizer, *args, **kwargs):

@@ -40,8 +44,19 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):


 class NaiveAMPModel(nn.Module):
-    """A wrapper class for model to cast the model into fp16 and
+    r"""A wrapper class for model to cast the model into fp16 and
     automatically cast the input and output

+    Args:
+        model (torch.nn.Module): torch.nn.Module to be wrapped.
+        output_to_fp32 (bool, optional): Whether cast output of this module into fp32. (Default: True)
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this module.
+            (Default: ``ParallelMode.DATA``)
+        sync_buffer (bool, optional): whether to synchronize buffer. (Default: True)
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
     """

     def __init__(self,
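A short sketch of wrapping a module directly with ``NaiveAMPModel`` using the arguments listed above (hedged: it assumes the data-parallel group has already been initialized, since buffers are synchronized over ``parallel_mode``; the module itself is illustrative):

    import torch.nn as nn
    from colossalai.amp.naive_amp import NaiveAMPModel
    from colossalai.context import ParallelMode

    model = nn.Linear(16, 4).cuda()
    # Weights and inputs run in fp16; outputs are cast back to fp32 for the loss.
    model = NaiveAMPModel(model,
                          output_to_fp32=True,
                          parallel_mode=ParallelMode.DATA,
                          sync_buffer=True)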
@@ -10,18 +10,25 @@ def convert_to_torch_amp(model: nn.Module,
                          optimizer: Optimizer,
                          criterion: Optional[_Loss] = None,
                          amp_config: Optional[Config] = None):
-    """A helper function to wrap training components with Torch AMP modules
+    """A helper function to wrap training components with PyTorch AMP modules

-    :param model: your model object
-    :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer object
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :param criterion: your loss function object
-    :type criterion: :class:`torch.nn.modules.loss._Loss`, optional
-    :param amp_config: configuration for different amp modes
-    :type amp_config: :class:`colossalai.context.Config` or dict, optional
-    :return: (model, optimizer, criterion)
-    :rtype: Tuple
+    Args:
+        model (:class:`torch.nn.Module`): your model object.
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object
+        criterion (:class:`torch.nn.modules.loss._Loss`, optional): your loss function object
+        amp_config (:class:`colossalai.context.Config` or dict, optional): configuration for PyTorch AMP.
+
+    The ``amp_config`` should include parameters below:
+    ::
+
+        init_scale (float, optional, default=2.**16)
+        growth_factor (float, optional, default=2.0)
+        backoff_factor (float, optional, default=0.5)
+        growth_interval (int, optional, default=2000)
+        enabled (bool, optional, default=True)
+
+    Returns:
+        A tuple (model, optimizer, criterion)
     """
     model = TorchAMPModel(model)
     if amp_config is None:
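The ``amp_config`` for the PyTorch backend is just the ``GradScaler`` arguments listed above; a hedged example config section (values shown are the defaults, written out explicitly; the ``fp16`` section name is an assumption):

    from colossalai.amp import AMP_TYPE

    fp16 = dict(
        mode=AMP_TYPE.TORCH,
        init_scale=2.**16,
        growth_factor=2.0,
        backoff_factor=0.5,
        growth_interval=2000,
        enabled=True,
    )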
@@ -14,13 +14,19 @@ from colossalai.utils import clip_grad_norm_fp32


 class TorchAMPOptimizer(ColossalaiOptimizer):
-    """A wrapper class which integrate pytorch amp with an optimizer
+    """A wrapper class which integrates PyTorch AMP with an optimizer

-    :param optim: A normal optimizer like Adam or SGD
-    :param args: Args used to initialize gradient scaler
-    :param kwargs: Kwargs used to initialize gradient scaler
-
-    :type optim: torch.optim.Optimizer
+    Args:
+        optim (torch.optim.Optimizer): A normal optimizer like Adam or SGD.
+        init_scale (float, optional, default=2.**16): Initial scale factor.
+        growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied during
+            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
+        backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied during
+            :meth:`update` if inf/NaN gradients occur in an iteration.
+        growth_interval (int, optional, default=2000): Number of consecutive iterations without inf/NaN gradients
+            that must occur for the scale to be multiplied by ``growth_factor``.
+        enabled (bool, optional, default=True): If ``False``, disables gradient scaling. :meth:`step` simply
+            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
     """

     def __init__(self, optim: Optimizer, *args, **kwargs):

@@ -30,8 +36,8 @@ class TorchAMPOptimizer(ColossalaiOptimizer):
     def backward(self, loss: Tensor):
         """Backward with torch amp gradient scaler

-        :param loss: Loss computed by a loss function
-        :type loss: torch.Tensor
+        Args:
+            loss (torch.Tensor): Loss computed by a loss function
         """
         self.scaler.scale(loss).backward()

@@ -44,10 +50,9 @@ class TorchAMPOptimizer(ColossalaiOptimizer):
     def clip_grad_norm(self, model: nn.Module, max_norm: float):
         """Apply gradient clipping to the model parameters

-        :param model: Your model object
-        :type model: torch.nn.Module
-        :param max_norm: Max norm value for gradient clipping
-        :type max_norm: float
+        Args:
+            model (torch.nn.Module): Your model object
+            max_norm (float): Max norm value for gradient clipping
         """
         if max_norm > 0.0:
             self.scaler.unscale_(self.optim)

@@ -71,8 +76,8 @@ class TorchAMPModel(nn.Module):
 class TorchAMPLoss(nn.Module):
     """A wrapper class for a criterion object which computes the loss in mixed-precision context

-    :param loss: A loss function object
-    :type loss: torch.nn.modules.loss._Loss
+    Args:
+        loss (torch.nn.modules.loss._Loss): A loss function object
     """

     def __init__(self, loss: _Loss):
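A hedged sketch of a single training step with the wrapped components returned by ``convert_to_torch_amp`` (the dataloader and max_norm value are illustrative; ``step()`` is assumed to perform the scaler step and update inside the wrapped optimizer):

    for data, target in train_dataloader:
        optimizer.zero_grad()
        output = model(data.cuda())
        loss = criterion(output, target.cuda())
        optimizer.backward(loss)                       # scales the loss, then backward
        optimizer.clip_grad_norm(model, max_norm=1.0)  # unscales, then clips
        optimizer.step()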
@@ -10,34 +10,40 @@ from colossalai.registry import *
 def build_from_config(module, config: dict):
     """Returns an object of :class:`module` constructed from `config`.

-    :param module: A python or user-defined class
-    :type module: class
-    :param config: A python dict containing information used in the construction
-        of the return object
-    :type config: dict
-    :raises AssertionError: Raises an AssertionError if `module` is not a class
-    :return: An object of interest
-    :rtype: Object
+    Args:
+        module: A python or user-defined class
+        config: A python dict containing information used in the construction of the return object
+
+    Returns: An ``object`` of interest
+
+    Raises:
+        AssertionError: Raises an AssertionError if `module` is not a class
     """
     assert inspect.isclass(module), 'module must be a class'
     return module(**config)


 def build_from_registry(config, registry: Registry):
-    """Returns an object constructed from `config`, the type of the object
+    r"""Returns an object constructed from `config`, the type of the object
     is specified by `registry`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.colossalai.context.Config`
-    :param registry: A registry specifying the type of the return object
-    :type registry: :class:`Registry`
-    :raises AssertionError: Raises an AssertionError if `registry` is not an object
-        of :class:`Registry` or `mod_type` in `config` is not found in `registry`
-    :raises Exception: Raises an Exception if an error occurred when building
-        from registry
-    :return: An object specified by `registry`
-    :rtype: Python object specified by `registry`
+    Note:
+        the `config` is used to construct the return object such as `LAYERS`,
+        `OPTIMIZERS` and other supported types in `registry`. The `config` should contain
+        all required parameters of the corresponding object. The details of supported
+        types in `registry` and the `mod_type` in `config` could be found in
+        `registry <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/registry/__init__.py>`_.
+
+    Args:
+        config (dict or :class:`colossalai.context.Config`): information
+            used in the construction of the return object.
+        registry (:class:`Registry`): A registry specifying the type of the return object
+
+    Returns: A Python object specified by `registry`
+
+    Raises:
+        Exception: Raises an Exception if an error occurred when building from registry
     """
     config_ = config.copy()  # keep the original config untouched
     assert isinstance(
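A brief sketch of registry-driven construction (hedged: it assumes the helpers are re-exported from ``colossalai.builder``, that the config's ``type`` key is the ``mod_type`` referred to above, and that ``Linear`` is a registered layer name):

    from colossalai.builder import build_from_registry
    from colossalai.registry import LAYERS

    # `type` selects the registered class; the remaining keys go to its constructor.
    layer_cfg = dict(type='Linear', in_features=256, out_features=128)
    layer = build_from_registry(layer_cfg, LAYERS)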
@@ -60,11 +66,13 @@ def build_from_registry(config, registry: Registry):
 def build_layer(config):
     """Returns a layer object of :class:`nn.Module` constructed from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.nn.Module`
-    :rtype: :class:`torch.nn.Module`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``LAYERS``.
+
+    Returns:
+        An object of :class:`torch.nn.Module`
     """
     return build_from_registry(config, LAYERS)

@@ -73,11 +81,13 @@ def build_loss(config):
     """Returns a loss function object of :class:`torch.autograd.Function` constructed
     from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.nn.modules.loss._Loss`
-    :rtype: :class:`torch.nn.modules.loss._Loss`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``LOSSES``.
+
+    Returns:
+        An object of :class:`torch.nn.modules.loss._Loss`
     """
     return build_from_registry(config, LOSSES)

@@ -85,11 +95,13 @@ def build_model(config):
     """Returns a model object of :class:`nn.Module` constructed from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.nn.Module`
-    :rtype: :class:`torch.nn.Module`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``MODELS``.
+
+    Returns:
+        An object of :class:`torch.nn.Module`
     """
     return build_from_registry(config, MODELS)

@@ -98,11 +110,13 @@ def build_dataset(config):
     """Returns a dataset object of :class:`torch.utils.data.Dataset` constructed
     from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.utils.data.Dataset`
-    :rtype: :class:`torch.utils.data.Dataset`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``DATASETS``.
+
+    Returns:
+        An object of :class:`torch.utils.data.Dataset`
     """
     return build_from_registry(config, DATASETS)
@@ -111,13 +125,14 @@ def build_optimizer(config, model):
     """Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`,
     'model' and 'params'.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param model: A model containing parameters for the optimizer
-    :type model: :class:`nn.Module`
-    :return: An object of :class:`torch.optim.Optimizer`
-    :rtype: :class:`torch.optim.Optimizer`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``OPTIMIZERS``.
+        model (:class:`nn.Module`): A model containing parameters for the optimizer
+
+    Returns:
+        An object of :class:`torch.optim.Optimizer`
     """
     config_ = config.copy()
     config_['params'] = model.parameters()
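``build_optimizer`` follows the same pattern, injecting ``model.parameters()`` as ``params``; a hedged example (``Adam`` is assumed to be a name registered in ``OPTIMIZERS``):

    import torch.nn as nn
    from colossalai.builder import build_optimizer

    model = nn.Linear(32, 32)
    optimizer = build_optimizer(dict(type='Adam', lr=1e-3, weight_decay=0.0), model)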
@@ -128,15 +143,15 @@ def build_gradient_handler(config, model, optimizer):
     """Returns a gradient handler object of :class:`BaseGradientHandler` constructed from `config`,
     `model` and `optimizer`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param model: A model containing parameters for the gradient handler
-    :type model: :class:`nn.Module`
-    :param optimizer: An optimizer object containing parameters for the gradient handler
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :return: An object of :class:`colossalai.engine.BaseGradientHandler`
-    :rtype: :class:`colossalai.engine.BaseGradientHandler`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``GRADIENT_HANDLER``.
+        model (:class:`nn.Module`): A model containing parameters for the gradient handler
+        optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing parameters for the gradient handler
+
+    Returns:
+        An object of :class:`colossalai.engine.BaseGradientHandler`
     """
     config_ = config.copy()
     config_['model'] = model

@@ -147,13 +162,13 @@ def build_gradient_handler(config, model, optimizer):
 def build_hooks(config, trainer):
     """Returns a hook object of :class:`BaseHook` constructed from `config` and `trainer`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param trainer: A :class:`Trainer` object containing parameters for the hook
-    :type trainer: :class:`Trainer`
-    :return: An object of :class:`colossalai.trainer.hooks.BaseHook`
-    :rtype: :class:`colossalai.trainer.hooks.BaseHook`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``HOOKS``.
+
+    Returns:
+        An object of :class:`colossalai.trainer.hooks.BaseHook`
     """
     config_ = config.copy()
     config_['trainer'] = trainer

@@ -163,11 +178,13 @@ def build_hooks(config, trainer):
 def build_ophooks(config):
     """Returns a hook object of :class:`BaseOpHook` constructed from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`colossalai.trainer.hooks.BaseOpHook`
-    :rtype: :class:`colossalai.trainer.hooks.BaseOpHook`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``OPHOOKS``.
+
+    Returns:
+        An object of :class:`colossalai.trainer.hooks.BaseOpHook`
     """
     config_ = config.copy()
     return build_from_registry(config_, OPHOOKS)

@@ -177,11 +194,13 @@ def build_transform(config):
     """Returns a transformation object of :class:`torchvision.transforms` constructed
     from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torchvision.transforms`
-    :rtype: :class:`torchvision.transforms`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``TRANSFORMS``.
+
+    Returns:
+        An object of :class:`torchvision.transforms`
     """
     return build_from_registry(config, TRANSFORMS)

@@ -190,14 +209,15 @@ def build_data_sampler(config, dataset):
     """Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler`
     constructed from `config`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param dataset: An object of :class:`torch.utils.data.Dataset` containing information
-        used in the construction of the return object
-    :type dataset: :class:`torch.utils.data.Dataset`
-    :return: An object of :class:`colossalai.utils.data_sampler.BaseSampler`
-    :rtype: :class:`colossalai.utils.data_sampler.BaseSampler`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``DATA_SAMPLERS``.
+        dataset (:class:`torch.utils.data.Dataset`): An object of
+            :class:`torch.utils.data.Dataset` containing information
+            used in the construction of the return object
+
+    Returns:
+        An object of :class:`colossalai.utils.data_sampler.BaseSampler`
     """
     config_ = config.copy()
     config_['dataset'] = dataset

@@ -208,14 +228,15 @@ def build_lr_scheduler(config, optimizer):
     """Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
     constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param optimizer: An optimizer object containing parameters for the learning rate
-        scheduler
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :return: An object of :class:`torch.optim.lr_scheduler`
-    :rtype: :class:`torch.optim.lr_scheduler`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``lr_schedule``.
+        optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing
+            parameters for the learning rate scheduler.
+
+    Returns:
+        An object of :class:`torch.optim.lr_scheduler`
     """
     config_ = config.copy()
     config_['optimizer'] = optimizer

@@ -225,10 +246,12 @@ def build_schedule(config):
     """Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`.

-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`colossalai.engine.schedule.BaseSchedule`
-    :rtype: :class:`colossalai.engine.schedule.BaseSchedule`
+    Args:
+        config (dict or :class:`colossalai.context.Config`): A python dict or
+            a :class:`colossalai.context.Config` object containing information
+            used in the construction of the ``Schedule``.
+
+    Returns:
+        An object of :class:`colossalai.engine.schedule.BaseSchedule`
     """
     return build_from_registry(config, SCHEDULE)
@@ -13,14 +13,13 @@ def _binary_partition(weights, st, ed):
     """Returns the binary partition position of `weights`, given the start
     position `st` and the end position `ed`.

-    :param weights: A python list to be binary partitioned
-    :type weights: list
-    :param st: the start position of the binary partition
-    :type st: int
-    :param ed: the end postition of the binary partition
-    :type ed: int
-    :return: the binary partition position of `weights`
-    :rtype: int
+    Args:
+        weights (list): A python list to be binary partitioned
+        st (int): the start position of the binary partition
+        ed (int): the end position of the binary partition
+
+    Returns:
+        int: the binary partition position of `weights`
     """
     w_sum = weights[ed - 1]
     prefix = 0

@@ -176,16 +175,13 @@ def build_pipeline_model_from_cfg(config, num_chunks: int = 1, partition_method:
             ...
         )

-    :param config: Configuration of the model
-    :type config: dict
-    :param num_chunks: The number of chunks you want to have on the current stage. This value should be 1
-        in most cases unless you are using virutal pipeline parallelism.
-    :type num_chunks: int, optional
-    :param partition_method: This parameter determines how you want to split your model layers into stages,
-        you can set it as 'layer' or 'parameter'
-    :type partition_method: str, optional
-    :param verbose: Whether to print the logs
-    :type verbose: bool, optional
+    Args:
+        config (dict): Configuration of the model.
+        num_chunks (int, optional): The number of chunks you want to have on the current stage.
+            This value should be 1 in most cases unless you are using virtual pipeline parallelism.
+        partition_method (str, optional): This parameter determines how you want to split your model
+            layers into stages, you can set it as 'layer' or 'parameter'.
+        verbose (bool, optional): Whether to print the logs.
     """
     ori_model = build_model(config)
     layers = ori_model.layers_cfg

@@ -240,13 +236,11 @@ def build_pipeline_model(layers: nn.Sequential, num_chunks: int = 1, verbose: bo
     """An intializer to split the model into different stages for pipeline parallelism.
     Note that `layer` must be `torch.nn.Sequential`.

-    :param layers: Layers of model
-    :type layers: `torch.nn.Sequential`
-    :param num_chunks: The number of chunks you want to have on the current stage. This value should be 1
-        in most cases unless you are using virutal pipeline parallelism.
-    :type num_chunks: int, optional
-    :param verbose: Whether to print the logs
-    :type verbose: bool, optional
+    Args:
+        layers (`torch.nn.Sequential`): Layers of model
+        num_chunks (int, optional): The number of chunks you want to have on the current stage. This value should be 1
+            in most cases unless you are using virtual pipeline parallelism.
+        verbose (bool, optional): Whether to print the logs.
     """
     pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
     pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
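A minimal sketch of the second initializer above (hedged: it assumes the pipeline parallel context is already initialized and that the helper is re-exported from ``colossalai.builder``; the layers are illustrative):

    import torch.nn as nn
    from colossalai.builder import build_pipeline_model

    # Keep only the slice of the sequential model owned by this pipeline stage.
    layers = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 10))
    model = build_pipeline_model(layers, num_chunks=1, verbose=True)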
@@ -12,21 +12,22 @@ from colossalai.utils import get_current_device


 def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op: bool = False) -> Tensor:
-    """Gathers all tensors from the parallel group and concatenates them in a
+    r"""Gathers all tensors from the parallel group and concatenates them in a
     specific dimension.

-    :param tensor: Tensor to be gathered
-    :param dim: The dimension concatenating in
-    :param parallel_mode: Parallel group mode used in this communication
-    :param async_op: Whether operations are asynchronous
-
-    :type tensor: :class:`torch.Tensor`
-    :type dim: int
-    :type parallel_mode: :class:`colossalai.context.ParallelMode`
-    :type async_op: bool, optional
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

-    :return: The tensor generated by all-gather
-    :rtype: :class:`torch.Tensor`
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be gathered.
+        dim (int): The dimension concatenating in.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-gather only,
+        if async_op is set to False. A tuple of output of all-gather and Async work handle, if async_op is set to True.
     """
     depth = gpc.get_world_size(parallel_mode)
     if depth == 1:
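A usage sketch for ``all_gather`` (hedged: it assumes the process groups were created by ``colossalai.launch`` and that the op is re-exported from ``colossalai.communication``):

    import torch
    from colossalai.communication import all_gather
    from colossalai.context import ParallelMode
    from colossalai.core import global_context as gpc
    from colossalai.utils import get_current_device

    # Gather a local tensor along dim 0 from every rank in the data-parallel group.
    x = torch.randn(4, 8, device=get_current_device())
    gathered = all_gather(x, dim=0, parallel_mode=ParallelMode.DATA)
    # gathered.size(0) == 4 * gpc.get_world_size(ParallelMode.DATA)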
@@ -54,23 +55,26 @@ def reduce_scatter(tensor: Tensor,
                    parallel_mode: ParallelMode,
                    op: ReduceOp = ReduceOp.SUM,
                    async_op: bool = False) -> Tensor:
-    """Reduces all tensors then scatters it in a specific dimension to all
+    r"""Reduces all tensors then scatters it in a specific dimension to all
     members in the parallel group.

-    :param tensor: Tensor to be reduced and scattered
-    :param dim: The dimension scattering in
-    :param parallel_mode: Parallel group mode used in this communication
-    :param op: The type of reduce operation
-    :param async_op: Whether operations are asynchronous
-
-    :type tensor: :class:`torch.Tensor`
-    :type dim: int
-    :type parallel_mode: :class:`colossalai.context.ParallelMode`
-    :type op: ReduceOp, optional
-    :type async_op: bool, optional
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

-    :return: The tensor generated by reduce-scatter
-    :rtype: :class:`Tensor`
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be reduce_scattered.
+        dim (int): The dimension scattering in.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
+            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
+            For more details about ReduceOp, please refer to
+            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce_scatter only,
+        if async_op is set to False. A tuple of output of reduce_scatter and Async work handle, if async_op is set to True.
     """
     depth = gpc.get_world_size(parallel_mode)
     if depth == 1:
@@ -94,6 +98,25 @@ def all_reduce(tensor: Tensor,
                parallel_mode: ParallelMode,
                op: ReduceOp = ReduceOp.SUM,
                async_op: bool = False) -> Tensor:
+    r"""Reduces the tensor data across whole parallel group in such a way that all get the final result.
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be all-reduced.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
+            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
+            For more details about ReduceOp, please refer to
+            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-reduce only,
+        if async_op is set to False. A tuple of output of all-reduce and Async work handle, if async_op is set to True.
+    """
     depth = gpc.get_world_size(parallel_mode)
     if depth == 1:
         out = tensor

@@ -108,6 +131,23 @@ def all_reduce(tensor: Tensor,


 def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: bool = False):
+    r"""Broadcast tensors to whole parallel group. Tensor must have the same
+    number of elements in all processes participating in the collective.
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be broadcast.
+        src (int): Source rank.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The broadcast tensor only,
+        if async_op is set to False. A tuple of the broadcast tensor and Async work handle, if async_op is set to True.
+    """
     depth = gpc.get_world_size(parallel_mode)
     if depth == 1:
         out = tensor

@@ -122,6 +162,23 @@ def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: b


 def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp = ReduceOp.SUM, async_op: bool = False):
+    r"""Reduce tensors across whole parallel group. Only the process with
+    rank ``dst`` is going to receive the final result.
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be reduced.
+        dst (int): Destination rank.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce only,
+        if async_op is set to False. A tuple of output of reduce and Async work handle, if async_op is set to True.
+    """
     depth = gpc.get_world_size(parallel_mode)
     if depth == 1:
         out = tensor
@@ -19,12 +19,12 @@ TensorShape = Union[torch.Size, List[int], Tuple[int]]
 def _get_tensor_shape(tensor_shape: TensorShape, chunk_tensor: bool = False) -> Tuple[TensorShape, bool]:
     """get the exact tensor shape when communicating and return whether the tensor is a chunk

-    :param tensor_shape: shape of tensor
-    :type tensor_shape: TensorShape
-    :param chunk_tensor: whether to chunk tensor, defaults to False
-    :type chunk_tensor: bool, optional
-    :return: exact tensor shape, whether to chunk tensor
-    :rtype: Tuple[Union[torch.Size, List[int], Tuple[int]], bool]
+    Args:
+        tensor_shape (:class:`torch.Size`): shape of tensor
+        chunk_tensor (bool, optional): whether to chunk tensor, defaults to False
+
+    Returns:
+        Tuple[Union[torch.Size, List[int], Tuple[int]], bool]: exact tensor shape, whether to chunk tensor
     """
     if chunk_tensor:
         tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
@@ -134,14 +134,14 @@ def _communicate(tensor_send_next=None,


 def recv_forward(input_tensor_shape, prev_rank=None, dtype=torch.float, scatter_gather_tensors=False):
-    """Receives the input tensor from the previous member in pipeline.
+    """Copy the forward output from the previous stage in pipeline as the input tensor of this stage.

-    :param input_tensor_shape: The shape of the tensor to be recieved
-    :param prev_rank: The rank of the source of the tensor
-    :type input_tensor_shape: torch.Size
-    :type prev_rank: int, optional
-    :return: The input tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.
+        prev_rank (int, optional): The rank of the source of the tensor.
+
+    Returns:
+        :class:`torch.Tensor`: The input tensor.
     """
     if gpc.is_pipeline_first_stage():
         input_tensor = None

@@ -155,14 +155,14 @@ def recv_forward(input_tensor_shape, prev_rank=None, dtype=torch.float, scatter_


 def recv_backward(output_grad_shape, next_rank=None, dtype=torch.float, scatter_gather_tensors=False):
-    """Receives the grad tensor from the next member in pipeline.
+    """Copy the gradient tensor from the next stage in pipeline as the input gradient of this stage.

-    :param output_grad_shape: The shape of the tensor to be recieved
-    :param next_rank: The rank of the source of the tensor
-    :type output_grad_shape: torch.Size
-    :type next_rank: int, optional
-    :return: The grad of output tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.
+        next_rank (int, optional): The rank of the source of the tensor.
+
+    Returns:
+        :class:`torch.Tensor`: The input gradient tensor.
     """
     if gpc.is_pipeline_last_stage():
         output_tensor_grad = None

@@ -176,12 +176,11 @@ def recv_backward(output_grad_shape, next_rank=None, dtype=torch.float, scatter_


 def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False):
-    """Sends the input tensor to the next member in pipeline.
+    """Sends the input tensor to the next stage in pipeline.

-    :param output_tensor: Tensor to be sent
-    :param next_rank: The rank of the recipient of the tensor
-    :type output_tensor: :class:`torch.Tensor`
-    :type next_rank: int, optional
+    Args:
+        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
+        next_rank (int, optional): The rank of the recipient of the tensor.
     """
     if not gpc.is_pipeline_last_stage():
         _communicate(tensor_send_next=output_tensor,

@@ -190,12 +189,11 @@ def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False):


 def send_backward(input_tensor_grad, prev_rank=None, scatter_gather_tensors=False):
-    """Sends the grad tensor to the previous member in pipeline.
+    """Sends the gradient tensor to the previous stage in pipeline.

-    :param input_tensor_grad: Tensor to be sent
-    :param prev_rank: The rank of the recipient of the tensor
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type prev_rank: int, optional
+    Args:
+        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent
+        prev_rank (int, optional): The rank of the recipient of the tensor
     """
     if not gpc.is_pipeline_first_stage():
         _communicate(tensor_send_prev=input_tensor_grad,
@@ -210,15 +208,15 @@ def send_forward_recv_backward(output_tensor,
                                dtype=torch.float,
                                scatter_gather_tensors=False):
     """Batched communication operation. Sends the input tensor to the
-    next member in pipeline, while recieves the grad tensor from the
-    next member in pipeline.
+    next stage in pipeline, while receives the gradient tensor from the
+    next stage in pipeline as the input gradient tensor of this stage.

-    :param output_tensor: Tensor to be sent
-    :param output_grad_shape: The shape of the tensor to be recieved
-    :type output_tensor: :class:`torch.Tensor`
-    :type output_grad_shape: :class:`torch.Size`
-    :return: The grad of output tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
+        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.
+
+    Returns:
+        :class:`torch.Tensor`: The input gradient tensor.
     """
     if gpc.is_pipeline_last_stage():
         output_tensor_grad = None

@@ -238,16 +236,16 @@ def send_backward_recv_forward(input_tensor_grad,
                                prev_rank=None,
                                dtype=torch.float,
                                scatter_gather_tensors=False):
-    """Batched communication operation. Sends the grad tensor to the
-    previous member in pipeline, while recieves the input tensor from the
-    previous member in pipeline.
+    """Batched communication operation. Sends the gradient tensor to the
+    previous stage in pipeline, while receives the output tensor from the
+    previous stage in pipeline as the input of this stage.

-    :param input_tensor_grad: Tensor to be sent
-    :param input_tensor_shape: The shape of the tensor to be recieved
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type input_tensor_shape: :class:`torch.Size`
-    :return: The input tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent.
+        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.
+
+    Returns:
+        :class:`torch.Tensor`: The input tensor.
     """
     if gpc.is_pipeline_first_stage():
         input_tensor = None

@@ -269,15 +267,15 @@ def send_forward_recv_forward(output_tensor,
                               dtype=torch.float,
                               scatter_gather_tensors=False):
     """Batched communication operation. Sends the input tensor to the
-    next member in pipeline, while recieves the input tensor from the
-    previous member in pipeline.
+    next stage in pipeline, while receives the output tensor from the
+    previous stage in pipeline as the input of this stage.

-    :param output_tensor: Tensor to be sent
-    :param input_tensor_shape: The shape of the tensor to be recieved
-    :type output_tensor: :class:`torch.Tensor`
-    :type input_tensor_shape: :class:`torch.Size`
-    :return: The input tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        output_tensor (:class:`torch.Tensor`): Tensor to be sent.
+        input_tensor_shape (:class:`torch.Size`): The shape of the tensor to be received.
+
+    Returns:
+        :class:`torch.Tensor`: The input tensor.
     """
     input_tensor, _ = _communicate(tensor_send_next=output_tensor,
                                    recv_prev=recv_prev,

@@ -296,16 +294,16 @@ def send_backward_recv_backward(input_tensor_grad,
                                 next_rank=None,
                                 dtype=torch.float,
                                 scatter_gather_tensors=False):
-    """Batched communication operation. Sends the grad tensor to the
-    previous member in pipeline, while recieves the grad tensor from the
-    next member in pipeline.
+    """Batched communication operation. Sends the gradient tensor to the
+    previous stage in pipeline, while receives the gradient tensor from the
+    next stage in pipeline as the input of this stage.

-    :param input_tensor_grad: Tensor to be sent
-    :param output_grad_shape: The shape of the tensor to be recieved
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type output_grad_shape: :class:`torch.Size`
-    :return: The grad of output tensor in forward step
-    :rtype: :class:`torch.Tensor`
+    Args:
+        input_tensor_grad (:class:`torch.Tensor`): Tensor to be sent.
+        output_grad_shape (:class:`torch.Size`): The shape of the tensor to be received.
+
+    Returns:
+        :class:`torch.Tensor`: The input gradient tensor.
     """
     _, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad,
                                          recv_next=recv_next,

@@ -327,20 +325,18 @@ def send_forward_backward_recv_forward_backward(output_tensor,
                                                 next_rank=None,
                                                 dtype=torch.float,
                                                 scatter_gather_tensors=False):
-    """Batched communication operation. Sends the input tensor to the next and
-    the grad tensor to the previous, while recieves the grad tensor from the
-    next and the input tensor from the previous.
+    """Batched communication operation. Sends the input tensor to the next stage in pipeline and
+    the gradient tensor to the previous stage, while receives the input gradient tensor from the
+    next stage and the input tensor from the previous stage.

-    :param output_tensor: Tensor sent to the next
-    :param input_tensor_grad: Tensor sent to the previous
-    :param input_tensor_shape: The shape of the tensor recieved from the previous
-    :param output_grad_shape: The shape of the tensor recieved from the next
-    :type output_tensor: :class:`torch.Tensor`
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type input_tensor_shape: :class:`torch.Size`
-    :type output_grad_shape: :class:`torch.Size`
-    :return: (the input tensor in forward step, the grad of output tensor in forward step)
-    :rtype: (Tensor, Tensor)
+    Args:
+        output_tensor (:class:`torch.Tensor`): Tensor sent to the next.
+        input_tensor_grad (:class:`torch.Tensor`): Tensor sent to the previous.
+        input_tensor_shape (:class:`torch.Size`): The shape of the tensor received from the previous.
+        output_grad_shape (:class:`torch.Size`): The shape of the tensor received from the next.
+
+    Returns:
+        Tuple(Tensor, Tensor): (the input tensor, the input gradient tensor)
     """
     input_tensor, output_tensor_grad = _communicate(
         tensor_send_next=output_tensor,
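Taken together, these helpers implement the per-stage exchange of a pipeline schedule; a hedged sketch of one forward/backward round on an intermediate stage (shapes are illustrative tuples, which ``TensorShape`` permits, and ``stage_module`` is a hypothetical per-stage module):

    from colossalai.communication import (recv_forward, send_forward,
                                          recv_backward, send_backward)

    input_tensor = recv_forward(input_tensor_shape=(16, 1024))
    input_tensor.requires_grad_()
    output_tensor = stage_module(input_tensor)
    send_forward(output_tensor)

    output_grad = recv_backward(output_grad_shape=(16, 1024))
    output_tensor.backward(output_grad)
    send_backward(input_tensor.grad)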
@@ -9,15 +9,19 @@ from colossalai.utils import get_current_device, synchronize


 def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):
-    """Sends a tensor to the next member and recieves a tensor from the previous member.
-    This function returns the recieved tensor from the previous member.
+    """Sends a tensor to the next member and receives a tensor from the previous member.
+    This function returns the received tensor from the previous member.

-    :param tensor_send_next: Tensor sent to next member
-    :param parallel_mode: Parallel group mode used in this communication
-    :type tensor_send_next: :class:`torch.Tensor`
-    :type parallel_mode: :class:`colossalai.context.ParallelMode`
-    :return: The tensor recieved from the previous
-    :rtype: :class:`torch.Tensor`
+    Args:
+        tensor_send_next: Tensor sent to next member
+        parallel_mode: Parallel group mode used in this communication
+
+    Returns:
+        :class:`torch.Tensor`: The tensor received from the previous.
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
     """
     buffer_shape = tensor_send_next.size()
@ -12,14 +12,13 @@ def send_tensor_meta(tensor, need_meta=True, next_rank=None):
|
||||
meta information of the tensor should be sent before communications. This function
|
||||
synchronizes with :func:`recv_tensor_meta`.
|
||||
|
||||
:param tensor: Tensor to be sent
|
||||
:param need_meta: If False, meta information won't be sent
|
||||
:param next_rank: The rank of the next member in pipeline parallel group
|
||||
:type tensor: Tensor
|
||||
:type need_meta: bool, optional
|
||||
:type next_rank: int
|
||||
:return: False
|
||||
:rtype: bool
|
||||
Args:
|
||||
tensor (torch.Tensor): Tensor to be sent.
|
||||
need_meta (bool, optional): If False, meta information won't be sent.
|
||||
next_rank (int): The rank of the next member in pipeline parallel group.
|
||||
|
||||
Returns:
|
||||
bool: False
|
||||
"""
|
||||
if need_meta:
|
||||
if next_rank is None:
|
||||
@ -36,17 +35,17 @@ def send_tensor_meta(tensor, need_meta=True, next_rank=None):


def recv_tensor_meta(tensor_shape, prev_rank=None):
"""Recieves tensor meta information before recieving a specific tensor.
"""Receives tensor meta information before receiving a specific tensor.
Since the recipient must know the shape of the tensor in p2p communications,
meta information of the tensor should be recieved before communications. This function
meta information of the tensor should be received before communications. This function
synchronizes with :func:`send_tensor_meta`.

:param tensor_shape: The shape of the tensor to be recieved
:param prev_rank: The rank of the source of the tensor
:type tensor_shape: torch.Size
:type prev_rank: int, optional
:return: The shape of the tensor to be recieved
:rtype: torch.Size
Args:
tensor_shape (torch.Size): The shape of the tensor to be received.
prev_rank (int): The rank of the source of the tensor.

Returns:
torch.Size: The shape of the tensor to be received.
"""
if tensor_shape is None:
if prev_rank is None:
@ -67,14 +66,12 @@ def recv_tensor_meta(tensor_shape, prev_rank=None):
def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
"""Break a tensor into equal 1D chunks.

:param tensor: Tensor to be splitted before communication
:param new_buffer: Whether uses a new buffer to store sliced tensor
Args:
tensor (torch.Tensor): Tensor to be split before communication.
new_buffer (bool, optional): Whether to use a new buffer to store sliced tensor.

:type tensor: torch.Tensor
:type new_buffer: bool, optional

:return splitted_tensor: The splitted tensor
:rtype splitted_tensor: torch.Tensor
Returns:
torch.Tensor: The split tensor
"""
partition_size = torch.numel(tensor) // gpc.get_world_size(ParallelMode.PARALLEL_1D)
start_index = partition_size * gpc.get_local_rank(ParallelMode.PARALLEL_1D)
@ -92,11 +89,10 @@ def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
def gather_split_1d_tensor(tensor):
"""Opposite of above function, gather values from model parallel ranks.

:param tensor: Tensor to be gathered after communication
:type tensor: torch.Tensor

:return gathered: The gathered tensor
:rtype gathered: torch.Tensor
Args:
tensor (torch.Tensor): Tensor to be gathered after communication.
Returns:
gathered (torch.Tensor): The gathered tensor
"""
world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
numel = torch.numel(tensor)

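An illustrative round trip (not part of this patch) of the split/gather pair documented above; it assumes a 1D tensor-parallel group has already been initialized and that both helpers live under ``colossalai.communication``:

```python
import torch
# assumed import location of the two helpers shown in the hunk above
from colossalai.communication import split_tensor_into_1d_equal_chunks, gather_split_1d_tensor

full = torch.randn(1024, device='cuda')          # identical tensor on every 1D-parallel rank
shard = split_tensor_into_1d_equal_chunks(full)  # each rank keeps only its own slice
restored = gather_split_1d_tensor(shard)         # all-gather the slices back into one buffer
assert restored.numel() == full.numel()
```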
@ -12,8 +12,8 @@ class Config(dict):
"""This is a wrapper class for dict objects so that values of which can be
accessed as attributes.

:param config: The dict object to be wrapped
:type config: dict
Args:
config (dict): The dict object to be wrapped.
"""

def __init__(self, config: dict = None):
@ -50,12 +50,14 @@ class Config(dict):
def from_file(filename: str):
"""Reads a python file and constructs a corresponding :class:`Config` object.

:param filename: Name of the file to construct the return object
:type filename: str
:raises AssertionError: Raises an AssertionError if the file does not exist, or the file
is not .py file
:return: A :class:`Config` object constructed with information in the file
:rtype: :class:`Config`
Args:
filename (str): Name of the file to construct the return object.

Returns:
:class:`Config`: A :class:`Config` object constructed with information in the file.

Raises:
AssertionError: Raises an AssertionError if the file does not exist, or the file is not .py file
"""

# check config path

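A small sketch (not from the patch) of the ``Config`` behaviour described above; the file name and the ``parallel`` field are hypothetical and depend on what the user's config file defines:

```python
from colossalai.context import Config

# The file must exist and be a .py file, otherwise an AssertionError is raised.
config = Config.from_file('./config.py')

# Values defined in the file become attribute-accessible, e.g. a `parallel` section.
print(config.parallel)
```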
@ -22,6 +22,10 @@ class ParallelContext(metaclass=SingletonMeta):
"""This class provides interface functions for users to get the parallel context,
such as the global rank, the local rank, the world size, etc. of each device.

Note:
The parallel_mode used in this class should be concluded in ``ParallelMode``.
More details about ``ParallelMode`` could be found in
`parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""

def __init__(self):
@ -62,10 +66,12 @@ class ParallelContext(metaclass=SingletonMeta):
def load_config(self, config: Union[dict, str]):
"""Loads the configuration from either a dict or a file.

:param config: Either a dict containing the configuration information or the filename
of a file containing the configuration information
:type config: dict or str
:raises TypeError: Raises a TypeError if `config` is neither a dict or a str
Args:
config (dict or str): Either a dict containing the configuration information or the filename
of a file containing the configuration information.

Raises:
TypeError: Raises a TypeError if `config` is neither a dict nor a str.
"""
if isinstance(config, str):
self._config = Config.from_file(config)
@ -81,20 +87,21 @@ class ParallelContext(metaclass=SingletonMeta):
def get_global_rank(self):
"""Returns the global rank of the current device.

:return: The global rank of the current device
:rtype: int
Returns:
int: The global rank of the current device
"""
return self._global_ranks[ParallelMode.GLOBAL]

def add_global_rank(self, parallel_mode: ParallelMode, rank: int):
"""Adds the global rank of the current device for `parallel_mode` to the context.

:param parallel_mode: The parallel mode for the rank
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param rank: The rank to be added
:type rank: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank.
rank (int): The rank to be added

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
"""
self._check_parallel_mode(parallel_mode)
self._global_ranks[parallel_mode] = rank
@ -102,12 +109,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_local_rank(self, parallel_mode: ParallelMode):
"""Returns the local rank of the current device.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
:return: The local rank of the current device for `parallel_mode`
:rtype: int
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.

Returns:
int: The local rank of the current device for `parallel_mode`.
"""
self._check_parallel_mode(parallel_mode)
return self._local_ranks[parallel_mode]
@ -115,12 +125,13 @@ class ParallelContext(metaclass=SingletonMeta):
def add_local_rank(self, parallel_mode: ParallelMode, rank: int):
"""Adds the local rank of the current device for `parallel_mode` to the context.

:param parallel_mode: The parallel mode for the rank
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param rank: The rank to be added
:type rank: int
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank.
rank (int): The rank to be added.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
"""
self._check_parallel_mode(parallel_mode)
self._local_ranks[parallel_mode] = rank
@ -128,12 +139,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_next_global_rank(self, parallel_mode: ParallelMode):
"""Returns the global rank of the next device.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
:return: The global rank of the next device for `parallel_mode`
:rtype: int
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.

Returns:
int: The global rank of the next device for `parallel_mode`.
"""
self._check_parallel_mode(parallel_mode)

@ -147,12 +161,15 @@ class ParallelContext(metaclass=SingletonMeta):
def get_prev_global_rank(self, parallel_mode: ParallelMode):
"""Returns the global rank of the previous device.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
:return: The global rank of the previous device for `parallel_mode`
:rtype: int
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.

Returns:
int: The global rank of the previous device for `parallel_mode`.
"""
self._check_parallel_mode(parallel_mode)

@ -167,13 +184,16 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
:return: a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`
:rtype: bool
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.

Returns:
bool: a boolean value indicating whether the current device is the first one
among its group for `parallel_mode`.
"""
rank = self.get_local_rank(parallel_mode)
return rank == 0
@ -182,13 +202,16 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns a boolean value indicating whether the current device is the last one
among its group for `parallel_mode`.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
:return: a boolean value indicating whether the current device is the last one
among its group for `parallel_mode`
:rtype: bool
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.

Returns:
bool: a boolean value indicating whether the current device is the last one
among its group for `parallel_mode`.
"""
|
||||
rank = self.get_local_rank(parallel_mode)
|
||||
world_size = self.get_world_size(parallel_mode)
|
||||
@ -210,12 +233,15 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def get_world_size(self, parallel_mode: ParallelMode):
|
||||
"""Returns the world size for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
:return: The world size for `parallel_mode`
|
||||
:rtype: int
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
|
||||
Returns:
|
||||
int: The world size for `parallel_mode`.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
return self._world_sizes[parallel_mode]
|
||||
@ -223,12 +249,13 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def add_world_size(self, parallel_mode: ParallelMode, world_size: int):
|
||||
"""Adds world size for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:param world_size: The world size to be added
|
||||
:type world_size: int
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
world_size (int): The world size to be added
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
self._world_sizes[parallel_mode] = world_size
|
||||
@ -236,12 +263,15 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def get_group(self, parallel_mode: ParallelMode):
|
||||
"""Returns the group of the current device for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
:return: The group of the current device for `parallel_mode`
|
||||
:rtype: torch.distributed.ProcessGroup
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
|
||||
Returns:
|
||||
torch.distributed.ProcessGroup: The group of the current device for `parallel_mode`.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
return self._groups[parallel_mode]
|
||||
@ -249,12 +279,13 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def add_group(self, parallel_mode: ParallelMode, group: dist.ProcessGroup):
|
||||
"""Adds the group of the current device for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:param group: The group to be added
|
||||
:type group: torch.distributed.ProcessGroup
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
group (torch.distributed.ProcessGroup): The group to be added
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
self._groups[parallel_mode] = group
|
||||
@ -262,12 +293,15 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def get_ranks_in_group(self, parallel_mode: ParallelMode):
|
||||
"""Returns the rank of the current device for `parallel_mode` in the group.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
:return: the rank of the current device for `parallel_mode` in the group
|
||||
:rtype: int
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
|
||||
Returns:
|
||||
int: The rank of the current device for `parallel_mode` in the group.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
return self._ranks_in_group[parallel_mode]
|
||||
@ -275,28 +309,26 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def add_ranks_in_group(self, parallel_mode: ParallelMode, ranks: list):
|
||||
"""Adds the ranks of the current device for `parallel_mode` in the group.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:param ranks: List of ranks to be added
|
||||
:type ranks: list
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
ranks (list): List of ranks to be added
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
|
||||
of :class:`colossalai.context.ParallelMode`.
|
||||
"""
|
||||
self._check_parallel_mode(parallel_mode)
|
||||
self._ranks_in_group[parallel_mode] = ranks
|
||||
|
||||
def init_global_dist(self, rank: int, world_size: int, backend: str, host: str, port: int):
|
||||
"""Initializes the global distributed environment
|
||||
:param rank: rank for the default process group
|
||||
:type rank: int
|
||||
:param world_size: world size of the default process group
|
||||
:type world_size: int
|
||||
:param host: the master address for distributed training
|
||||
:type host: str
|
||||
:param port: the master port for distributed training
|
||||
:type port: str
|
||||
:param backend: backend for torch.distributed
|
||||
:type backend: str
|
||||
|
||||
Args:
|
||||
rank (int): rank for the default process group.
|
||||
world_size (int): world size of the default process group.
|
||||
backend (str): backend for ``torch.distributed``
|
||||
host (str): the master address for distributed training.
|
||||
port (str): the master port for distributed training
|
||||
"""
|
||||
# initialize the default process group
|
||||
init_method = f'tcp://{host}:{port}'
|
||||
@ -315,8 +347,9 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def check_sanity(self):
|
||||
"""Checks sanity of the parallel context.
|
||||
|
||||
:raises AssertionError: Raises an AssertionError if the world size does not equal to the product
|
||||
of data paralle size, pipeline parallel size and tensor parallel size
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if the world size does not equal to the product
|
||||
of data parallel size, pipeline parallel size and tensor parallel size.
|
||||
"""
|
||||
dps = self.data_parallel_size
|
||||
pps = self.pipeline_parallel_size
|
||||
@ -341,7 +374,8 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def init_parallel_groups(self):
|
||||
"""Initializes the parallel groups.
|
||||
|
||||
:raises AssertionError: Raises an AssertionError if the field paralle is not present in the config file
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if the field parallel is not present in the config file.
|
||||
"""
|
||||
|
||||
# get rank and world size
|
||||
@ -411,11 +445,11 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
"""Returns a boolean value indicating whether `parallel_mode` is initialized
|
||||
in the current system.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:return: a boolean value indicating whether `parallel_mode` is initialized
|
||||
in the current system
|
||||
:rtype: bool
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Returns:
|
||||
bool: a boolean value indicating whether `parallel_mode` is initialized in the current system.
|
||||
"""
|
||||
return parallel_mode in self._groups
|
||||
|
||||
@ -432,8 +466,8 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def set_device(self, device_ordinal: int = None):
|
||||
"""Sets distributed processes to be bound to devices.
|
||||
|
||||
:param device_ordinal: the device id to be bound to
|
||||
:type device_ordinal: int, optional
|
||||
Args:
|
||||
device_ordinal (int, optional): the device id to be bound to
|
||||
"""
|
||||
global_rank = self.get_global_rank()
|
||||
if device_ordinal is None:
|
||||
@ -447,8 +481,8 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||
def set_seed(self, seed: int):
|
||||
"""Sets seeds for all random libraries.
|
||||
|
||||
:param seed: seed for random states
|
||||
:type seed: int
|
||||
Args:
|
||||
seed (int): seed for random states
|
||||
"""
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
|
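An illustrative sketch (not part of this patch) of querying the global context after initialization; ``gpc`` is the singleton ``ParallelContext`` exposed by the library, and the methods used are the ones documented in the hunks above:

```python
from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode

# Assumes colossalai.launch (or an equivalent entry point) has already set up the groups.
if gpc.is_initialized(ParallelMode.DATA):
    rank = gpc.get_local_rank(ParallelMode.DATA)
    world_size = gpc.get_world_size(ParallelMode.DATA)
    if gpc.is_first_rank(ParallelMode.DATA):
        print(f'data parallel group has {world_size} ranks; this process is rank {rank}')
```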
@ -11,8 +11,16 @@ from .process_group_initializer import ProcessGroupInitializer

@DIST_GROUP_INITIALIZER.register_module
class Initializer_1D(ProcessGroupInitializer):
'''A ProcessGroupInitializer for 1d tensor parallelism.
'''
"""A ProcessGroupInitializer for 1d tensor parallelism.

Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -20,8 +28,10 @@ class Initializer_1D(ProcessGroupInitializer):

def init_dist_group(self):
"""Initialize 1D tensor parallel groups, and assign local_ranks and groups to each gpu.
:return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
:rtype: Tuple

Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
1D tensor parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None

@ -22,12 +22,16 @@ def _check_summa_env_var(summa_dim):

class Initializer_2D_Row(ProcessGroupInitializer):
"""2d tensor parallel initialization among rows.
:param num_group: The number of all tensor groups
:param summa_dim: The dimension of SUMMA
:param args: Args used to initialize base class
:param kwargs: Kwargs used to initialize base class
:type num_group: int
:type summa_dim: int

Args:
num_group (int): The number of all tensor groups.
summa_dim (int): The dimension of SUMMA.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, num_group, summa_dim, *args, **kwargs):
@ -37,9 +41,9 @@ class Initializer_2D_Row(ProcessGroupInitializer):

def init_dist_group(self):
"""Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu.

:return: 2D tensor row parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2D tensor row parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -64,13 +68,15 @@ class Initializer_2D_Row(ProcessGroupInitializer):
class Initializer_2D_Col(ProcessGroupInitializer):
"""2d tensor parallel initialization among cols.

:param num_group: The number of all tensor groups
:param summa_dim: The dimension of SUMMA
:param args: Args used to initialize base class
:param kwargs: Kwargs used to initialize base class

:type num_group: int
:type summa_dim: int
Args:
num_group (int): The number of all tensor groups.
summa_dim (int): The dimension of SUMMA.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, num_group, summa_dim, *args, **kwargs):
@ -81,8 +87,9 @@ class Initializer_2D_Col(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu.

:return: 2D tensor col parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2D tensor col parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -109,8 +116,13 @@ class Initializer_2D(ProcessGroupInitializer):
"""
Serve as the single entry point to 2D parallel initialization.

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, *args, **kwargs):
@ -127,8 +139,10 @@ class Initializer_2D(ProcessGroupInitializer):

def init_dist_group(self):
"""Initialize 2D tensor row and col parallel groups, and assign local_ranks and groups to each gpu.
:return: 2D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)

Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
2D tensor parallelism's information in a list of tuples.
"""
parallel_setting = [self.row_initializer.init_dist_group(), self.col_initializer.init_dist_group()]
return parallel_setting

@ -31,14 +31,17 @@ def _check_tesseract_env_var(tesseract_dim: int, tesseract_dep: int):

# i row j col k dep
class Initializer_2p5D_ROW(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among rows.
"""2.5d tensor parallel initialization among rows.

:param tesseract_dim: The dimension of tesseract
:param tesseract_dep: The dimension of depth
:param args: Args used to initialize base class

:type tesseract_dim: int
:type tesseract_dep: int
Args:
tesseract_dim (int): The dimension of tesseract.
tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -50,10 +53,11 @@ class Initializer_2p5D_ROW(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

def init_dist_group(self):
"""Initialize 2p5D tensor row parallel groups, and assign local_ranks and groups to each gpu.
"""Initialize 2.5D tensor row parallel groups, and assign local_ranks and groups to each gpu.

:return: 2p5D tensor row parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor row parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -80,14 +84,17 @@ class Initializer_2p5D_ROW(ProcessGroupInitializer):


class Initializer_2p5D_Col(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among cols.
"""2.5d tensor parallel initialization among cols.

:param tesseract_dim: The dimension of tesseract
:param tesseract_dep: The dimension of depth
:param args: Args used to initialize base class

:type tesseract_dim: int
:type tesseract_dep: int
Args:
tesseract_dim (int): The dimension of tesseract.
tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -99,10 +106,11 @@ class Initializer_2p5D_Col(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

def init_dist_group(self):
"""Initialize 2p5D tensor col parallel groups, and assign local_ranks and groups to each gpu.
"""Initialize 2.5D tensor col parallel groups, and assign local_ranks and groups to each gpu.

:return: 2p5D tensor col parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor col parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -129,14 +137,17 @@ class Initializer_2p5D_Col(ProcessGroupInitializer):


class Initializer_2p5D_Dep(ProcessGroupInitializer):
"""2p5D tensor parallel initialization among depths.
"""2.5D tensor parallel initialization among depths.

:param tesseract_dim: The dimension of tesseract
:param tesseract_dep: The dimension of depth
:param args: Args used to initialize base class

:type tesseract_dim: int
:type tesseract_dep: int
Args:
tesseract_dim (int): The dimension of tesseract.
tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -148,10 +159,11 @@ class Initializer_2p5D_Dep(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

def init_dist_group(self):
"""Initialize 2p5D tensor depth parallel groups, and assign local_ranks and groups to each gpu.
"""Initialize 2.5D tensor depth parallel groups, and assign local_ranks and groups to each gpu.

:return: 2p5D tensor depth parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor depth parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -179,14 +191,17 @@ class Initializer_2p5D_Dep(ProcessGroupInitializer):

# i row j col k dep
class Initializer_2p5D_XZ(ProcessGroupInitializer):
"""2p5d tensor parallel initialization among cols times dep.
"""2.5d tensor parallel initialization among cols times dep.

:param tesseract_dim: The dimension of tesseract
:param tesseract_dep: The dimension of depth
:param args: Args used to initialize base class

:type tesseract_dim: int
:type tesseract_dep: int
Args:
tesseract_dim (int): The dimension of tesseract.
tesseract_dep (int): The dimension of depth.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
@ -198,10 +213,11 @@ class Initializer_2p5D_XZ(ProcessGroupInitializer):
"Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

def init_dist_group(self):
"""Initialize 2p5D tensor colXdepth parallel groups, and assign local_ranks and groups to each gpu.
"""Initialize 2.5D tensor colXdepth parallel groups, and assign local_ranks and groups to each gpu.

:return: 2p5D tensor colXdepth parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
2.5D tensor colXdepth parallelism's information in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -232,20 +248,14 @@ class Initializer_2p5D(ProcessGroupInitializer):
"""
Serve as the single entry point to Tesseract parallel initialization.

:param rank: The rank of current process
:param world_size: Size of whole communication world
:param config: Running configuration
:param data_parallel_size: Size of data parallel
:param pipeline_parallel_size: Size of pipeline parallel
:param tensor_parallel_size: Size of tensor parallel
:param depth: The depth of 2p5d parallel
:type rank: int
:type world_size: int
:type config: Config
:type data_parallel_size: int
:type pipeline_parallel_size: int
:type tensor_parallel_size: int
:type depth: int
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
depth (int): The depth of 2.5d parallel.
"""

def __init__(self, rank: int, world_size: int, config: Config, data_parallel_size: int, pipeline_parallel_size: int,
@ -266,9 +276,11 @@ class Initializer_2p5D(ProcessGroupInitializer):
self.xz_initializer = Initializer_2p5D_XZ(self.tesseract_dim, self.tesseract_dep, *args)

def init_dist_group(self):
"""Initialize 2p5D tensor row, col, depth, and colXdepth parallel groups, and assign local_ranks and groups to each gpu.
:return: Whole 2p5D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
"""Initialize 2.5D tensor row, col, depth, and colXdepth parallel groups, and assign local_ranks and groups to each gpu.

Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Whole 2.5D tensor parallelism's information in a list of tuples.
"""
parallel_setting = [
self.col_initializer.init_dist_group(),

@ -26,12 +26,15 @@ def _check_depth_env_var(depth):
class Initializer_3D_Input(ProcessGroupInitializer):
"""3D tensor parallel initialization among input.

:param num_group: The number of all tensor groups
:param depth: Depth of 3D parallelism
:param args: Args used in base class

:type num_group: int
:type depth: int
Args:
num_group (int): The number of all tensor groups.
depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, num_group: int, depth: int, *args):
@ -42,8 +45,9 @@ class Initializer_3D_Input(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize 3D tensor parallel groups among input, and assign local_ranks and groups to each gpu.

:return: 3D tensor parallelism's information among input
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among input in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -70,12 +74,15 @@ class Initializer_3D_Input(ProcessGroupInitializer):
class Initializer_3D_Weight(ProcessGroupInitializer):
"""3D tensor parallel initialization among weight.

:param num_group: The number of all tensor groups
:param depth: Depth of 3D parallelism
:param args: Args used in base class

:type num_group: int
:type depth: int
Args:
num_group (int): The number of all tensor groups.
depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, num_group: int, depth: int, *args):
@ -86,8 +93,9 @@ class Initializer_3D_Weight(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize 3D tensor parallel groups among weight, and assign local_ranks and groups to each gpu.

:return: 3D tensor parallelism's information among weight
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among weight in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -114,12 +122,15 @@ class Initializer_3D_Weight(ProcessGroupInitializer):
class Initializer_3D_Output(ProcessGroupInitializer):
"""3D tensor parallel initialization among output.

:param num_group: The number of all tensor groups
:param depth: Depth of 3D parallelism
:param args: Args used in base class

:type num_group: int
:type depth: int
Args:
num_group (int): The number of all tensor groups.
depth (int): Depth of 3D parallelism.
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, num_group: int, depth: int, *args):
@ -130,8 +141,9 @@ class Initializer_3D_Output(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize 3D tensor parallel groups among output, and assign local_ranks and groups to each gpu.

:return: 3D tensor parallelism's information among output
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
3D tensor parallelism's information among output in a tuple.
"""
local_rank = None
ranks_in_group = None
@ -158,7 +170,14 @@ class Initializer_3D_Output(ProcessGroupInitializer):
@DIST_GROUP_INITIALIZER.register_module
class Initializer_3D(ProcessGroupInitializer):
"""Serve as the single entry point to 3D parallel initialization.
:param args: Args used to initialize ProcessGroupInitializer

Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, *args):
@ -175,8 +194,10 @@ class Initializer_3D(ProcessGroupInitializer):

def init_dist_group(self):
"""Initialize 3D tensor parallel groups, and assign local_ranks and groups to each gpu.
:return: 3D tensor parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)

Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
Whole 3D tensor parallelism's information in a list of tuples.
"""
parallel_setting = [
self.input_initializer.init_dist_group(),

@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Data(ProcessGroupInitializer):
"""A ProcessGroupInitializer for data parallelism.

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -22,8 +27,9 @@ class Initializer_Data(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize data parallel groups, and assign local_ranks and groups to each gpu.

:return: Data parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Data parallelism's information tuple.
"""
local_rank = None
ranks_in_group = None

@ -12,8 +12,13 @@ class Initializer_Model(ProcessGroupInitializer):
"""A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel
groups).

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""

def __init__(self, *args, **kwargs):
@ -24,8 +29,9 @@ class Initializer_Model(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize model parallel groups, and assign local_ranks and groups to each gpu.

:return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
:rtype: Tuple
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Model parallelism's information tuple.
"""
local_rank = None
ranks_in_group = None

@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Pipeline(ProcessGroupInitializer):
"""A ProcessGroupInitializer for pipeline parallelism.

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process
world_size (int): Size of whole communication world
config (Config): Running configuration
data_parallel_size (int): Size of data parallel
pipeline_parallel_size (int): Size of pipeline parallel
tensor_parallel_size (int): Size of tensor parallel
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -23,8 +28,9 @@ class Initializer_Pipeline(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu.

:return: Pipeline parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
A Pipeline parallelism's information in list of tuples.
"""
dist_settings = list()
for i in range(self.data_parallel_size):

@ -15,8 +15,13 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
In Sequence Parallelism, each GPU holds the full copy of model weights,
thus, gradient all-reduce occurs across all processes in the same pipeline stage

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process
world_size (int): Size of whole communication world
config (Config): Running configuration
data_parallel_size (int): Size of data parallel
pipeline_parallel_size (int): Size of pipeline parallel
tensor_parallel_size (int): Size of tensor parallel
"""

def __init__(self, *args, **kwargs):
@ -27,8 +32,8 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize Sequence Parallel process groups used for gradient all-reduce.

:return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
:rtype: Tuple
Returns:
Tuple: A tuple (local_rank, group_world_size, process_group, ranks_in_group, mode).
"""
local_rank = None
ranks_in_group = None
@ -52,8 +57,13 @@ class Initializer_Sequence_DP(ProcessGroupInitializer):
class Initializer_Sequence(ProcessGroupInitializer):
"""A ProcessGroupInitializer for sequence parallelism.

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""
def __init__(self,
*args, **kwargs):
@ -66,11 +76,12 @@ class Initializer_Sequence(ProcessGroupInitializer):
"""Initialize Sequence parallel process groups and assign local_ranks and groups to each gpu.

Sequence parallelism requires 2 process groups. The first is for model forward where several processes
exchange paritial query, key and value embedding to compute self attention values. The second is for
exchange partial query, key and value embedding to compute self attention values. The second is for
all-reduce to synchronize the model parameters.

:return: Sequence parallelism's information
:rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]:
A Sequence parallelism's information in list of tuples.
"""

parallel_setting = []

@ -12,8 +12,13 @@ from ..parallel_mode import ParallelMode
class Initializer_Tensor(ProcessGroupInitializer):
"""A ProcessGroupInitializer for tensor parallelism.

:param args: Args used to initialize ProcessGroupInitializer
:param kwargs: Kwargs used to initialize ProcessGroupInitializer
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -22,8 +27,9 @@ class Initializer_Tensor(ProcessGroupInitializer):
def init_dist_group(self):
"""Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.

:return: Tensor parallelism's information
:rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A Tensor parallelism's information tuple.
"""
local_rank = None
ranks_in_group = None

@ -9,19 +9,13 @@ from colossalai.context import Config
class ProcessGroupInitializer(ABC):
"""An object, knowing the parallelism configuration, that initializes parallel groups.

:param rank: The rank of current process
:param world_size: Size of whole communication world
:param config: Running configuration
:param data_parallel_size: Size of data parallel
:param pipeline_parallel_size: Size of pipeline parallel
:param tensor_parallel_size: Size of tensor parallel

:type rank: int
:type world_size: int
:type config: Config
:type data_parallel_size: int
:type pipeline_parallel_size: int
:type tensor_parallel_size: int
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
config (Config): Running configuration.
data_parallel_size (int): Size of data parallel.
pipeline_parallel_size (int): Size of pipeline parallel.
tensor_parallel_size (int): Size of tensor parallel.
"""
def __init__(self,
rank: int,

@ -16,8 +16,8 @@ _SEED_MANAGER = SeedManager()
|
||||
def get_seeds():
|
||||
"""Returns the seeds of the seed manager.
|
||||
|
||||
:return: The seeds of the seed manager
|
||||
:rtype: dict
|
||||
Returns:
|
||||
dict: The seeds of the seed manager.
|
||||
"""
|
||||
return _SEED_MANAGER.seeds
|
||||
|
||||
@ -25,8 +25,8 @@ def get_seeds():
|
||||
def get_states(copy=False):
|
||||
"""Returns the seed states of the seed manager.
|
||||
|
||||
:return: The seed states of the seed manager
|
||||
:rtype: dict
|
||||
Returns:
|
||||
dict: The seed states of the seed manager.
|
||||
"""
|
||||
states = _SEED_MANAGER.seed_states
|
||||
|
||||
@ -43,8 +43,8 @@ def get_states(copy=False):
|
||||
def get_current_mode():
|
||||
"""Returns the current mode of the seed manager.
|
||||
|
||||
:return: The current mode of the seed manager.
|
||||
:rtype: :class:`torch.ByteTensor`
|
||||
Returns:
|
||||
:class:`torch.ByteTensor`: The current mode of the seed manager.
|
||||
"""
|
||||
return _SEED_MANAGER.current_mode
|
||||
|
||||
@ -52,12 +52,16 @@ def get_current_mode():
|
||||
def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
|
||||
"""Adds a seed to the seed manager for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:param seed: The seed to be added
|
||||
:type seed: int
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
|
||||
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
seed (int): The seed to be added
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
|
||||
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
_SEED_MANAGER.add_seed(parallel_mode, seed, overwrite)
|
||||
|
||||
@ -65,8 +69,12 @@ def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
|
||||
def set_mode(parallel_mode: ParallelMode):
|
||||
"""Sets the current mode of the seed manager.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
_SEED_MANAGER.set_mode(parallel_mode)
|
||||
|
||||
@ -74,11 +82,12 @@ def set_mode(parallel_mode: ParallelMode):
|
||||
def set_seed_states(parallel_mode: ParallelMode, state: Tensor):
|
||||
"""Sets the state of the seed manager for `parallel_mode`.
|
||||
|
||||
:param parallel_mode: The chosen parallel mode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:param state: the state to be set
|
||||
:type state: :class:`torch.Tensor`
|
||||
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
|
||||
Args:
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
state (:class:`torch.Tensor`): the state to be set.
|
||||
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager.
|
||||
"""
|
||||
_SEED_MANAGER.set_state(parallel_mode, state)
|
||||
|
||||
@ -98,6 +107,9 @@ def seed(parallel_mode: ParallelMode):
|
||||
with seed(ParallelMode.DATA):
|
||||
output = F.dropout(input)
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
try:
|
||||
# set to new mode
|
||||
@ -125,6 +137,9 @@ def with_seed(func, parallel_mode: ParallelMode):
|
||||
wrapper_forward = with_seed(forward, ParallelMode.DATA)
|
||||
out = wrapped_forward(input)
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
|
||||
@functools.wraps(func)
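A minimal usage sketch of the seed helpers above (illustrative, not part of this diff); it assumes a CUDA device and the import path used by this module, colossalai.context.random:

import torch
import torch.nn.functional as F
from colossalai.context import ParallelMode
from colossalai.context.random import add_seed, seed, set_mode, get_seeds

# register separate RNG states for two parallel modes (seed values are illustrative)
add_seed(ParallelMode.DATA, 1024)
add_seed(ParallelMode.TENSOR, 1025)
set_mode(ParallelMode.DATA)

x = torch.rand(4, 8, device='cuda')
with seed(ParallelMode.TENSOR):
    # dropout inside the block draws from the TENSOR-mode RNG state,
    # and the DATA-mode state is restored on exit
    y = F.dropout(x, p=0.1, training=True)
print(get_seeds())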
@ -9,6 +9,10 @@ from colossalai.context.parallel_mode import ParallelMode

class SeedManager:
"""This class is a manager of all random seeds involved in the system.

Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""

def __init__(self):
@ -30,12 +34,12 @@ class SeedManager:

def set_state(self, parallel_mode: ParallelMode, state: Tensor):
"""Sets the state of the seed manager for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
state (:class:`torch.Tensor`): the state to be set.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param state: the state to be set
:type state: :class:`torch.Tensor`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager.
"""
assert parallel_mode in self._seed_states, f'Parallel mode {parallel_mode} is not found in the seed manager'
self._seed_states[parallel_mode] = state
@ -43,8 +47,8 @@ class SeedManager:
def set_mode(self, parallel_mode: ParallelMode):
"""Sets the current mode of the seed manager.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
"""
if self.current_mode:
# save the current state for current mode
@ -57,14 +61,14 @@ class SeedManager:
def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`.

:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:param seed: The seed to be added
:type seed: int
:param overwrtie: Whether allows to overwrite the seed that has been set already
:type overwrtie: bool, optional
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
seed (int): The seed to be added.
overwrtie (bool, optional): Whether to allow overwriting the seed that has been set already.

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added.
"""
assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
if overwrtie is False:

@ -19,20 +19,37 @@ class Engine:
:meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.
It controls an iteration in training.

:param model: The neural network model
:type model: ``torch.nn.Module``
:param optimizer: Optimizer for updating the parameters
:type optimizer: ``torch.optim.Optimizer``
:param criterion: Loss function for calculating loss
:type criterion: ``torch.nn.modules.loss._Loss``, optional
:param gradient_handlers: A list of gradient handler used in backward
:type gradient_handlers: a list of ``BaseGradientHandler``, optional
:param clip_grad_norm: The norm of gradient clipping
:type clip_grad_norm: float, optional
:param ophook_list: List of ophook
:type ophook_list: list
:param verbose: whether to display log info
:type verbose: bool
Args:
model (``torch.nn.Module``): The neural network model.
optimizer (``torch.optim.Optimizer``): Optimizer for updating the parameters.
criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss.
gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handlers used in backward.
clip_grad_norm (float, optional): The norm of gradient clipping.
ophook_list (list): List of ophooks.
verbose (bool): whether to display log info.

Examples:
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
>>> model = ...
>>> criterion = ...
>>> optimizer = ...
>>> train_dataloader = ...
>>> engine, _, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> engine.train()
>>> for inputs, labels in train_dataloader:
>>>     # set gradients to zero
>>>     engine.zero_grad()
>>>     # run forward pass
>>>     outputs = engine(inputs)
>>>     # compute loss value and run backward pass
>>>     loss = engine.criterion(outputs, labels)
>>>     engine.backward(loss)
>>>     # update parameters
>>>     engine.step()

The example of using Engine in training could be found in
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_ and
`Run resnet cifar10 with engine <https://github.com/hpcaitech/ColossalAI-Examples/blob/main/image/resnet/run_resnet_cifar10_with_engine.py>`_.
"""

def __init__(self,
@ -113,10 +130,10 @@ class Engine:
return self.optimizer.step()

def backward(self, loss: Tensor):
"""Start backward propagation given the loss value computed by a loss function
"""Start backward propagation given the loss value computed by a loss function.

:param loss: Loss value computed by a loss function
:type loss: :class:`torch.Tensor`
Args:
loss (:class:`torch.Tensor`): Loss value computed by a loss function.
"""
ret = self.optimizer.backward(loss)
for ophook in self._ophook_list:
@ -124,34 +141,22 @@ class Engine:
return ret

def backward_by_grad(self, tensor, grad):
"""Start backward propagation given the gradient of the output tensor
"""Start backward propagation given the gradient of the output tensor.

:param tensor: Output tensor
:type tensor: :class:`torch.Tensor`
:param grad: Gradient passed back to the output
:type grad: :class:`torch.Tensor`
Args:
tensor (:class:`torch.Tensor`): Output tensor.
grad (:class:`torch.Tensor`): Gradient passed back to the output.
"""
ret = self.optimizer.backward_by_grad(tensor, grad)
for ophook in self._ophook_list:
ophook.post_iter()
return ret

def calc_loss(self, *args, **kwargs):
"""Compute the loss value

:param args: Args used in criterion function
:param kwargs: Kwargs used in criterion function

:return: The loss value
:rtype: :class:`torch.Tensor`
"""
return self.criterion(*args, **kwargs)

def __call__(self, *args, **kwargs):
"""Run the forward step for the model
"""Run the forward step for the model.

:return: Output the model
:rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`
Returns:
Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`: Output of the model.
"""
return self.model(*args, **kwargs)

@ -8,10 +8,9 @@ class BaseGradientHandler(ABC):
"""A basic helper class to handle all-reduce operations of gradients across different parallel groups
before optimization.

:param model: Model where the gradients accumulate
:param optimizer: Optimizer for updating the parameters
:type model: Module
:type optimizer: Optimizer
Args:
model (Module): Model where the gradients accumulate.
optimizer (Optimizer): Optimizer for updating the parameters.
"""
def __init__(self, model, optimizer):
self._model = model
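A sketch of a custom handler built on this base class (illustrative, not part of this diff); it assumes handle_gradient() is the abstract hook to implement and that the registry import paths match this repository's layout:

import torch.distributed as dist
from colossalai.registry import GRADIENT_HANDLER
from colossalai.engine.gradient_handler import BaseGradientHandler

@GRADIENT_HANDLER.register_module
class NaiveAllReduceGradientHandler(BaseGradientHandler):
    """Toy handler: all-reduce every gradient over the default process group."""

    def handle_gradient(self):
        for param in self._model.parameters():
            if param.grad is not None:
                dist.all_reduce(param.grad)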
@ -17,12 +17,11 @@ import math
class MemTracerOpHook(BaseOpHook):
"""
Collect GPU memory usage information
:param warmup: This parameter indicates how many iterations to truncate before profiling, defaults to 50
:type warmup: int
:param refreshrate: This parameter decides the frequency of write file, defaults to 10
:type refreshrate: int
:param data_prefix: The prefix of the stats data file, defaults to "memstats"
:type data_prefix: string

Args:
warmup (int): This parameter indicates how many iterations to truncate before profiling, defaults to 50.
refreshrate (int): This parameter decides how often the stats file is written, defaults to 10.
data_prefix (string): The prefix of the stats data file, defaults to "memstats".
"""

def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"):
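Constructing the hook is a one-liner; the import path below is an assumption based on this file's location, and the hook is then handed to the Engine through its ophook_list argument:

from colossalai.engine.ophooks import MemTracerOpHook

# start profiling after a 50-iteration warm-up and flush stats every 10 steps
mem_hook = MemTracerOpHook(warmup=50, refreshrate=10, data_prefix="memstats")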
@ -15,8 +15,12 @@ class BaseSchedule(ABC):
"""A basic helper class to control the process of training or evaluation.
It mainly consists of forward_backward_step for gradient backward and
optimizer_step for parameters update.
For the convenience to enable FP16, we aggreate all codes that contain the
For the convenience to enable FP16, we aggregate all codes that contain the
control of FP16 in class schedule.

Args:
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
and it will be executed in load_batch.
"""

def __init__(self, batch_data_process_func: Callable = None):
@ -46,13 +50,12 @@ class BaseSchedule(ABC):
"""Loads a batch from data iterator. It returns the data and labels which are
already on the same GPU as the model.

:param data_iter: Data iterator from which get a batch of data
:type data_iter: DataIter
:param to_gpu: Whether the data should be moved to GPU
:type to_gpu: bool, optional
Args:
data_iter (Iterable): Data iterator from which to get a batch of data, obtained by calling iter(dataloader).
to_gpu (bool, optional): Whether the data should be moved to GPU.

:return: (data, label)
:rtype: (:class:`Tensor`, :class:`torch.Tensor`)
Returns:
Tuple (:class:`Tensor`, :class:`torch.Tensor`): A tuple of (data, label).
"""
if data_iter is None:
raise RuntimeError('Dataloader is not defined.')
@ -87,16 +90,12 @@ class BaseSchedule(ABC):
):
"""The process function over a batch of dataset for training or evaluation.

:param engine: Colossalai training engine
:type engine: colossalai.engine.Engine
:param data_iter: Data iterator from which get a batch of data
:type data_iter: DataIter
:param forward_only: If True, the process won't include backward
:type forward_only: bool
:param return_loss: If False, the loss won't be returned
:type return_loss: bool, optional
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool, optional
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Data iterator from which to get a batch of data, obtained by calling iter(dataloader).
forward_only (bool): If True, the process won't include backward.
return_loss (bool, optional): If False, the loss won't be returned.
return_output_label (bool, optional): If False, the output and label won't be returned.
"""
pass

@ -15,6 +15,10 @@ class NonPipelineSchedule(BaseSchedule):
During one process, it loads a batch of dataset and feeds it to the model.
After getting the output and calculating the loss, it will use :meth:`step`
to update the parameters if it is in training mode.

Args:
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
and it will be executed in load_batch.
"""

def forward_backward_step(self,
@ -23,22 +27,19 @@ class NonPipelineSchedule(BaseSchedule):
forward_only: bool = False,
return_loss: bool = True,
return_output_label: bool = True):
"""The process function that loads loads a batch of dataset and feeds it to the model.
"""The process function that loads a batch of dataset and feeds it to the model.
The returned labels and loss will be None if :attr:`return_loss` is False.

:param engine: Model for training and inference
:param data_iter: Data iterator of the dataloader, e.g. iter(dataloader)
:param forward_only: If True, the model is run for the forward pass, else back propagation will be executed
:param return_loss: Loss will be returned if True
:param return_output_label: Output and label will be returned if True
:type engine: Iterator
:type data_iter: Iterator
:type forward_only: bool, optional
:type return_loss: bool, optional
:type return_output_label: bool, optional
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
If True, the model is run for the forward pass, else back propagation will be executed.
return_loss (bool, optional): Loss will be returned if True.
return_output_label (bool, optional): Output and label will be returned if True.

:return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
Returns:
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
"""
assert forward_only or return_loss, \
"The argument 'return_loss' has to be True when 'forward_only' is False, but got False."
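A hedged sketch of driving one training step through this schedule by hand (illustrative, not part of this diff; in normal use the Trainer or the engine loop does this). The engine and train_dataloader are assumed to come from colossalai.initialize:

from colossalai.engine.schedule import NonPipelineSchedule

schedule = NonPipelineSchedule()
data_iter = iter(train_dataloader)

engine.train()
engine.zero_grad()
output, label, loss = schedule.forward_backward_step(
    engine, data_iter, forward_only=False, return_loss=True)
engine.step()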
@ -41,14 +41,13 @@ class PipelineSchedule(BaseSchedule):
It uses non-interleaved 1F1B strategy. Other properties are similar to
:class:`NonPipelineSchedule`.

:param num_microbatches: The number of microbatches
:type num_microbatches: int
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch`
:type batch_data_process_func: Callable, optional
:param tensor_shape: Specified shape in pipeline communication
:type tensor_shape: torch.Size, optional
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization
:type scatter_gather_tensors: bool, optional
Args:
num_microbatches (int): The number of microbatches.
batch_data_process_func (Callable, optional):
The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
scatter_gather_tensors (bool, optional):
If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
"""

def __init__(self,
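An illustrative construction of the schedule described above (values are placeholders, not part of this diff; it assumes the pipeline-parallel context has already been initialized):

import torch
from colossalai.engine.schedule import PipelineSchedule

# split each global batch into 4 microbatches; a fixed tensor shape lets the
# stages skip shape negotiation during point-to-point communication
pipe_schedule = PipelineSchedule(num_microbatches=4,
                                 tensor_shape=torch.Size([16, 1024]),
                                 scatter_gather_tensors=True)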
@ -131,19 +130,14 @@ class PipelineSchedule:
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users.

:param engine: Your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: Input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param return_tensors: A list of tensors to return
:type return_tensors: List[:class:`torch.Tensor`]
:param return_output_label: Whether returns output labels
:type return_output_label: bool, optional
:param accum_loss: Where accumulated loss stores
:type accum_loss: optional

:return: output or the loss value of the current pipeline stage
:rtype: :class:`torch.Tensor`
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
return_output_label (bool, optional): Whether to return output labels.
accum_loss (optional): Where the accumulated loss is stored.
Returns:
:class:`torch.Tensor`: output or the loss value of the current pipeline stage.
"""
data, label = self.load_micro_batch()
output_tensor = self._call_engine(engine.model, input_tensor, data)
@ -173,17 +167,14 @@ class PipelineSchedule:
Returns the gradients with respect to the input tensor (None if first stage).
This is a helper function and can be ignored by users.

:param engine: your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param output_tensor: output tensor for this pipeline stage
:type output_tensor: :class:`torch.Tensor`
:param output_tensor_grad: gradient of output tensor for this pipeline stage
:type output_tensor_grad: :class:`torch.Tensor`
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
input_tensor (:class:`torch.Tensor`): input tensor for this pipeline stage.
output_tensor (:class:`torch.Tensor`): output tensor for this pipeline stage.
output_tensor_grad (:class:`torch.Tensor`): gradient of output tensor for this pipeline stage.

:return: gradient of input tensor
:rtype: :class:`torch.Tensor`
Returns:
:class:`torch.Tensor`: gradient of input tensor.
"""

# Retain the grad on the input_tensor.
@ -207,19 +198,16 @@ class PipelineSchedule:
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
Returns a tuple with losses if the last stage, an empty tuple otherwise.

:param engine: Your engine object
:type engine: colossalai.engine.Engine
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader)
:type data_iter: Iterable
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run.
:type forward_only: bool
:param return_loss: Whether returns the loss value. Default is true.
:type return_loss: bool
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether to run the forward step only. Default is false. If true, no backward will be run.
return_loss (bool, optional): Whether to return the loss value. Default is true.
return_output_label (bool, optional): If False, the output and label won't be returned.

:return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
Returns:
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
"""

assert forward_only or return_loss, \
@ -354,16 +342,14 @@ class InterleavedPipelineSchedule(PipelineSchedule):
It uses interleaved 1F1B strategy. Other properties are similar to
:class:`NonPipelineSchedule`.

:param num_microbatches: The number of microbatches
:type num_microbatches: int
:param num_model_chunks: The number of model chunks
:type num_model_chunks: int
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch`
:type batch_data_process_func: Callable, optional
:param tensor_shape: Specified shape in pipeline communication
:type tensor_shape: torch.Size, optional
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization
:type scatter_gather_tensors: bool, optional
Args:
num_microbatches (int): The number of microbatches.
num_model_chunks (int): The number of model chunks.
batch_data_process_func (Callable, optional):
The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
scatter_gather_tensors (bool, optional):
If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
"""
assert num_microbatches % gpc.get_world_size(ParallelMode.PIPELINE) == 0, \
'num_microbatches must be an integer multiple of pipeline parallel world size'
@ -408,6 +394,16 @@ class InterleavedPipelineSchedule(PipelineSchedule):
"""Forward step for passed-in model. If it is the first stage, the input tensor
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users.

Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
model_chunk_id (int): The id of model chunks.
input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
return_output_label (bool, optional): Whether to return output labels.
accum_loss (optional): Where the accumulated loss is stored.
Returns:
:class:`torch.Tensor`: output or the loss value of the current pipeline stage.
"""
data, label = self.load_micro_batch(model_chunk_id)
output_tensor = self._call_engine(engine.model[model_chunk_id], input_tensor, data)
@ -435,18 +431,17 @@ class InterleavedPipelineSchedule(PipelineSchedule):
"""Run interleaved 1F1B schedule (model split into model chunks), with
communication between pipeline stages as needed.

Returns a dictionary with losses if it is the last stage, an empty dict otherwise.
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether to run the forward step only. Default is false. If true, no backward will be run.
return_loss (bool, optional): Whether to return the loss value. Default is true.
return_output_label (bool, optional): If False, the output and label won't be returned.

:param engine: Your engine object
:type engine: colossalai.engine.Engine
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader)
:type data_iter: Iterable
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run.
:type forward_only: bool
:param return_loss: Whether returns the loss value. Default is true.
:type return_loss: bool
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool
Returns:
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
The loss would be returned only in the last stage.
"""
assert forward_only or return_loss, \
'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
@ -37,8 +37,8 @@ def get_default_parser():
"""Reads user command line and uses an argument parser to parse the input arguments.
Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.

:return: Returns the parser with the default arguments, the user may add customized arguments into this parser
:rtype: Namespace
Returns:
Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser.
"""
parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, help='path to the config file')
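A short sketch of extending the default parser with a user flag (the --epochs argument is hypothetical):

from colossalai import get_default_parser

parser = get_default_parser()
parser.add_argument('--epochs', type=int, default=10, help='number of training epochs')
args = parser.parse_args()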
@ -63,26 +63,21 @@ def launch(config: Union[str, Path, Config, Dict],
"""This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input
arguments is not given. Then initialize and set distributed environment by calling global_context's functions.

:param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param rank: Rank for the default process group
:type rank: int
:param world_size: World size of the default process group
:type world_size: int
:param host: The master address for distributed training
:type host: str
:param port: The master port for distributed training
:type port: str
:param backend: Backend for torch.distributed
:type backend: str, optional
:param local_rank: Rank for the process on the node and is used to set the default CUDA device, defaults to None.
If local_rank = None, the default device ordinal will be calculated automatically
:type local_rank: int, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
:raises Exception: Raise exception when config type is wrong
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
rank (int): Rank for the default process group
world_size (int): World size of the default process group
host (str): The master address for distributed training
port (str): The master port for distributed training
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
local_rank (int, optional):
Rank for the process on the node and is used to set the default CUDA device,
defaults to None. If local_rank = None, the default device ordinal will be calculated automatically.
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.

Raises:
Exception: Raise exception when config type is wrong
"""
gpc.verbose = verbose
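A sketch of a manual launch on a single 4-GPU node (the config path, address and port are placeholders; RANK/LOCAL_RANK are assumed to be provided by your own launcher):

import os
import colossalai

rank = int(os.environ.get('RANK', 0))
local_rank = int(os.environ.get('LOCAL_RANK', 0))

colossalai.launch(config='./config.py',
                  rank=rank,
                  world_size=4,
                  host='127.0.0.1',
                  port=29500,
                  backend='nccl',
                  local_rank=local_rank,
                  seed=1024)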
@ -126,18 +121,13 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
set by SLURM

:param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: The master address for distributed training
:type host: str
:param port: The master port for distributed training
:type port: str
:param backend: Backend for torch.distributed
:type backend: str, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
host (str): The master address for distributed training
port (str): The master port for distributed training
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.
"""
rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NPROCS'])
@ -160,18 +150,13 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
set by OpenMPI

:param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: The master address for distributed training
:type host: str
:param port: The master port for distributed training
:type port: str
:param backend: Backend for torch.distributed
:type backend: str, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
host (str): The master address for distributed training
port (str): The master port for distributed training
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.
"""
rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
@ -194,14 +179,11 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
"""A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch

:param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param backend: Backend for torch.distributed
:type backend: str, optional
:param seed: Specified random seed for every processes
:type seed: int, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.
"""
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
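With torchrun only the config has to be passed explicitly; everything else is read from the environment variables set by the launcher (file names below are placeholders):

# torchrun --nproc_per_node=4 train.py --config ./config.py
import colossalai

colossalai.launch_from_torch(config='./config.py', backend='nccl', seed=1024)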
@ -230,22 +212,20 @@ def initialize(model: nn.Module,
"""Core function to wrap the essential training components with our functionality based on the config which is
loaded into gpc.config.

:param model: Your model instance or a function to build the model
:type model: :class:`torch.nn.Module` or Callbale
:param optimizer: Your optimizer instance
:type optimizer: :class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`
:param criterion: Your criterion instance
:type criterion: :class:`torch.nn.modules.loss._Loss`, optional
:param train_dataloader: Dataloader for training
:type train_dataloader: :class:`torch.utils.data.DataLoader`, optional
:param test_dataloader: Dataloader for testing
:type test_dataloader: :class:`torch.utils.data.DataLoader`, optional
:param lr_scheduler: Your lr scheduler instance, optional
:type lr_scheduler: :class:`torch.nn.lr_scheduler._LRScheduler`, optional
:param verbose: Whether to print logs
:type verbose: bool, optional
:return: (engine, train_dataloader, test_dataloader, lr_scheduler)
:rtype: Tuple
Args:
model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
Your optimizer instance.
criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional.
verbose (bool, optional): Whether to print logs.

Returns:
Tuple (engine, train_dataloader, test_dataloader, lr_scheduler):
A tuple of ``(engine, train_dataloader, test_dataloader, lr_scheduler)``
where only ``engine`` could not be None.
"""
# get logger
logger = get_dist_logger()
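A minimal sketch of the call, assuming colossalai.launch(...) has already set up the distributed context and loaded a config; the toy model and optimizer below are purely illustrative:

import torch
import torch.nn as nn
import colossalai

model = nn.Linear(16, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# dataloaders and lr_scheduler are optional and may be None
engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(
    model, optimizer, criterion)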
@ -10,6 +10,8 @@ def get_dist_logger(name='colossalai'):
"""Get logger instance based on name. The DistributedLogger will create singleton instances,
which means that only one logger instance is created per name.

Args:

:param name: name of the logger, name must be unique
:type name: str

@ -23,8 +23,13 @@ except ImportError:
class DistributedLogger:
"""This is a distributed event logger class essentially based on :class:`logging`.

:param name: The name of the logger
:type name: str
Args:
name (str): The name of the logger.

Note:
The parallel_mode used in ``info``, ``warning``, ``debug`` and ``error``
should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""

__instances = dict()
@ -33,10 +38,10 @@ class DistributedLogger:
def get_instance(name: str):
"""Get the unique single logger instance based on name.

:param name: The name of the logger
:type name: str
:return: A DistributedLogger object
:rtype: DistributedLogger
Args:
name (str): The name of the logger.
Returns:
DistributedLogger: A DistributedLogger object
"""
if name in DistributedLogger.__instances:
return DistributedLogger.__instances[name]
@ -73,8 +78,8 @@ class DistributedLogger:
def set_level(self, level: str):
"""Set the logging level

:param level: Can only be INFO, DEBUG, WARNING and ERROR
:type level: str
Args:
level (str): Can only be INFO, DEBUG, WARNING and ERROR.
"""
self._check_valid_logging_level(level)
self._logger.setLevel(getattr(logging, level))
@ -82,14 +87,11 @@ class DistributedLogger:
def log_to_file(self, path: Union[str, Path], mode: str = 'a', level: str = 'INFO', suffix: str = None):
"""Save the logs to file

:param path: The file to save the log
:type path: A string or pathlib.Path object
:param mode: The mode to write log into the file
:type mode: str
:param level: Can only be INFO, DEBUG, WARNING and ERROR
:type level: str
:param suffix: The suffix string of log's name
:type suffix: str
Args:
path (A string or pathlib.Path object): The file to save the log.
mode (str): The mode to write log into the file.
level (str): Can only be INFO, DEBUG, WARNING and ERROR.
suffix (str): The suffix string of log's name.
"""
assert isinstance(path, (str, Path)), \
f'expected argument path to be type str or Path, but got {type(path)}'
@ -131,12 +133,11 @@ class DistributedLogger:
def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log an info message.

:param message: The message to be logged
:type message: str
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`
:param ranks: List of parallel ranks
:type ranks: list
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('info', message_prefix, parallel_mode, ranks)
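A usage sketch for the logger (the log path and suffix are placeholders, and log_to_file's exact path handling should be checked against the implementation):

from colossalai.logging import get_dist_logger

logger = get_dist_logger()
logger.set_level('INFO')
logger.log_to_file('./logs', mode='a', level='INFO', suffix='train')
# only rank 0 of the global group prints this message
logger.info('training started', ranks=[0])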
@ -145,12 +146,11 @@ class DistributedLogger:
def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log a warning message.

:param message: The message to be logged
:type message: str
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`
:param ranks: List of parallel ranks
:type ranks: list
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('warning', message_prefix, parallel_mode, ranks)
@ -159,12 +159,11 @@ class DistributedLogger:
def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log a debug message.

:param message: The message to be logged
:type message: str
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`
:param ranks: List of parallel ranks
:type ranks: list
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('debug', message_prefix, parallel_mode, ranks)
@ -173,12 +172,11 @@ class DistributedLogger:
def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None):
"""Log an error message.

:param message: The message to be logged
:type message: str
:param parallel_mode: The parallel mode used for logging. Defaults to ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`
:param ranks: List of parallel ranks
:type ranks: list
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('error', message_prefix, parallel_mode, ranks)

@ -6,6 +6,7 @@ import torch.nn as nn

def zeros_():
"""Return the initializer filling the input Tensor with the scalar zeros"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.zeros_(tensor)

@ -13,6 +14,7 @@ def zeros_():

def ones_():
"""Return the initializer filling the input Tensor with the scalar ones"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.ones_(tensor)

@ -20,6 +22,14 @@ def ones_():

def uniform_(a: float = 0., b: float = 1.):
r"""Return the initializer filling the input Tensor with values drawn from the uniform
distribution :math:`\mathcal{U}(a, b)`.

Args:
a (float): the lower bound of the uniform distribution. Defaults 0.0.
b (float): the upper bound of the uniform distribution. Defaults 1.0.
"""

def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.uniform_(tensor, a, b)

@ -27,6 +37,15 @@ def uniform_(a: float = 0., b: float = 1.):

def normal_(mean: float = 0., std: float = 1.):
r"""Return the initializer filling the input Tensor with values drawn from the normal distribution

.. math::
\mathcal{N}(\text{mean}, \text{std}^2)

Args:
mean (float): the mean of the normal distribution. Defaults 0.0.
std (float): the standard deviation of the normal distribution. Defaults 1.0.
"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.normal_(tensor, mean, std)

@ -34,6 +53,19 @@ def normal_(mean: float = 0., std: float = 1.):

def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float = 2.):
r"""Return the initializer filling the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \leq \text{mean} \leq b`.

Args:
mean (float): the mean of the normal distribution. Defaults 0.0.
std (float): the standard deviation of the normal distribution. Defaults 1.0.
a (float): the minimum cutoff value. Defaults -2.0.
b (float): the maximum cutoff value. Defaults 2.0.
"""
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
return nn.init.trunc_normal_(tensor, mean, std, a, b)

@ -41,6 +73,26 @@ def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float =

def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification` - He, K. et al. (2015), using a
uniform distribution. The resulting tensor will have values sampled from
:math:`\mathcal{U}(-\text{bound}, \text{bound})` where

.. math::
\text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan_mode}}}

Also known as 'He initialization'.

Args:
a (int): the negative slope of the rectifier used after this layer (only used with ``'leaky_relu'``).
mode (str, optional): either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
preserves the magnitude of the variance of the weights in the
forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
backwards pass.
nonlinearity (str, optional): the non-linear function (`nn.functional` name),
recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).
"""
# adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
if 0 in tensor.shape:
@ -64,6 +116,26 @@ def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'):

def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification` - He, K. et al. (2015), using a
normal distribution. The resulting tensor will have values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where

.. math::
\text{std} = \frac{\text{gain}}{\sqrt{\text{fan_mode}}}

Also known as 'He initialization'.

Args:
a (int): the negative slope of the rectifier used after this layer (only used with ``'leaky_relu'``).
mode (str, optional): either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
preserves the magnitude of the variance of the weights in the
forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
backwards pass.
nonlinearity (str, optional): the non-linear function (`nn.functional` name),
recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).
"""
# adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
if 0 in tensor.shape:
@ -86,6 +158,23 @@ def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'):

def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1.):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Understanding the difficulty of training deep feedforward
neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform
distribution. The resulting tensor will have values sampled from
:math:`\mathcal{U}(-a, a)` where

.. math::
a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}}

Also known as 'Glorot initialization'.

Args:
a (float, optional): an optional scaling factor used to calculate uniform
bounds from standard deviation. Defaults ``math.sqrt(3.)``.
scale (float, optional): an optional scaling factor used to calculate standard deviation. Defaults 2.0.
gain (float, optional): an optional scaling factor. Defaults 1.0.
"""
# adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
assert fan_in is not None, 'Fan_in is not provided.'
@ -102,6 +191,21 @@ def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1

def xavier_normal_(scale: float = 2., gain: float = 1.):
r"""Return the initializer filling the input `Tensor` with values according to the method
described in `Understanding the difficulty of training deep feedforward
neural networks` - Glorot, X. & Bengio, Y. (2010), using a normal
distribution. The resulting tensor will have values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where

.. math::
\text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan_in} + \text{fan_out}}}

Also known as 'Glorot initialization'.

Args:
scale (float, optional): an optional scaling factor used to calculate standard deviation. Defaults 2.0.
gain (float, optional): an optional scaling factor. Defaults 1.0.
"""
# adapted from torch.nn.init
def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
assert fan_in is not None, 'Fan_in is not provided.'
@ -137,4 +241,4 @@ def lecun_normal_():
std = math.sqrt(1.0 / fan_in)
return nn.init.trunc_normal_(tensor, std=std / .87962566103423978)

return initializer
return initializer
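All of these factories share the same two-step pattern: they return a closure that is later applied to a tensor together with its fan-in/fan-out. A small sketch (shapes are arbitrary):

import torch
from colossalai.nn import init

weight = torch.empty(1024, 256)
# the factory call fixes the hyper-parameters, the returned closure does the filling
init.kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu')(
    weight, fan_in=256, fan_out=1024)

bias = torch.empty(1024)
init.zeros_()(bias)  # fan arguments are not needed here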
@ -6,13 +6,11 @@ from ..utils import get_tensor_parallel_mode

class Dropout(nn.Module):
"""
Dropout layer of colossalai
"""Dropout layer of colossalai.

:param p: dropout rate, defaults to 0.5
:type p: float, optional
:param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False``
:type inplace: bool, optional
Args:
p (float, optional): probability of an element to be zeroed, defaults 0.5.
inplace (bool, optional): whether to do dropout in-place, default to be False.
"""
def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
super().__init__()

@ -35,21 +35,33 @@ _parallel_patchembedding = {

class Embedding(nn.Module):
"""
Embedding for colossalai
r"""Embedding for colossalai.

:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.

The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
::

max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.

More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.

More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
"""

def __init__(self,
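An illustrative construction (sizes are placeholders, not part of this diff); it assumes the parallel context has already been initialized, since the layer dispatches to the implementation matching the current tensor-parallel mode:

import torch
from colossalai.nn import Embedding

embed = Embedding(num_embeddings=50304, embedding_dim=768, padding_idx=0)
tokens = torch.randint(0, 50304, (2, 128), device='cuda')
hidden = embed(tokens)  # (2, 128, 768) in the non-parallel case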
@ -97,27 +109,24 @@ class Embedding(nn.Module):

class PatchEmbedding(nn.Module):
"""
2D Image to Patch Embedding
"""2D Image to Patch Embedding.

:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.

More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""

def __init__(

@ -31,22 +31,35 @@ _vocab_parallel_classifier = {

class Linear(nn.Module):
"""
Linear layer of colossalai
"""Linear layer of colossalai.

:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param kwargs: Kwargs used for particular parallelisms
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.

Note: ``kwargs`` would contain different parameters when you use different parallelisms.

The ``kwargs`` should contain parameters below:
::

Linear1D:
gather_output: bool (optional, default to be false)
skip_bias_add: bool (optional, default to be false)
Linear2D:
skip_bias_add: bool (optional, default to be false)
Linear2p5D:
skip_bias_add: bool (optional, default to be false)
Linear3D:
None

More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""

def __init__(self,
|
||||
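As a rough illustration of the note above, the extra ``kwargs`` are simply forwarded to the parallelism-specific implementation; under 1D tensor parallelism one might write (a sketch; the values are illustrative)::

    from colossalai import nn as col_nn

    # gather_output / skip_bias_add only take effect for the parallel variants listed above
    linear = col_nn.Linear(1024, 4096, bias=True, gather_output=True, skip_bias_add=False)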
@ -88,21 +101,21 @@ class Linear(nn.Module):
|
||||
|
||||
|
||||
class Classifier(nn.Module):
|
||||
"""
|
||||
Classifier layer of colossalai
|
||||
"""Classifier layer of colossalai.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of total classes for the dataset
|
||||
:type num_classes: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
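The optional ``weight`` argument above is what allows tying the classifier to an embedding matrix; a sketch (assuming the wrapped embedding exposes its parameter as ``weight``)::

    from colossalai import nn as col_nn

    embed = col_nn.Embedding(num_embeddings=50304, embedding_dim=1024)
    # reuse the embedding matrix as the output projection (attribute name assumed)
    head = col_nn.Classifier(in_features=1024, num_classes=50304, weight=embed.weight)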
@ -19,18 +19,15 @@ _parallel_layernorm = {
|
||||
|
||||
|
||||
class LayerNorm(nn.Module):
|
||||
r"""
|
||||
Layer Normalization for colossalai
|
||||
r"""Layer Normalization for colossalai.
|
||||
|
||||
:param normalized_shape: input shape from an expected input
|
||||
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:type normalized_shape: int
|
||||
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
|
||||
:type eps: float, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None:
|
||||
|
@ -28,11 +28,10 @@ class Experts(MoeExperts):
|
||||
moe model parallel group, where E is the number of experts. Every expert
|
||||
is an instance of the class 'expert' given in the initialization parameters.
|
||||
|
||||
:param expert: The class of all experts
|
||||
:param num_experts: The number of experts
|
||||
:param expert_args: Args used to initialize experts
|
||||
|
||||
:type num_experts: int
|
||||
Args:
|
||||
expert_cls (:class:`torch.nn.Module`): The class of all experts
|
||||
num_experts (int): The number of experts
|
||||
expert_args: Args used to initialize experts; the accepted args can be found in the corresponding expert class.
|
||||
"""
|
||||
|
||||
def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args):
|
||||
|
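A sketch of how the expert container above is typically used; ``FFNExpert`` is a hypothetical expert module here, and the extra keyword arguments are forwarded to its constructor (import path assumed)::

    import torch.nn as nn
    from colossalai.nn.layer.moe import Experts   # assumed import path

    class FFNExpert(nn.Module):                    # hypothetical expert definition
        def __init__(self, d_model: int, d_ff: int):
            super().__init__()
            self.net = nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model))

        def forward(self, x):
            return self.net(x)

    experts = Experts(FFNExpert, num_experts=8, d_model=512, d_ff=2048)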
@ -18,19 +18,13 @@ class Top1Router(nn.Module):
|
||||
for routing usage. More detailed information can be found in the Switch Transformer paper
|
||||
of Google.
|
||||
|
||||
:param capacity_factor_train: Capacity factor in routing during training
|
||||
:param capacity_factor_eval: Capacity factor in routing during evaluation
|
||||
:param min_capacity: The minimum number of the capacity of each expert
|
||||
:param select_policy: The policy about tokens selection
|
||||
:param noisy_func: Noisy function used in logits
|
||||
:param drop_tks: Whether drops tokens in evaluation
|
||||
|
||||
:type capacity_factor_train: float, optional
|
||||
:type capacity_factor_eval: float, optional
|
||||
:type min_capacity: int, optional
|
||||
:type select_policy: str, optional
|
||||
:type noisy_func: Callable, optional
|
||||
:type drop_tks: bool, optional
|
||||
Args:
|
||||
capacity_factor_train (float, optional): Capacity factor in routing of training.
|
||||
capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
|
||||
min_capacity (int, optional): The minimum capacity of each expert.
|
||||
select_policy (str, optional): The policy about tokens selection.
|
||||
noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
|
||||
drop_tks (bool, optional): Whether to drop tokens in evaluation.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
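For instance, the router above might be configured as follows (a sketch; the import path and the ``select_policy`` value are assumptions)::

    from colossalai.nn.layer.moe import Top1Router, UniformNoiseGenerator   # assumed import path

    router = Top1Router(capacity_factor_train=1.25,
                        capacity_factor_eval=2.0,
                        min_capacity=4,
                        select_policy='random',
                        noisy_func=UniformNoiseGenerator(eps=1e-2))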
@ -119,17 +113,12 @@ class Top2Router(nn.Module):
|
||||
"""Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
|
||||
for routing usage. More detailed information can be found in the ViT-MoE paper.
|
||||
|
||||
:param capacity_factor_train: Capacity factor in routing during training
|
||||
:param capacity_factor_eval: Capacity factor in routing during evaluation
|
||||
:param min_capacity: The minimum number of the capacity of each expert
|
||||
:param noisy_func: Noisy function used in logits
|
||||
:param drop_tks: Whether drops tokens in evaluation
|
||||
|
||||
:type capacity_factor_train: float, optional
|
||||
:type capacity_factor_eval: float, optional
|
||||
:type min_capacity: int, optional
|
||||
:type noisy_func: Callable, optional
|
||||
:type drop_tks: bool, optional
|
||||
Args:
|
||||
capacity_factor_train (float, optional): Capacity factor in routing of training.
|
||||
capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
|
||||
min_capacity (int, optional): The minimum capacity of each expert.
|
||||
noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
|
||||
drop_tks (bool, optional): Whether to drop tokens in evaluation.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -239,15 +228,11 @@ class MoeLayer(nn.Module):
|
||||
the moe tensor group by all-to-all communication. Then it will get the output of all
|
||||
experts and exchange the output. Finally, it returns the output of the moe system.
|
||||
|
||||
:param dim_model: Dimension of model
|
||||
:param num_experts: The number of experts
|
||||
:param router: Instance of router used in routing
|
||||
:param experts: Instance of experts generated by Expert
|
||||
|
||||
:type dim_model: int
|
||||
:type num_experts: int
|
||||
:type router: nn.Module
|
||||
:type experts: nn.Module
|
||||
Args:
|
||||
dim_model (int): Dimension of model.
|
||||
num_experts (int): The number of experts.
|
||||
router (:class:`torch.nn.Module`): Instance of router used in routing.
|
||||
experts (:class:`torch.nn.Module`): Instance of experts generated by Expert.
|
||||
"""
|
||||
|
||||
def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts):
|
||||
|
@ -16,8 +16,8 @@ class NormalNoiseGenerator:
|
||||
All noise is generated from a normal distribution (0, 1 / E^2), where
|
||||
E = the number of experts.
|
||||
|
||||
:param num_experts: The number of experts
|
||||
:type num_experts: int
|
||||
Args:
|
||||
num_experts (int): The number of experts.
|
||||
"""
|
||||
|
||||
def __init__(self, num_experts: int):
|
||||
@ -37,8 +37,8 @@ class UniformNoiseGenerator:
|
||||
Makes models more resilient to rounding errors introduced by bfloat16.
|
||||
This seems particularly important for logits.
|
||||
|
||||
:param eps: Epsilon in generator
|
||||
:type eps: float
|
||||
Args:
|
||||
eps (float, optional): Epsilon in generator, defaults to 1e-2.
|
||||
"""
|
||||
|
||||
def __init__(self, eps: float = 1e-2):
|
||||
|
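Putting the pieces above together, a complete MoE block could be wired up roughly like this (a sketch; ``FFNExpert`` is the hypothetical expert module from the earlier sketch and the import paths are assumptions)::

    from colossalai.nn.layer.moe import Experts, MoeLayer, Top2Router, NormalNoiseGenerator  # assumed paths

    num_experts = 8
    router = Top2Router(capacity_factor_train=1.25,
                        noisy_func=NormalNoiseGenerator(num_experts))
    experts = Experts(FFNExpert, num_experts, d_model=512, d_ff=2048)
    moe = MoeLayer(dim_model=512, num_experts=num_experts, router=router, experts=experts)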
@ -7,17 +7,17 @@ except:
|
||||
|
||||
|
||||
class FusedLayerNormAffineFunction1D(torch.autograd.Function):
|
||||
r"""
|
||||
Layernorm
|
||||
r"""Layernorm
|
||||
|
||||
:param input: input maxtrix
|
||||
:param weight: weight matrix
|
||||
:param bias: bias matrix
|
||||
:param normalized_shape: input shape from an expected input
|
||||
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:param eps: a value added to the denominator for numerical stability
|
||||
Args:
|
||||
input: input matrix.
|
||||
weight: weight matrix.
|
||||
bias: bias matrix.
|
||||
normalized_shape: input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps: a value added to the denominator for numerical stability
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
|
@ -78,8 +78,9 @@ class _ReduceGrad(torch.autograd.Function):
|
||||
"""
|
||||
Pass the input to the model parallel region.
|
||||
|
||||
:param input_: input matrix
|
||||
:param parallel_mode: parallel mode
|
||||
Args:
|
||||
input_: input matrix.
|
||||
parallel_mode: parallel mode.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -99,9 +100,10 @@ class _ReduceGrad(torch.autograd.Function):
|
||||
class _ReduceInput(torch.autograd.Function):
|
||||
"""
|
||||
All-reduce the input from the model parallel region.
|
||||
|
||||
:param input_: input matrix
|
||||
:param parallel_mode: parallel mode
|
||||
|
||||
Args:
|
||||
input_: input matrix.
|
||||
parallel_mode: parallel mode.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -121,9 +123,10 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
|
||||
"""
|
||||
Split the input and keep only the corresponding chuck to the rank.
|
||||
|
||||
:param input_: input matrix
|
||||
:param parallel_mode: parallel mode
|
||||
:param dim: dimension
|
||||
Args:
|
||||
input_: input matrix.
|
||||
parallel_mode: parallel mode.
|
||||
dim: dimension
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -142,12 +145,12 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
|
||||
|
||||
|
||||
class _GatherForwardSplitBackward(torch.autograd.Function):
|
||||
"""
|
||||
Gather the input from model parallel region and concatinate.
|
||||
|
||||
:param input_: input matrix
|
||||
:param parallel_mode: parallel mode
|
||||
:param dim: dimension
|
||||
"""Gather the input from model parallel region and concatenate.
|
||||
|
||||
Args:
|
||||
input_: input matrix.
|
||||
parallel_mode: parallel mode.
|
||||
dim: dimension
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
|
@ -24,24 +24,23 @@ from ._utils import (gather_forward_split_backward, get_parallel_input, reduce_g
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear1D(torch.nn.Module):
|
||||
"""
|
||||
Linear layer for 1D parallelism
|
||||
r"""Linear layer for 1D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param out_features: size of each output sample
|
||||
:type out_features: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False
|
||||
:type skip_bias_add: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
gather_output (bool, optional): Whether to call all-gather on output, defaults to False.
|
||||
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -88,23 +87,21 @@ class Linear1D(torch.nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Classifier1D(ParallelLayer):
|
||||
"""RowLinear with given weight
|
||||
Classifier of 1D parallelism
|
||||
|
||||
:param in_features: size of input features
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes in the dataset
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
r"""RowLinear with given weight. Classifier of 1D parallelism.
|
||||
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -171,23 +168,21 @@ class Classifier1D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelClassifier1D(ParallelLayer):
|
||||
"""ColLinear with given weight
|
||||
Classifier of 1D parallelism
|
||||
|
||||
:param in_features: size of input features
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes in the dataset
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
r"""ColLinear with given weight. Classifier of 1D parallelism.
|
||||
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -249,30 +244,28 @@ class VocabParallelClassifier1D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear1D_Col(ParallelLayer):
|
||||
"""Linear layer with column parallelism.
|
||||
r"""Linear layer with column parallelism.
|
||||
|
||||
The linear layer is defined as :math:`Y = XA + b`. A is parallelized along
|
||||
its second dimension as :math:`A = [A_1, ..., A_p]`.
|
||||
|
||||
:param in_features: first dimension of matrix A.
|
||||
:type in_features: int
|
||||
:param output_size: second dimension of matrix A.
|
||||
:type output_size: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param gather_output: If true, call all-gether on output and make Y avaiable
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
gather_output (bool, optional): If true, call all-gather on output and make Y available
|
||||
to all GPUs, otherwise, every GPU will have its output
|
||||
which is :math:`Y_i = XA_i`, defaults to False
|
||||
:type gather_output: bool, optional
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False
|
||||
:type skip_bias_add: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -343,25 +336,23 @@ class Linear1D_Col(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear1D_Row(ParallelLayer):
|
||||
""" Linear layer with row parallelism
|
||||
r""" Linear layer with row parallelism
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param out_features: size of each output sample
|
||||
:type out_features: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param parallel_input: If set to ``True``, it's assumed that the input is splitted, defaults to False
|
||||
:type parallel_input: bool, optional
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False
|
||||
:type skip_bias_add: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
|
||||
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
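The column- and row-parallel layers above are usually chained so that no gather is needed between them; a sketch of the common MLP pattern (class names as in this diff, import path and sizes are illustrative)::

    import torch
    import torch.nn.functional as F
    from colossalai.nn.layer.parallel_1d import Linear1D_Col, Linear1D_Row   # assumed import path

    # A is split by columns, B by rows, so Y = GeLU(XA)B needs no intermediate all-gather
    dense_h_to_4h = Linear1D_Col(1024, 4096, gather_output=False)
    dense_4h_to_h = Linear1D_Row(4096, 1024, parallel_input=True)

    def mlp(x: torch.Tensor) -> torch.Tensor:
        return dense_4h_to_h(F.gelu(dense_h_to_4h(x)))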
@ -432,21 +423,33 @@ class Linear1D_Row(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Embedding1D(ParallelLayer):
|
||||
"""
|
||||
Embedding for 1D parallelism
|
||||
r"""Embedding for 1D parallelism.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
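For example, the ``args``/``kwargs`` listed above are forwarded straight to ``torch.nn.functional.embedding``, so a norm-clipped embedding could be requested like this (a sketch; import path and sizes are illustrative)::

    from colossalai.nn.layer.parallel_1d import Embedding1D   # assumed import path

    emb = Embedding1D(num_embeddings=50304, embedding_dim=1024,
                      padding_idx=0, max_norm=1.0, scale_grad_by_freq=True)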
@ -499,20 +502,33 @@ class Embedding1D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelEmbedding1D(torch.nn.Module):
|
||||
"""Embedding parallelized in the vocabulary dimension.
|
||||
r"""Embedding parallelized in the vocabulary dimension.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -578,13 +594,11 @@ class VocabParallelEmbedding1D(torch.nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Dropout1D(ParallelLayer):
|
||||
"""
|
||||
Dropout layer of 1D parallelism
|
||||
"""Dropout layer of 1D parallelism.
|
||||
|
||||
:param p: dropout rate, defaults to 0.5
|
||||
:type p: float, optional
|
||||
:param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False``
|
||||
:type inplace: bool, optional
|
||||
Args:
|
||||
p (float, optional): probability of an element to be zeroed, defaults to 0.5.
|
||||
inplace (bool, optional): whether to do dropout in-place, defaults to False.
|
||||
"""
|
||||
|
||||
def __init__(self, p: float = 0.5, inplace: bool = False):
|
||||
|
@ -21,27 +21,26 @@ def matmul_2d(
|
||||
row_parallel_mode=ParallelMode.PARALLEL_2D_ROW,
|
||||
col_parallel_mode=ParallelMode.PARALLEL_2D_COL,
|
||||
):
|
||||
"""
|
||||
Matrix multiplication for 2D parallelism
|
||||
r"""Matrix multiplication for 2D parallelism.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param summa_dim: dimension of SUMMA fo 2D parallelism
|
||||
:type summa_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row, defaults to None
|
||||
:type row_rank: int, optional
|
||||
:param col_rank: the rank of column, defaults to None
|
||||
:type col_rank: int, optional
|
||||
:param row_parallel_mode: row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW
|
||||
:type row_parallel_mode: str, optional
|
||||
:param col_parallel_mode: column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL
|
||||
:type col_parallel_mode: str, optional
|
||||
:return: :math:`C = AB`
|
||||
:rtype: torch.tensor
|
||||
Args:
|
||||
a (:class:`torch.tensor`): matrix :math:`A`.
|
||||
b (:class:`torch.tensor`): matrix :math:`B`.
|
||||
summa_dim (int): dimension of SUMMA for 2D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
|
||||
row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
|
||||
column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL.
|
||||
|
||||
Returns:
|
||||
:class:`torch.tensor`: :math:`C = AB`.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
if row_rank is None:
|
||||
row_rank = gpc.get_local_rank(col_parallel_mode)
|
||||
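A sketch of calling the helper above from inside an initialized 2D-parallel run (shapes are those of the local partitions and purely illustrative; ``summa_dim`` is the square root of the 2D tensor-parallel world size)::

    import torch
    from colossalai.nn.layer.parallel_2d._operation import matmul_2d   # assumed import path

    a = torch.randn(128, 256).cuda()    # local partition of A
    b = torch.randn(256, 512).cuda()    # local partition of B
    c = matmul_2d(a, b, summa_dim=2, out_shape=(128, 512))   # local partition of C = AB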
@ -135,35 +134,26 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
|
||||
row_rank: int, col_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode,
|
||||
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
|
||||
tensor_parallel_size: int) -> Tensor:
|
||||
"""
|
||||
2D parallel classifier
|
||||
r"""2D parallel classifier.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param bias: matrix of bias
|
||||
:type bias: torch.tensor, optional
|
||||
:param summa_dim: dimension of SUMMA fo 2D parallelism
|
||||
:type summa_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
bias (:class:`torch.tensor`, optional): matrix of bias.
|
||||
summa_dim (int): dimension of SUMMA for 2D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Classifier2D.apply(A, B, bias, summa_dim, out_shape, row_rank, col_rank, row_parallel_mode,
|
||||
col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
|
||||
@ -171,33 +161,25 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
|
||||
|
||||
|
||||
class Matmul_AB_2D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = AB`
|
||||
r"""Matrix multiplication for :math:`C = AB`.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param summa_dim: dimension of SUMMA fo 2D parallelism
|
||||
:type summa_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
summa_dim (int): dimension of SUMMA for 2D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
@staticmethod
|
||||
@custom_fwd(cast_inputs=torch.float16)
|
||||
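For reference, the block identity that this kernel (and the transposed variants below) implements is the standard SUMMA decomposition, which the diff does not spell out: with :math:`A` and :math:`B` partitioned into :math:`q \times q` blocks, where :math:`q` is ``summa_dim``, the rank at grid position :math:`(i, j)` accumulates

.. math::

    C_{ij} = \sum_{k=0}^{q-1} A_{ik} B_{kj}

so each step broadcasts one block of :math:`A` along its row and one block of :math:`B` along its column before a local matmul.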
@ -305,33 +287,26 @@ class Matmul_AB_2D(torch.autograd.Function):
|
||||
|
||||
|
||||
class Matmul_ABT_2D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = AB^T`
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param summa_dim: dimension of SUMMA fo 2D parallelism
|
||||
:type summa_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
r"""Matrix multiplication for :math:`C = AB^T`
|
||||
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
summa_dim (int): dimension of SUMMA for 2D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
@staticmethod
|
||||
@custom_fwd(cast_inputs=torch.float16)
|
||||
@ -445,33 +420,25 @@ class Matmul_ABT_2D(torch.autograd.Function):
|
||||
|
||||
|
||||
class Matmul_ATB_2D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = A^TB`
|
||||
r"""Matrix multiplication for :math:`C = A^TB`.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param summa_dim: dimension of SUMMA fo 2D parallelism
|
||||
:type summa_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
summa_dim (int): dimension of SUMMA for 2D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
@staticmethod
|
||||
@custom_fwd(cast_inputs=torch.float16)
|
||||
@ -639,33 +606,26 @@ def add_bias_2d(input_: Tensor, bias: Tensor, output_size_per_partition: int, ro
|
||||
row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, skip_bias_add: bool,
|
||||
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
|
||||
tensor_parallel_size: int) -> Tensor:
|
||||
"""
|
||||
Matrix add bias: :math:`C = A + b`
|
||||
r"""Matrix add bias: :math:`C = A + b`.
|
||||
|
||||
:param input_: matrix :math:`A`
|
||||
:type input_: torch.tensor
|
||||
:param bias: matrix :math:`b`
|
||||
:type bias: torch.tensor
|
||||
:param output_size_per_partition: size of ouput per partition
|
||||
:type output_size_per_partition: int
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion
|
||||
:type skip_bias_add: bool
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): matrix :math:`A`.
|
||||
bias (:class:`torch.tensor`): matrix :math:`b`.
|
||||
output_size_per_partition (int): size of output per partition.
|
||||
row_rank (int, optional): the rank of row, defaults to None.
|
||||
col_rank (int, optional): the rank of column, defaults to None.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
skip_bias_add (bool):
|
||||
If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Add_Bias_2D.apply(input_, bias, output_size_per_partition, row_rank, col_rank, row_parallel_mode,
|
||||
col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
|
||||
@ -711,21 +671,19 @@ class _Layernorm_2D(torch.autograd.Function):
|
||||
|
||||
def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode,
|
||||
col_parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
Layernorm
|
||||
r"""Layernorm.
|
||||
|
||||
:param input_: input maxtrix
|
||||
:type input_: torch.tensor
|
||||
:param E_x: mean
|
||||
:type E_x: torch.tensor
|
||||
:param Var_x: variance
|
||||
:type Var_x: torch.tensor
|
||||
:param hidden_size: hidden size
|
||||
:type hidden_size: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
E_x (:class:`torch.tensor`): mean.
|
||||
Var_x (:class:`torch.tensor`): variance.
|
||||
hidden_size (int): hidden size.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Layernorm_2D.apply(input_, E_x, Var_x, hidden_size, row_parallel_mode, col_parallel_mode)
|
||||
|
||||
@ -748,27 +706,29 @@ class _AllGatherTensor2D(torch.autograd.Function):
|
||||
|
||||
|
||||
def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
All gather the tensor of 2D parallelism
|
||||
r"""All gather the tensor of 2D parallelism.
|
||||
|
||||
:param inputs: input maxtrix
|
||||
:type inputs: torch.tensor
|
||||
:param dim: dimension to gather
|
||||
:type dim: int
|
||||
:param parallel_mode: parallel mode
|
||||
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Dimension to gather.
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode the tensor is used in.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _AllGatherTensor2D.apply(tensor, dim, parallel_mode)
|
||||
|
||||
|
||||
def split_tensor_2d(input_: Tensor, dim: int = 0) -> Tensor:
|
||||
"""Splits 2D tensor in specified dimension across cols
|
||||
:param input_: Input tensor
|
||||
:param dim: Specified dimension in which to split
|
||||
:type input_: torch.Tensor
|
||||
:type dim: int, optional
|
||||
:return output: Splitted tensor
|
||||
:rtype output: torch.Tensor
|
||||
"""Splits 2D tensor in specified dimension across cols.
|
||||
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Specified dimension in which to split.
|
||||
|
||||
Returns:
|
||||
:class:`torch.tensor`: The split tensor.
|
||||
"""
|
||||
if input_.size(dim) <= 1:
|
||||
return input_
|
||||
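A quick sketch of how the split and gather helpers above pair up (illustrative only; it assumes an initialized 2D parallel context and that the column parallel mode is the one the split was taken over)::

    import torch
    from colossalai.context import ParallelMode
    from colossalai.nn.layer.parallel_2d._operation import split_tensor_2d, all_gather_tensor_2d  # assumed path

    full = torch.randn(8, 16).cuda()
    local = split_tensor_2d(full, dim=0)                                      # keep this rank's chunk
    restored = all_gather_tensor_2d(local, 0, ParallelMode.PARALLEL_2D_COL)   # undo the split (mode assumed)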
@ -787,11 +747,15 @@ class _ReduceTensor2D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
All-reduce the input.
|
||||
|
||||
:param input_: input tensor
|
||||
:param parallel_mode: parallel mode
|
||||
r"""All-reduce the input.
|
||||
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): Input tensor.
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode the tensor is used in.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceTensor2D.apply(input_, parallel_mode)
|
||||
|
||||
@ -809,12 +773,16 @@ class _ReduceScatterTensor2D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
Reduce-scatter the input.
|
||||
|
||||
:param tensor: Input tensor
|
||||
:param dim: Dimension to scatter
|
||||
:param parallel_mode: Parallel mode
|
||||
r"""Reduce-scatter the input.
|
||||
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Dimension to reduce.
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode the tensor is used in.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceScatterTensor2D.apply(tensor, dim, parallel_mode)
|
||||
|
||||
@ -849,11 +817,11 @@ class _ReduceByBatch2D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_by_batch_2d(input_, reduce_mean: bool = False) -> Tensor:
|
||||
"""All-reduce the input from the model parallel region.
|
||||
r"""All-reduce the input from the model parallel region.
|
||||
|
||||
:param input_: input maxtrix
|
||||
:type input_: torch.tensor
|
||||
:param reduce_mean: If set to ``True``, it will divide the output by column parallel size, default to False
|
||||
:type reduce_mean: bool, optional
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
reduce_mean (bool, optional):
|
||||
If set to ``True``, it will divide the output by column parallel size, defaults to False.
|
||||
"""
|
||||
return _ReduceByBatch2D.apply(input_, reduce_mean)
|
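For example, the helper above is what one would reach for to average a scalar metric over the 2D column parallel group (a sketch; the import path is assumed)::

    import torch
    from colossalai.nn.layer.parallel_2d._operation import reduce_by_batch_2d   # assumed import path

    preds = torch.tensor([1, 2, 1]).cuda()
    labels = torch.tensor([1, 0, 1]).cuda()
    correct = (preds == labels).sum().float()
    correct = reduce_by_batch_2d(correct, reduce_mean=True)   # mean over the 2D column parallel group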
@ -22,23 +22,22 @@ from ._utils import assert_summa_initialization, get_summa_dim_from_env
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear2D(ParallelLayer):
|
||||
"""
|
||||
Linear layer for 2D parallelism
|
||||
r"""Linear layer for 2D parallelism
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param out_features: size of each output sample
|
||||
:type out_features: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False
|
||||
:type skip_bias_add: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
@ -119,18 +118,16 @@ class Linear2D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class LayerNorm2D(ParallelLayer):
|
||||
r"""
|
||||
Layer Normalization for 2D parallelism
|
||||
r"""Layer Normalization for 2D parallelism.
|
||||
|
||||
:param normalized_shape: input shape from an expected input
|
||||
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:type normalized_shape: int
|
||||
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
|
||||
:type eps: float, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
|
||||
super().__init__()
|
||||
@ -189,27 +186,24 @@ class LayerNorm2D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class PatchEmbedding2D(ParallelLayer):
|
||||
"""
|
||||
2D Image to Patch Embedding
|
||||
r"""2D Image to Patch Embedding.
|
||||
|
||||
:param img_size: image size
|
||||
:type img_size: int
|
||||
:param patch_size: patch size
|
||||
:type patch_size: int
|
||||
:param in_chans: number of channels of input image
|
||||
:type in_chans: int
|
||||
:param embed_size: size of embedding
|
||||
:type embed_size: int
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param flatten: whether to flatten output tensor, defaults to True
|
||||
:type flatten: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
:param position_embed_initializer: The intializer of position embedding, defaults to zero
|
||||
:type position_embed_initializer: typing.Callable, optional
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
img_size: int,
|
||||
@ -291,21 +285,33 @@ class PatchEmbedding2D(ParallelLayer):
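For reference, the computation that PatchEmbedding2D parallelizes is the standard ViT patch embedding: a strided convolution cuts the image into non-overlapping patches and projects each one to ``embed_size``. A minimal single-device sketch, illustrative only; the layer above additionally partitions the projection weight, bias and position embedding across the 2D process grid.

import torch
import torch.nn as nn

class NaivePatchEmbedding(nn.Module):
    """Single-device reference: image -> (B, num_patches, embed_size)."""
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_size=768, flatten=True):
        super().__init__()
        assert img_size % patch_size == 0, 'image size must be divisible by patch size'
        self.flatten = flatten
        # a strided convolution is equivalent to splitting into patches plus a linear projection
        self.proj = nn.Conv2d(in_chans, embed_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)                      # (B, embed_size, H/P, W/P)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # (B, num_patches, embed_size)
        return x

patches = NaivePatchEmbedding()(torch.randn(2, 3, 224, 224))
print(patches.shape)  # torch.Size([2, 196, 768])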
|
||||
|
||||
@LAYERS.register_module
class Embedding2D(ParallelLayer):
"""
Embedding for 2D parallelism
r"""Embedding for 2D parallelism.

:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.

The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
::

max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.

More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.

More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
"""
def __init__(self,
num_embeddings: int,
@ -358,20 +364,33 @@ class Embedding2D(ParallelLayer):
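The ``args``/``kwargs`` forwarded to ``torch.nn.functional.embedding`` behave exactly as in plain PyTorch. A minimal single-device illustration of ``padding_idx``, ``max_norm`` and ``scale_grad_by_freq`` (unrelated to the 2D weight partitioning itself):

import torch
import torch.nn.functional as F

weight = torch.randn(10, 4, requires_grad=True)   # 10 embeddings of dimension 4
tokens = torch.tensor([[1, 2, 0, 5]])             # index 0 is the padding token

# the padding_idx row gets no gradient; vectors longer than max_norm are renormalized in-place
out = F.embedding(tokens, weight, padding_idx=0, max_norm=1.0, scale_grad_by_freq=True)
out.sum().backward()
print(weight.grad[0])   # all zeros: the "pad" row is never updated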
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelEmbedding2D(torch.nn.Module):
|
||||
"""Embedding parallelized in the vocabulary dimension.
|
||||
r"""Embedding parallelized in the vocabulary dimension.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
@ -435,23 +454,21 @@ class VocabParallelEmbedding2D(torch.nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Classifier2D(ParallelLayer):
|
||||
"""
|
||||
Classifier for 2D parallelism
|
||||
r"""Classifier for 2D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
@ -515,23 +532,21 @@ class Classifier2D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelClassifier2D(ParallelLayer):
|
||||
"""
|
||||
Vocab parallel classifier layer for 2D parallelism
|
||||
r"""Vocab parallel classifier layer for 2D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
|
@ -100,35 +100,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
|
||||
...], row_rank: int, col_rank: int,
|
||||
row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, data_parallel_rank: int,
|
||||
pipeline_parallel_rank: int, pipeline_parallel_size: int, tensor_parallel_size: int) -> Tensor:
|
||||
"""
|
||||
Classifier
|
||||
r"""Classifier.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param bias: matrix of bias
|
||||
:type bias: torch.tensor, optional
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
bias (:class:`torch.tensor`): matrix of bias.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int): the rank of row.
|
||||
col_rank (int): the rank of column.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank.
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Classifier2p5D.apply(A, B, bias, tesseract_dim, out_shape, row_rank, col_rank, row_parallel_mode,
|
||||
col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
|
||||
@ -136,35 +127,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
|
||||
|
||||
|
||||
class Matmul_AB_2p5D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = AB`
|
||||
r"""Matrix multiplication for :math:`C = AB`.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param dep_rank: the rank of depth
|
||||
:type dep_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int): the rank of row.
|
||||
col_rank (int): the rank of column.
|
||||
dep_rank (int): the rank of depth.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank.
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -270,35 +252,26 @@ class Matmul_AB_2p5D(torch.autograd.Function):
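As a mental model for ``Matmul_AB_2p5D`` (not its actual communication schedule), :math:`C = AB` can be assembled from block partitions: every output block is a sum of partial products, and in the parallel kernel that sum is realized by broadcasts and reductions over the row and column groups. A toy single-process sketch with a 2x2 block grid:

import torch

q = 2                                   # toy "tesseract" dimension
A = torch.randn(4, 6)
B = torch.randn(6, 8)

# split A and B into a q x q grid of blocks
A_blocks = [list(chunk.chunk(q, dim=1)) for chunk in A.chunk(q, dim=0)]
B_blocks = [list(chunk.chunk(q, dim=1)) for chunk in B.chunk(q, dim=0)]

# each output block C[i][j] is the sum over k of A[i][k] @ B[k][j];
# in the parallel layer this sum is what the collectives over the process grid compute
C_blocks = [[sum(A_blocks[i][k] @ B_blocks[k][j] for k in range(q)) for j in range(q)] for i in range(q)]
C = torch.cat([torch.cat(row, dim=1) for row in C_blocks], dim=0)

assert torch.allclose(C, A @ B, atol=1e-5)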
|
||||
|
||||
|
||||
class Matmul_ABT_2p5D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = AB^T`
|
||||
r"""Matrix multiplication for :math:`C = AB^T`.
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param dep_rank: the rank of depth
|
||||
:type dep_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int): the rank of row.
|
||||
col_rank (int): the rank of column.
|
||||
dep_rank (int): the rank of depth.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank.
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -409,35 +382,26 @@ class Matmul_ABT_2p5D(torch.autograd.Function):
|
||||
|
||||
|
||||
class Matmul_ATB_2p5D(torch.autograd.Function):
|
||||
"""
|
||||
Matrix multiplication for :math:`C = A^TB`
|
||||
r"""Matrix multiplication for :math:`C = A^TB`
|
||||
|
||||
:param a: matrix :math:`A`
|
||||
:type a: torch.tensor
|
||||
:param b: matrix :math:`B`
|
||||
:type b: torch.tensor
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param out_shape: shape of output tensor
|
||||
:type out_shape: tuple
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param dep_rank: the rank of depth
|
||||
:type dep_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
A (:class:`torch.tensor`): matrix :math:`A`.
|
||||
B (:class:`torch.tensor`): matrix :math:`B`.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
out_shape (:class:`torch.size`): shape of output tensor.
|
||||
row_rank (int): the rank of row.
|
||||
col_rank (int): the rank of column.
|
||||
dep_rank (int): the rank of depth.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank.
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -629,36 +593,27 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
|
||||
col_rank: int, dep_rank: int, col_parallel_mode: ParallelMode, skip_bias_add: bool,
|
||||
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
|
||||
tensor_parallel_size: int) -> Tensor:
|
||||
"""
|
||||
Matrix add bias: :math:`C = A + b`
|
||||
r"""Matrix add bias: :math:`C = A + b`.
|
||||
|
||||
:param input: matrix :math:`A`
|
||||
:type input: torch.tensor
|
||||
:param bias: matrix :math:`b`
|
||||
:type bias: torch.tensor
|
||||
:param output_size_per_partition: output size in each partition
|
||||
:type output_size_per_partition: int
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param row_rank: the rank of row
|
||||
:type row_rank: int
|
||||
:param col_rank: the rank of column
|
||||
:type col_rank: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion
|
||||
:type skip_bias_add: bool
|
||||
:param data_parallel_rank: data parallel rank
|
||||
:type data_parallel_rank: int
|
||||
:param pipeline_parallel_rank: pipeline parallel rank
|
||||
:type pipeline_parallel_rank: int
|
||||
:param pipeline_parallel_size: pipeline parallel size
|
||||
:type pipeline_parallel_size: int
|
||||
:param tensor_parallel_size: tensor parallel size
|
||||
:type tensor_parallel_size: int
|
||||
Args:
|
||||
input (:class:`torch.tensor`): matrix :math:`A`.
|
||||
bias (:class:`torch.tensor`): matrix :math:`B`.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
output_size_per_partition (int): output size in each partition.
|
||||
row_rank (int): the rank of row.
|
||||
col_rank (int): the rank of column.
|
||||
dep_rank (int): the rank of depth.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion.
|
||||
data_parallel_rank (int): data parallel rank.
|
||||
pipeline_parallel_rank (int): pipeline parallel rank.
|
||||
pipeline_parallel_size (int): pipeline parallel size.
|
||||
tensor_parallel_size (int): tensor parallel size.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Add_Bias_2p5D.apply(input, bias, output_size_per_partition, tesseract_dim, row_rank, col_rank, dep_rank,
|
||||
col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
|
||||
@ -666,19 +621,18 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
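The ``skip_bias_add`` flag follows the usual kernel-fusion convention: instead of adding the bias inside the layer, the un-biased output and the bias are returned separately so a later fused kernel can apply it. A schematic sketch of that calling convention; the helper name below is hypothetical, not part of the library:

import torch

def linear_with_optional_bias(x, weight, bias, skip_bias_add=False):
    # hypothetical helper illustrating the convention used by skip_bias_add callers
    out = x @ weight.t()
    if skip_bias_add:
        return out, bias          # defer the add so it can be fused downstream
    return out + bias, None

x, w, b = torch.randn(4, 8), torch.randn(16, 8), torch.randn(16)
out, deferred_bias = linear_with_optional_bias(x, w, b, skip_bias_add=True)
fused = torch.nn.functional.gelu(out + deferred_bias)   # bias applied inside a later "fused" step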
|
||||
|
||||
|
||||
class _Layernorm2p5D(torch.autograd.Function):
|
||||
"""
|
||||
Layernorm
|
||||
r"""Layernorm.
|
||||
|
||||
:param input: input maxtrix
|
||||
:type input: torch.tensor
|
||||
:param E_x: mean
|
||||
:type E_x: torch.tensor
|
||||
:param Var_x: variance
|
||||
:type Var_x: torch.tensor
|
||||
:param hidden_size: hidden size
|
||||
:type hidden_size: int
|
||||
:param row_parallel_mode: row parallel mode
|
||||
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
input (:class:`torch.tensor`): input matrix.
|
||||
E_x (:class:`torch.tensor`): mean.
|
||||
Var_x (:class:`torch.tensor`): variance.
|
||||
hidden_size (int): hidden size.
|
||||
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -718,19 +672,18 @@ class _Layernorm2p5D(torch.autograd.Function):
|
||||
|
||||
def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int,
row_parallel_mode: ParallelMode) -> Tensor:
"""
Layernorm
r"""Layernorm.

:param input: input maxtrix
:type input: torch.tensor
:param E_x: mean
:type E_x: torch.tensor
:param Var_x: variance
:type Var_x: torch.tensor
:param hidden_size: hidden size
:type hidden_size: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
Args:
input (:class:`torch.tensor`): input matrix.
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.

Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Layernorm2p5D.apply(input, E_x, Var_x, hidden_size, row_parallel_mode)
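Numerically, the 2.5D layernorm applies the usual normalization once the per-sample statistics have been reduced across the row parallel group; note that the exact convention for ``Var_x`` inside the kernel may differ (it may already hold the inverse standard deviation). A generic single-device sketch of the final step, with the distributed reduction omitted:

import torch

def layernorm_from_stats(x, mean, var, weight, bias, eps=1e-5):
    # generic layer norm once per-sample statistics are available;
    # in the 2.5D kernel these statistics come from a reduction over the row group
    x_hat = (x - mean) / torch.sqrt(var + eps)
    return x_hat * weight + bias

x = torch.randn(2, 16)
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)
w, b = torch.ones(16), torch.zeros(16)
out = layernorm_from_stats(x, mean, var, w, b)
assert torch.allclose(out, torch.nn.functional.layer_norm(x, (16,), w, b), atol=1e-5)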
@ -753,29 +706,31 @@ class _AllGatherTensor2p5D(torch.autograd.Function):
|
||||
|
||||
|
||||
def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
all gather the weight of 2.5D parallelism
|
||||
r"""all gather the weight of 2.5D parallelism.
|
||||
|
||||
:param inputs: input maxtrix
|
||||
:type inputs: torch.tensor
|
||||
:param dim: dimension of all gather
|
||||
:type dim: int
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
inputs (:class:`torch.tensor`): input tensor.
|
||||
dim (int): dimension of all-gather.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
return _AllGatherTensor2p5D.apply(inputs, dim, col_parallel_mode)
|
||||
|
||||
|
||||
class SplitFirst(torch.autograd.Function):
|
||||
"""
|
||||
:param inputs: input maxtrix
|
||||
:type inputs: torch.tensor
|
||||
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
|
||||
:type tesseract_dim: int
|
||||
:param col_parallel_mode: column parallel mode
|
||||
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
r"""
|
||||
|
||||
Args:
|
||||
inputs (:class:`torch.tensor`): input tensor.
|
||||
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
|
||||
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -801,16 +756,14 @@ class SplitFirst(torch.autograd.Function):
|
||||
|
||||
|
||||
def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor:
"""Splits 2P5D tensor in specified dimension across cols
"""Splits 2P5D tensor in specified dimension across cols.

:param input_: Input tensor
:param dim: Specified dimension in which to split
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.

:type input_: torch.Tensor
:type dim: int, optional

:return output: Splitted tensor
:rtype output: torch.Tensor
Returns:
:class:`torch.tensor`: The tensor has been split.
"""
if input_.size(dim) <= 1:
return input_
@ -829,11 +782,15 @@ class _ReduceTensor2p5D(torch.autograd.Function):
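Locally, this kind of split amounts to ``torch.chunk`` along the chosen dimension with each rank keeping its own shard. A toy emulation with a hypothetical helper and no real process group:

import torch

def local_shard(tensor, dim, world_size, rank):
    # what each rank would keep after a split along `dim`
    return tensor.chunk(world_size, dim=dim)[rank].contiguous()

x = torch.arange(12).reshape(4, 3)
print(local_shard(x, dim=0, world_size=2, rank=0))   # rows 0-1
print(local_shard(x, dim=0, world_size=2, rank=1))   # rows 2-3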
|
||||
|
||||
|
||||
def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
All-reduce the input.
|
||||
r"""All-reduce the input.
|
||||
|
||||
:param input_: input tensor
|
||||
:param parallel_mode: parallel mode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): Input tensor.
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceTensor2p5D.apply(input_, parallel_mode)
|
||||
|
||||
@ -851,11 +808,16 @@ class _ReduceScatterTensor2p5D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
Reduce-scatter the input.
|
||||
r"""Reduce-scatter the input.
|
||||
|
||||
:param input_: input tensor
|
||||
:param parallel_mode: parallel mode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Dimension to reduce.
|
||||
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceScatterTensor2p5D.apply(input_, dim, parallel_mode)
|
||||
|
||||
@ -890,12 +852,11 @@ class _RreduceByBatch2p5D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_by_batch_2p5d(input_, reduce_mean: bool = False) -> Tensor:
"""
All-reduce the input from the model parallel region.
r"""All-reduce the input from the model parallel region.

:param input_: input maxtrix
:type input_: torch.tensor
:param reduce_mean: If set to ``True``, it will divide the output by column parallel size, default to False
:type reduce_mean: bool, optional
Args:
input_ (:class:`torch.tensor`): input matrix.
reduce_mean (bool, optional):
If set to ``True``, it will divide the output by column parallel size, default to False.
"""
return _RreduceByBatch2p5D.apply(input_, reduce_mean)
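With ``reduce_mean=True`` the operation is an all-reduce followed by a division by the column parallel world size, which turns per-rank partial sums (e.g. of a loss or a metric) into an average. A single-process emulation of that arithmetic:

import torch

# pretend these are the per-rank values living in a column parallel group of size 4
per_rank_values = [torch.tensor(2.0), torch.tensor(3.0), torch.tensor(5.0), torch.tensor(6.0)]

summed = torch.stack(per_rank_values).sum()      # what all-reduce(SUM) would leave on every rank
mean = summed / len(per_rank_values)             # the extra division done when reduce_mean=True
print(summed.item(), mean.item())                # 16.0 4.0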
@ -23,21 +23,22 @@ from ._utils import assert_tesseract_initialization, get_tesseract_dim_dep_from_
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear2p5D(ParallelLayer):
|
||||
"""
|
||||
Linear layer for 2.5D parallelism
|
||||
r"""Linear layer for 2.5D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param out_features: size of each output sample
|
||||
:type out_features: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
|
||||
which is preserved for kernel fusion, defaults to False.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
@ -131,19 +132,16 @@ class Linear2p5D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class LayerNorm2p5D(ParallelLayer):
|
||||
r"""
|
||||
Layer Normalization for 2.5D parallelism
|
||||
r"""Layer Normalization for 2.5D parallelism.
|
||||
|
||||
:param normalized_shape: input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:type normalized_shape: int
|
||||
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
|
||||
:type eps: float, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
|
||||
super().__init__()
|
||||
@ -204,27 +202,24 @@ class LayerNorm2p5D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class PatchEmbedding2p5D(ParallelLayer):
|
||||
"""
|
||||
2D Image to Patch Embedding
|
||||
r"""2D Image to Patch Embedding.
|
||||
|
||||
:param img_size: image size
|
||||
:type img_size: int
|
||||
:param patch_size: patch size
|
||||
:type patch_size: int
|
||||
:param in_chans: number of channels of input image
|
||||
:type in_chans: int
|
||||
:param embed_size: size of embedding
|
||||
:type embed_size: int
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param flatten: whether to flatten output tensor, defaults to True
|
||||
:type flatten: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
:param position_embed_initializer: The intializer of position embedding, defaults to zero
|
||||
:type position_embed_initializer: typing.Callable, optional
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
img_size: int,
|
||||
@ -306,21 +301,33 @@ class PatchEmbedding2p5D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Embedding2p5D(ParallelLayer):
|
||||
"""
|
||||
Embedding for 2.5D parallelism
|
||||
r"""Embedding for 2.5D parallelism.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
|
||||
"""
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
@ -376,18 +383,31 @@ class Embedding2p5D(ParallelLayer):
|
||||
class VocabParallelEmbedding2p5D(torch.nn.Module):
|
||||
"""Embedding parallelized in the vocabulary dimension.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
@ -455,23 +475,21 @@ class VocabParallelEmbedding2p5D(torch.nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Classifier2p5D(ParallelLayer):
|
||||
"""
|
||||
Classifier for 2.5D parallelism
|
||||
r"""Classifier for 2.5D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
@ -537,23 +555,21 @@ class Classifier2p5D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelClassifier2p5D(ParallelLayer):
|
||||
"""
|
||||
Vocab parallel classifier layer for 2.5D parallelism
|
||||
r"""Vocab parallel classifier layer for 2.5D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features: int,
|
||||
|
@ -88,27 +88,22 @@ def linear_3d(input_: Tensor,
|
||||
input_dim: int = 0,
|
||||
weight_dim: int = -1,
|
||||
output_dim: int = 0) -> Tensor:
|
||||
"""
|
||||
Linear layer for 3D parallelism
|
||||
r"""Linear layer for 3D parallelism.
|
||||
|
||||
:param input_: matrix of input
|
||||
:type input_: torch.tensor
|
||||
:param weight: matrix of weight
|
||||
:type weight: torch.tensor
|
||||
:param bias: matrix of bias
|
||||
:type bias: torch.tensor, optional
|
||||
:param input_parallel_mode: input parallel mode
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: weight parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param output_parallel_mode: output parallel mode
|
||||
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param input_dim: dimension of input, defaults to 0
|
||||
:type input_dim: int, optional
|
||||
:param weight_dim: dimension of weight, defaults to -1
|
||||
:type weight_dim: int, optional
|
||||
:param output_dim: dimension of output, defaults to 0
|
||||
:type output_dim: int, optional
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
weight (:class:`torch.tensor`): matrix of weight.
|
||||
bias (:class:`torch.tensor`): matrix of bias.
|
||||
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
|
||||
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
|
||||
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
|
||||
input_dim (int, optional): dimension of input, defaults to 0.
|
||||
weight_dim (int, optional): dimension of weight, defaults to -1.
|
||||
output_dim (int, optional): dimension of output, defaults to 0.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Linear3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode,
|
||||
input_dim, weight_dim, output_dim)
|
||||
@ -174,21 +169,19 @@ class _Classifier3D(torch.autograd.Function):
|
||||
|
||||
def classifier_3d(input_: Tensor, weight: Tensor, bias: Optional[Tensor], input_parallel_mode: ParallelMode,
|
||||
weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
3D parallel classifier
|
||||
r"""3D parallel classifier.
|
||||
|
||||
:param input_: matrix of input
|
||||
:type input_: torch.tensor
|
||||
:param weight: matrix of weight
|
||||
:type weight: torch.tensor
|
||||
:param bias: matrix of bias
|
||||
:type bias: torch.tensor, optional
|
||||
:param input_parallel_mode: input parallel mode
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: weight parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param output_parallel_mode: output parallel mode
|
||||
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
weight (:class:`torch.tensor`): matrix of weight.
|
||||
bias (:class:`torch.tensor`): matrix of bias.
|
||||
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
|
||||
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
|
||||
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Classifier3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode)
|
||||
|
||||
@ -244,48 +237,44 @@ class _Layernorm3D(torch.autograd.Function):
|
||||
def layernorm_3d(input_: Tensor, weight: Tensor, bias: Tensor, normalized_shape: int, eps: float,
|
||||
input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode,
|
||||
output_parallel_mode: ParallelMode) -> Tensor:
|
||||
r"""
|
||||
3D parallel Layernorm
|
||||
r"""3D parallel Layernorm.
|
||||
|
||||
:param input_: input maxtrix
|
||||
:type input_: torch.tensor
|
||||
:param weight: matrix of weight
|
||||
:type weight: torch.tensor
|
||||
:param bias: matrix of bias
|
||||
:type bias: torch.tensor
|
||||
:param normalized_shape: input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:type normalized_shape: int
|
||||
:param eps: a value added to the denominator for numerical stability
|
||||
:type eps: float
|
||||
:param input_parallel_mode: input parallel mode
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: weight parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param output_parallel_mode: output parallel mode
|
||||
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
weight (:class:`torch.tensor`): matrix of weight.
|
||||
bias (:class:`torch.tensor`): matrix of bias.
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float): a value added to the denominator for numerical stability
|
||||
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
|
||||
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
|
||||
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _Layernorm3D.apply(input_, weight, bias, normalized_shape, eps, input_parallel_mode, weight_parallel_mode,
|
||||
output_parallel_mode)
|
||||
|
||||
|
||||
def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""Splits 3D parallel tensor in specified dimension
|
||||
r"""Splits 3D parallel tensor in specified dimension.
|
||||
|
||||
:param tensor: Input tensor
|
||||
:param dim: Specified dimension in which to split
|
||||
:param parallel_mode: Parallel mode
|
||||
:param weight_parallel_mode: Weight parallel mode
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Specified dimension in which to split.
|
||||
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode.
|
||||
|
||||
:type tensor: torch.Tensor
|
||||
:type dim: int
|
||||
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Returns:
|
||||
:class:`torch.tensor`: The tensor has been split.
|
||||
|
||||
:return output: Splitted tensor
|
||||
:rtype output: torch.Tensor
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
if tensor.size(dim) <= 1:
|
||||
return tensor
|
||||
@ -298,17 +287,20 @@ def split_batch_3d(input_: Tensor,
|
||||
dim: int = 0,
|
||||
input_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_INPUT,
|
||||
weight_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_WEIGHT) -> Tensor:
|
||||
"""Splits 3D tensor in batch
|
||||
:param input_: Input tensor
|
||||
:param dim: Specified dimension in which to split
|
||||
:param input_parallel_mode: Input parallel mode
|
||||
:param weight_parallel_mode: Weight parallel mode
|
||||
:type input_: torch.Tensor
|
||||
:type dim: int, optional
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
|
||||
:return output: Splitted tensor
|
||||
:rtype output: torch.Tensor
|
||||
r"""Splits 3D tensor in batch.
|
||||
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Specified dimension in which to split.
|
||||
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): input parallel mode.
|
||||
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): weight parallel mode.
|
||||
|
||||
Returns:
|
||||
:class:`torch.tensor`: The tensor has been split.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
if input_.size(dim) <= 1:
|
||||
return input_
|
||||
@ -333,11 +325,15 @@ class _ReduceTensor3D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
All-reduce the input
|
||||
r"""All-reduce the input
|
||||
|
||||
:param tensor: Input tensor
|
||||
:param parallel_mode: Parallel mode
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
return _ReduceTensor3D.apply(tensor, parallel_mode)
|
||||
|
||||
@ -358,11 +354,16 @@ class _AllGatherTensor3D(torch.autograd.Function):
|
||||
|
||||
|
||||
def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
All-reduce the gradient in backward pass.
|
||||
r"""All-reduce the gradient in backward pass.
|
||||
|
||||
:param tensor: Input tensor
|
||||
:param parallel_mode: Parallel mode
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Dimension to gather.
|
||||
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
|
||||
"""
|
||||
return _AllGatherTensor3D.apply(tensor, dim, parallel_mode)
|
||||
|
||||
@ -382,12 +383,16 @@ class _ReduceScatterTensor3D(torch.autograd.Function):
|
||||
|
||||
|
||||
def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
|
||||
"""
|
||||
Reduce-scatter the input.
|
||||
r"""Reduce-scatter the input.
|
||||
|
||||
:param tensor: Input tensor
|
||||
:param dim: Dimension to scatter
|
||||
:param parallel_mode: Parallel mode
|
||||
Args:
|
||||
tensor (:class:`torch.tensor`): Input tensor.
|
||||
dim (int): Dimension to scatter.
|
||||
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceScatterTensor3D.apply(tensor, dim, parallel_mode)
|
||||
|
||||
@ -423,34 +428,33 @@ def reduce_by_batch_3d(tensor: Tensor,
|
||||
input_parallel_mode: ParallelMode,
|
||||
weight_parallel_mode: ParallelMode,
|
||||
reduce_mean: bool = False) -> Tensor:
|
||||
"""
|
||||
All-reduce the input from the model parallel region.
|
||||
r"""All-reduce the input from the model parallel region.
|
||||
|
||||
:param input_: input maxtrix
|
||||
:type input_: torch.tensor
|
||||
:param input_parallel_mode: input parallel mode
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: weight parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param reduce_mean: If set to ``True``, it will divide the output by (input parallel size * weight parallel size),
|
||||
default to False
|
||||
:type reduce_mean: int, optional
|
||||
Args:
|
||||
tensor (:class:`torch.Tensor`): Input tensor.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
reduce_mean (bool, optional): If set to ``True``, it will divide the output by
(input parallel size * weight parallel size), defaults to False.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
return _ReduceByBatch3D.apply(tensor, input_parallel_mode, weight_parallel_mode, reduce_mean)
|
||||
|
||||
|
||||
class _BroadcastWeight3D_FromDiagonal(torch.autograd.Function):
|
||||
"""
|
||||
broadcast weight from diagonal
|
||||
r"""broadcast weight from diagonal.
|
||||
|
||||
:param input_: input maxtrix
|
||||
:type input_: torch.tensor
|
||||
:param input_parallel_mode: input parallel mode
|
||||
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: weight parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
:param weight_parallel_mode: output parallel mode
|
||||
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
|
||||
Args:
|
||||
input_ (:class:`torch.tensor`): input matrix.
|
||||
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
|
||||
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
|
||||
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
|
@ -24,19 +24,16 @@ from ._utils import get_depth_from_env, get_last_group, get_parallel_mode_from_e
|
||||
|
||||
@LAYERS.register_module
|
||||
class LayerNorm3D(ParallelLayer):
|
||||
r"""
|
||||
Layer Normalization for 3D parallelism
|
||||
r"""Layer Normalization for 3D parallelism.
|
||||
|
||||
:param normalized_shape: input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
:type normalized_shape: int
|
||||
:param eps: a value added to the denominator for numerical stability, defaults to 1e-12
|
||||
:type eps: float, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-12.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, normalized_shape: int, eps: float = 1e-12, dtype=None):
|
||||
@ -71,21 +68,20 @@ class LayerNorm3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Linear3D(ParallelLayer):
|
||||
"""
|
||||
Linear layer for 3D parallelism
|
||||
r"""Linear layer for 3D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param out_features: size of each output sample
|
||||
:type out_features: int
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
out_features (int): size of each output sample.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -146,23 +142,21 @@ class Linear3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Classifier3D(ParallelLayer):
|
||||
"""
|
||||
Classifier for 3D parallelism
|
||||
r"""Classifier for 3D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -225,23 +219,21 @@ class Classifier3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelClassifier3D(ParallelLayer):
|
||||
"""
|
||||
Vocab parallel classifier layer for 2D parallelism
|
||||
r"""Vocab parallel classifier layer for 3D parallelism.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -311,27 +303,24 @@ class VocabParallelClassifier3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class PatchEmbedding3D(ParallelLayer):
|
||||
"""
|
||||
2D Image to Patch Embedding
|
||||
r"""2D Image to Patch Embedding.
|
||||
|
||||
:param img_size: image size
|
||||
:type img_size: int
|
||||
:param patch_size: patch size
|
||||
:type patch_size: int
|
||||
:param in_chans: number of channels of input image
|
||||
:type in_chans: int
|
||||
:param embed_size: size of embedding
|
||||
:type embed_size: int
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param flatten: whether to flatten output tensor, defaults to True
|
||||
:type flatten: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
:param position_embed_initializer: The intializer of position embedding, defaults to zero
|
||||
:type position_embed_initializer: typing.Callable, optional
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -419,21 +408,33 @@ class PatchEmbedding3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class Embedding3D(ParallelLayer):
|
||||
"""
|
||||
Embedding for 3D parallelism
|
||||
r"""Embedding for 3D parallelism.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -491,20 +492,33 @@ class Embedding3D(ParallelLayer):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VocabParallelEmbedding3D(torch.nn.Module):
|
||||
"""Embedding parallelized in the vocabulary dimension.
|
||||
r"""Embedding parallelized in the vocabulary dimension.
|
||||
|
||||
:param num_embeddings: number of embeddings
|
||||
:type num_embeddings: int
|
||||
:param embedding_dim: dimension of embedding
|
||||
:type embedding_dim: int
|
||||
:param padding_idx: index of padding, defaults to None
|
||||
:type padding_idx: int, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to normal initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param args: Args used in F.embedding
|
||||
:param kwargs: Kwargs used in F.embedding
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:``torch.nn.functional.embedding`` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -24,14 +24,13 @@ class TransformerSelfAttentionRing(nn.Module):
|
||||
Self-attention layer takes input with size [b, s, h]
|
||||
and returns output of the same size.
|
||||
|
||||
:param hidden_size: hidden size
|
||||
:type hidden_size: int
|
||||
:param kv_channels: channels of key/value tensor
|
||||
:type kv_channels: int
|
||||
:param num_attention_heads: number of attention heads
|
||||
:type num_attention_heads: int
|
||||
:param attention_dropout: dropout probability for attention layer
|
||||
:type attention_dropout: float
|
||||
Args:
|
||||
hidden_size (int): hidden size.
|
||||
num_attention_heads (int): number of attention heads.
|
||||
attention_dropout (float): dropout probability for attention layer.
|
||||
attention_mask_func (:class:`typing.Callable`): Mask function to be applied.
|
||||
layer_number (int): number of layers.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -38,11 +38,16 @@ class CheckpointModule(nn.Module):
|
||||
|
||||
|
||||
def divide(numerator, denominator):
|
||||
"""Only allow exact division
|
||||
"""Only allow exact division.
|
||||
|
||||
:param numerator: Numerator of the division
|
||||
:param denominator: Denominator of the division
|
||||
Args:
|
||||
numerator (int): Numerator of the division.
|
||||
denominator (int): Denominator of the division.
|
||||
|
||||
Returns:
|
||||
int: the result of exact division.
|
||||
"""
|
||||
assert denominator != 0, 'denominator can not be zero'
|
||||
assert numerator % denominator == 0, \
|
||||
'{} is not divisible by {}'.format(numerator, denominator)
|
||||
return numerator // denominator
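Using the function defined above, the contract is exact division only::

    assert divide(12, 4) == 3    # exact division succeeds
    # divide(12, 5) raises AssertionError: "12 is not divisible by 5"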
|
||||
|
@ -15,11 +15,16 @@ from ..utils import to_2tuple
|
||||
|
||||
def drop_path(x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.

Args:
drop_prob (float, optional): probability of dropping path, defaults to 0.0.
training (bool, optional): whether in training mode, defaults to False.
"""
if drop_prob == 0. or not training:
return x
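The hunk above cuts off after the early return; the remainder of the computation follows the widely used timm formulation, sketched below (a sketch with illustrative names, not necessarily the exact code in this file)::

    import torch

    def drop_path_sketch(x: torch.Tensor, drop_prob: float = 0., training: bool = False) -> torch.Tensor:
        if drop_prob == 0. or not training:
            return x
        keep_prob = 1.0 - drop_prob
        # one Bernoulli draw per sample, broadcast over the remaining dims
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()                  # binarize to 0/1
        return x.div(keep_prob) * random_tensor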
|
||||
@ -35,6 +40,9 @@ class DropPath(nn.Module):
|
||||
"""
|
||||
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||
Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
|
||||
|
||||
Args:
|
||||
drop_prob (float, optional): probability of dropping path, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, drop_prob=None):
|
||||
@ -46,7 +54,19 @@ class DropPath(nn.Module):
|
||||
|
||||
|
||||
class WrappedDropout(nn.Module):
|
||||
"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager.
|
||||
r"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager. During training, randomly zeroes
|
||||
some elements of the input tensor with probability p using samples from a Bernoulli distribution. Each
|
||||
channel will be zeroed out independently on every forward call. Furthermore, the outputs are scaled by a factor of
|
||||
1/(1-p) during training. This means that during evaluation the module simply computes an identity function.
|
||||
|
||||
Args:
|
||||
p (float, optional): probability of an element to be zeroed, defaults to 0.5.
inplace (bool, optional): whether to do dropout in-place, defaults to False.
|
||||
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self, p: float = 0.5, inplace: bool = False, mode=None):
|
||||
@ -74,8 +94,16 @@ class WrappedDropout(nn.Module):
|
||||
|
||||
|
||||
class WrappedDropPath(nn.Module):
|
||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||
r"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||
Here, it is wrapped with the context of seed manager.
|
||||
|
||||
Args:
|
||||
p (float, optional): probability of dropping path, defaults to 0.0.
|
||||
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
|
||||
|
||||
Note:
|
||||
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
|
||||
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self, p: float = 0., mode=None):
|
||||
@ -101,27 +129,25 @@ class WrappedDropPath(nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VanillaPatchEmbedding(nn.Module):
|
||||
"""
|
||||
r"""
|
||||
2D Image to Patch Embedding
|
||||
|
||||
:param img_size: image size
|
||||
:type img_size: int
|
||||
:param patch_size: patch size
|
||||
:type patch_size: int
|
||||
:param in_chans: number of channels of input image
|
||||
:type in_chans: int
|
||||
:param embed_size: size of embedding
|
||||
:type embed_size: int
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param flatten: whether to flatten output tensor, defaults to True
|
||||
:type flatten: bool, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
:param position_embed_initializer: The intializer of position embedding, defaults to zero
|
||||
:type position_embed_initializer: typing.Callable, optional
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -174,23 +200,21 @@ class VanillaPatchEmbedding(nn.Module):
|
||||
|
||||
@LAYERS.register_module
|
||||
class VanillaClassifier(nn.Module):
|
||||
"""
|
||||
Dense linear classifier
|
||||
r"""Dense linear classifier.
|
||||
|
||||
:param in_features: size of each input sample
|
||||
:type in_features: int
|
||||
:param num_classes: number of classes
|
||||
:type num_classes: int
|
||||
:param weight: weight of the classifier, defaults to True
|
||||
:type weight: torch.nn.Parameter, optional
|
||||
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
|
||||
:type bias: bool, optional
|
||||
:param dtype: The dtype of parameters, defaults to None
|
||||
:type dtype: torch.dtype, optional
|
||||
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
|
||||
:type weight_initializer: typing.Callable, optional
|
||||
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
|
||||
:type bias_initializer: typing.Callable, optional
|
||||
Args:
|
||||
in_features (int): size of each input sample.
|
||||
num_classes (int): number of classes.
|
||||
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
|
||||
More details about initializer please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -9,12 +9,11 @@ from colossalai.registry import LAYERS
|
||||
|
||||
@LAYERS.register_module
|
||||
class LambdaWrapper(nn.Module):
|
||||
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them
|
||||
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them.
|
||||
|
||||
:param func: User customed function
|
||||
:type func: Callable
|
||||
:param layers_cfg: Config of layers, defaults to None
|
||||
:type layers_cfg: dict, optional
|
||||
Args:
|
||||
func (``Callable``): User-defined function.
|
||||
layers_cfg (dict, optional): Config of layers, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, func, layers_cfg: dict = None):
|
||||
|
@ -86,12 +86,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
|
||||
|
||||
@LOSSES.register_module
|
||||
class VocabParallelCrossEntropyLoss1D(_Loss):
|
||||
"""
|
||||
Vocab parallel cross entropy loss for 1D parallelism
|
||||
"""Vocab parallel cross entropy loss for 1D parallelism.
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
|
||||
:type reduction: bool, optional
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
"""
|
||||
|
||||
def __init__(self, reduction=True):
|
||||
@ -99,10 +97,11 @@ class VocabParallelCrossEntropyLoss1D(_Loss):
|
||||
self.reduction_mean = reduction
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
loss = _VocabParallelCrossEntropy1D.apply(logits, targets)
|
||||
if self.reduction_mean:
|
||||
|
@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
|
||||
|
||||
@LOSSES.register_module
|
||||
class CrossEntropyLoss2D(_Loss):
|
||||
"""
|
||||
Cross entropy loss for 2D parallelism
|
||||
r"""Cross entropy loss for 2D parallelism
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
:param args: Args for loss function
|
||||
:param kwargs: Kwargs for loss function
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
|
||||
:type reduction: bool, optional
|
||||
The ``args`` and ``kwargs`` should include parameters below:
|
||||
::
|
||||
|
||||
weight (Tensor, optional)
|
||||
size_average (bool, optional)
|
||||
ignore_index (int, optional)
|
||||
reduce (bool, optional)
|
||||
label_smoothing (float, optional)
|
||||
|
||||
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
|
||||
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
|
||||
"""
|
||||
|
||||
def __init__(self, reduction=True, *args, **kwargs):
|
||||
@ -31,10 +39,14 @@ class CrossEntropyLoss2D(_Loss):
|
||||
self.loss_kwargs = kwargs
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
|
||||
Returns:
|
||||
float: the loss between logits and targets.
|
||||
"""
|
||||
targets = split_tensor_2d(targets)
|
||||
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
|
||||
@ -116,12 +128,10 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
|
||||
|
||||
@LOSSES.register_module
|
||||
class VocabParallelCrossEntropyLoss2D(_Loss):
|
||||
"""
|
||||
Vocab parallel cross entropy loss for 2D parallelism
|
||||
"""Vocab parallel cross entropy loss for 2D parallelism.
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
|
||||
:type reduction: bool, optional
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
"""
|
||||
|
||||
def __init__(self, reduction=True):
|
||||
@ -129,10 +139,11 @@ class VocabParallelCrossEntropyLoss2D(_Loss):
|
||||
self.reduction_mean = reduction
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
targets = split_tensor_2d(targets)
|
||||
loss = _VocabParallelCrossEntropy2D.apply(
|
||||
|
@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
|
||||
|
||||
@LOSSES.register_module
|
||||
class CrossEntropyLoss2p5D(_Loss):
|
||||
"""
|
||||
Cross entropy loss for 2.5D parallelism
|
||||
r"""Cross entropy loss for 2.5D parallelism
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
:param args: Args for loss function
|
||||
:param kwargs: Kwargs for loss function
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
|
||||
:type reduction: bool, optional
|
||||
The ``args`` and ``kwargs`` should include parameters below:
|
||||
::
|
||||
|
||||
weight (Tensor, optional)
|
||||
size_average (bool, optional)
|
||||
ignore_index (int, optional)
|
||||
reduce (bool, optional)
|
||||
label_smoothing (float, optional)
|
||||
|
||||
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
|
||||
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
|
||||
"""
|
||||
def __init__(self, reduction=True, *args, **kwargs):
|
||||
super().__init__()
|
||||
@ -30,10 +38,11 @@ class CrossEntropyLoss2p5D(_Loss):
|
||||
self.loss_kwargs = kwargs
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
targets = split_tensor_2p5d(targets)
|
||||
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
|
||||
@ -115,19 +124,19 @@ class VocabParallelCrossEntropyLoss2p5D(_Loss):
|
||||
"""
|
||||
Vocab parallel cross entropy loss for 2.5D parallelism
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
|
||||
:type reduction: bool, optional
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
"""
|
||||
def __init__(self, reduction=True):
|
||||
super().__init__()
|
||||
self.reduction_mean = reduction
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
targets = split_tensor_2p5d(targets)
|
||||
loss = _VocabParallelCrossEntropy2p5D.apply(logits, targets)
|
||||
|
@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
|
||||
|
||||
@LOSSES.register_module
|
||||
class CrossEntropyLoss3D(_Loss):
|
||||
"""
|
||||
Cross entropy loss for 3D parallelism
|
||||
r"""Cross entropy loss for 3D parallelism.
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
:param args: Args for loss function
|
||||
:param kwargs: Kwargs for loss function
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
|
||||
:type reduction: bool, optional
|
||||
The ``args`` and ``kwargs`` should include parameters below:
|
||||
::
|
||||
|
||||
weight (Tensor, optional)
|
||||
size_average (bool, optional)
|
||||
ignore_index (int, optional)
|
||||
reduce (bool, optional)
|
||||
label_smoothing (float, optional)
|
||||
|
||||
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
|
||||
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
|
||||
"""
|
||||
|
||||
def __init__(self, reduction=True, *args, **kwargs):
|
||||
@ -32,10 +40,11 @@ class CrossEntropyLoss3D(_Loss):
|
||||
self.loss_kwargs = kwargs
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
|
||||
targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
|
||||
@ -109,12 +118,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
|
||||
|
||||
@LOSSES.register_module
|
||||
class VocabParallelCrossEntropyLoss3D(_Loss):
|
||||
"""
|
||||
Vocab parallel cross entropy loss for 2D parallelism
|
||||
"""Vocab parallel cross entropy loss for 2D parallelism.
|
||||
|
||||
:param reduction: whether to average the loss, defaults to True
|
||||
|
||||
:type reduction: bool, optional
|
||||
Args:
|
||||
reduction (bool, optional): whether to average the loss, defaults to True.
|
||||
"""
|
||||
|
||||
def __init__(self, reduction=True):
|
||||
@ -125,10 +132,11 @@ class VocabParallelCrossEntropyLoss3D(_Loss):
|
||||
self.reduction_mean = reduction
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate loss between logits and targets
|
||||
"""Calculate loss between logits and targets.
|
||||
|
||||
:param logits: Output logits of model
|
||||
:param targets: True targets from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
"""
|
||||
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
|
||||
targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
|
||||
|
@ -6,13 +6,25 @@ from colossalai.context.moe_context import MOE_CONTEXT
|
||||
|
||||
@LOSSES.register_module
|
||||
class MoeCrossEntropyLoss(_Loss):
|
||||
"""torch.nn.CrossEntropyLoss added with auxiliary loss.
|
||||
r"""torch.nn.CrossEntropyLoss added with auxiliary loss.
|
||||
|
||||
:param aux_weight: Weight of auxiliary loss in total loss
|
||||
:param args: Args in CrossEntropyLoss
|
||||
:param kwargs: Kwargs in CrossEntropyLoss
|
||||
Args:
|
||||
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
aux_weight (float, optional): Weight of auxiliary loss in total loss, defaults to 0.01.
|
||||
|
||||
:type aux_weight: float, optional
|
||||
The ``args`` and ``kwargs`` should include parameters below:
|
||||
::
|
||||
|
||||
weight (Tensor, optional)
|
||||
size_average (bool, optional)
|
||||
ignore_index (int, optional)
|
||||
reduce (bool, optional)
|
||||
reduction (str, optional)
|
||||
label_smoothing (float, optional)
|
||||
|
||||
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
|
||||
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
|
||||
"""
|
||||
|
||||
def __init__(self, aux_weight: float = 0.01, *args, **kwargs):
|
||||
@ -21,6 +33,16 @@ class MoeCrossEntropyLoss(_Loss):
|
||||
self.aux_weight = aux_weight
|
||||
|
||||
def forward(self, *args):
|
||||
"""
|
||||
The ``args`` should at least include parameters below:
|
||||
::
|
||||
|
||||
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
|
||||
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
|
||||
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
|
||||
"""
|
||||
main_loss = self.loss(*args)
|
||||
aux_loss = MOE_CONTEXT.get_loss()
|
||||
return main_loss + self.aux_weight * aux_loss
|
||||
@ -30,13 +52,11 @@ class MoeCrossEntropyLoss(_Loss):
|
||||
class MoeLoss(_Loss):
|
||||
"""A wrapper class for any loss module to add with auxiliary loss.
|
||||
|
||||
:param aux_weight: Weight of auxiliary loss in total loss
|
||||
:param loss_fn: Loss function
|
||||
:param args: Args in loss function
|
||||
:param kwargs: Kwargs in loss function
|
||||
|
||||
:type aux_weight: float
|
||||
:type loss_fn: Callable
|
||||
Args:
|
||||
aux_weight (float): Weight of auxiliary loss in total loss.
|
||||
loss_fn (``Callable``): Loss function.
|
||||
args (list): Args in loss function.
|
||||
kwargs (dict): Kwargs in loss function
|
||||
"""
|
||||
|
||||
def __init__(self, aux_weight: float, loss_fn, *args, **kwargs):
|
||||
@ -45,6 +65,16 @@ class MoeLoss(_Loss):
|
||||
self.aux_weight = aux_weight
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
"""
|
||||
The ``args`` and ``kwargs`` should at least include parameters below:
|
||||
::
|
||||
|
||||
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
|
||||
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
|
||||
|
||||
Note:
|
||||
The ``args`` and ``kwargs`` may include different parameters varying with different loss function.
|
||||
"""
|
||||
main_loss = self.loss_fn(*args, **kwargs)
|
||||
aux_loss = MOE_CONTEXT.get_loss()
|
||||
return main_loss + self.aux_weight * aux_loss
|
||||
|
@ -36,14 +36,12 @@ class CosineAnnealingLR(_CosineAnnealingLR):
|
||||
.. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
|
||||
https://arxiv.org/abs/1608.03983
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param eta_min: Minimum learning rate, defaults to 0
|
||||
:type eta_min: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
eta_min (int, optional): Minimum learning rate, defaults to 0.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, eta_min: int = 0, last_epoch: int = -1, **kwargs):
|
||||
@ -54,16 +52,13 @@ class CosineAnnealingLR(_CosineAnnealingLR):
|
||||
class CosineAnnealingWarmupLR(WarmupScheduler):
|
||||
"""Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param warmup_steps: Number of warmup steps, defaults to 0
|
||||
:type warmup_steps: int, optional
|
||||
:param eta_min: Minimum learning rate, defaults to 0
|
||||
:type eta_min: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
|
||||
eta_min (int, optional): Minimum learning rate, defaults to 0.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: float = 0., last_epoch: int = -1):
|
||||
@ -76,14 +71,12 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
|
||||
class FlatAnnealingLR(DelayerScheduler):
|
||||
"""Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param pct_start: Percent of steps before starting learning rate decay
|
||||
:type pct_start: float
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, pct_start: float = 0.72, last_epoch: int = -1, **kwargs):
|
||||
@ -102,18 +95,14 @@ class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
|
||||
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
|
||||
applied, and then the learning rate will be a fixed value before starting decay.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param warmup_steps: Number of warmup steps, defaults to 0
|
||||
:type warmup_steps: int, optional
|
||||
:param pct_start: Percent of steps before starting learning rate decay
|
||||
:type pct_start: float
|
||||
:param eta_min: Minimum learning rate, defaults to 0
|
||||
:type eta_min: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
|
||||
pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
eta_min (int, optional): Minimum learning rate, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, pct_start: float = 0.72, eta_min: int = 0,
|
||||
|
@ -14,16 +14,15 @@ class _enable_get_lr_call:
|
||||
|
||||
|
||||
class DelayerScheduler(_LRScheduler):
|
||||
""" Starts with a flat lr schedule until it reaches N epochs the applies a scheduler
|
||||
"""Starts with a flat lr schedule until it reaches N epochs then applies
|
||||
the specific scheduler (For example: ReduceLROnPlateau)
|
||||
|
||||
:param optimizer: Wrapped optimizer.
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler
|
||||
:type delay_epochs: int
|
||||
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
|
||||
:type after_scheduler: torch.optim.lr_scheduler
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
|
||||
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, delay_epochs, after_scheduler, last_epoch=-1):
|
||||
@ -57,16 +56,15 @@ class DelayerScheduler(_LRScheduler):
|
||||
|
||||
|
||||
class WarmupScheduler(_LRScheduler):
|
||||
""" Starts with a linear warmup lr schedule until it reaches N epochs the applies a scheduler
|
||||
"""Starts with a linear warmup lr schedule until it reaches N epochs then applies
|
||||
the specific scheduler (For example: ReduceLROnPlateau).
|
||||
|
||||
:param optimizer: Wrapped optimizer.
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler
|
||||
:type warmup_epochs: int
|
||||
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
|
||||
:type after_scheduler: torch.optim.lr_scheduler
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
|
||||
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1):
|
||||
@ -97,18 +95,16 @@ class WarmupScheduler(_LRScheduler):
|
||||
|
||||
|
||||
class WarmupDelayerScheduler(_LRScheduler):
|
||||
""" Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule until it reaches M epochs the applies a scheduler
|
||||
"""Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule
|
||||
until it reaches M epochs then applies the specific scheduler (For example: ReduceLROnPlateau).
|
||||
|
||||
:param optimizer: Wrapped optimizer.
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler
|
||||
:type warmup_epochs: int
|
||||
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler
|
||||
:type delay_epochs: int
|
||||
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
|
||||
:type after_scheduler: torch.optim.lr_scheduler
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
|
||||
delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
|
||||
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, warmup_epochs, delay_epochs, after_scheduler, last_epoch=-1):
|
||||
|
@ -5,16 +5,14 @@ from colossalai.registry import LR_SCHEDULERS
|
||||
|
||||
@LR_SCHEDULERS.register_module
|
||||
class LinearWarmupLR(_LRScheduler):
|
||||
"""Linearly warmup learning rate and then linearly decay
|
||||
"""Linearly warmup learning rate and then linearly decay.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param warmup_steps: Number of warmup steps, defaults to 0
|
||||
:type warmup_steps: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, last_epoch: int = -1, **kwargs):
|
||||
|
@ -13,18 +13,13 @@ class MultiStepLR(_MultiStepLR):
|
||||
happen simultaneously with other changes to the learning rate from outside
|
||||
this scheduler. When last_epoch=-1, sets initial lr as lr.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param milestones: List of epoch indices. Must be increasing, defaults to None
|
||||
:type milestones: List[int], optional
|
||||
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
|
||||
:type gamma: float, optional
|
||||
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
|
||||
:type num_steps_per_epoch: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
|
||||
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1, last_epoch: int = -1, **kwargs):
|
||||
@ -33,22 +28,17 @@ class MultiStepLR(_MultiStepLR):
|
||||
|
||||
@LR_SCHEDULERS.register_module
|
||||
class MultiStepWarmupLR(WarmupScheduler):
|
||||
"""Multi-step laerning rate scheduler with warmup.
|
||||
"""Multistep learning rate scheduler with warmup.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param warmup_steps: Number of warmup steps, defaults to 0
|
||||
:type warmup_steps: int, optional
|
||||
:param milestones: List of epoch indices. Must be increasing, defaults to None
|
||||
:type milestones: List[int], optional
|
||||
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
|
||||
:type gamma: float, optional
|
||||
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
|
||||
:type num_steps_per_epoch: int, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
|
||||
milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
|
||||
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
|
||||
num_steps_per_epoch (int, optional): Number of steps per epoch, defaults to -1.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, milestones: List[int] = None,
|
||||
|
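For reference, the milestone behaviour documented above matches PyTorch's native ``MultiStepLR``; a short, self-contained example:

import torch

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# lr is multiplied by gamma each time a milestone is reached
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 80], gamma=0.1)

for _ in range(100):
    optimizer.step()
    scheduler.step()
# lr: 0.1 before step 30, 0.01 from step 30, 0.001 from step 80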
@ -28,43 +28,41 @@ class OneCycleLR(_OneCycleLR):
|
||||
claims that "unpublished work has shown even better results by using only two phases". To
|
||||
mimic the behaviour of the original paper instead, set ``three_phase=True``.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param pct_start: The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3
|
||||
:type pct_start: float, optional
|
||||
:param anneal_strategy: {'cos', 'linear'}
|
||||
Specifies the annealing strategy: "cos" for cosine annealing, "linear" for
|
||||
linear annealing, defaults to 'cos'
|
||||
:type anneal_strategy: str, optional
|
||||
:param cycle_momentum: If ``True``, momentum is cycled inversely
|
||||
to learning rate between 'base_momentum' and 'max_momentum', defaults to True
|
||||
:type cycle_momentum: bool, optional
|
||||
:param base_momentum: Lower momentum boundaries in the cycle
|
||||
for each parameter group. Note that momentum is cycled inversely
|
||||
to learning rate; at the peak of a cycle, momentum is
|
||||
'base_momentum' and learning rate is 'max_lr', defaults to 0.85
|
||||
:type base_momentum: float, optional
|
||||
:param max_momentum: Upper momentum boundaries in the cycle
|
||||
for each parameter group. Functionally,
|
||||
it defines the cycle amplitude (max_momentum - base_momentum).
|
||||
Note that momentum is cycled inversely
|
||||
to learning rate; at the start of a cycle, momentum is 'max_momentum'
|
||||
and learning rate is 'base_lr', defaults to 0.95
|
||||
:type max_momentum: float, optional
|
||||
:param div_factor: Determines the initial learning rate via
|
||||
initial_lr = max_lr/div_factor, defaults to 25.0
|
||||
:type div_factor: float, optional
|
||||
:param final_div_factor: Determines the minimum learning rate via
|
||||
min_lr = initial_lr/final_div_factor, defaults to 10000.0
|
||||
:type final_div_factor: float, optional
|
||||
:param last_epoch: The index of the last batch. This parameter is used when
|
||||
resuming a training job. Since `step()` should be invoked after each
|
||||
batch instead of after each epoch, this number represents the total
|
||||
number of *batches* computed, not the total number of epochs computed.
|
||||
When last_epoch=-1, the schedule is started from the beginning, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
pct_start (float, optional):
|
||||
The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3.
|
||||
anneal_strategy (str, optional): {'cos', 'linear'}, Specifies the annealing strategy:
|
||||
"cos" for cosine annealing, "linear" for linear annealing, defaults to 'cos'.
|
||||
cycle_momentum (bool, optional): If ``True``, momentum is cycled inversely
|
||||
to learning rate between 'base_momentum' and 'max_momentum', defaults to True.
|
||||
base_momentum (float, optional): Lower momentum boundaries in the cycle for each parameter group.
|
||||
Note that momentum is cycled inversely to learning rate; at the peak of a cycle, momentum is
|
||||
'base_momentum' and learning rate is 'max_lr', defaults to 0.85.
|
||||
max_momentum (float, optional): Upper momentum boundaries in the cycle for each parameter group.
|
||||
Functionally, it defines the cycle amplitude (max_momentum - base_momentum).
|
||||
Note that momentum is cycled inversely to learning rate; at the start of a cycle, momentum is 'max_momentum'
|
||||
and learning rate is 'base_lr', defaults to 0.95.
|
||||
div_factor (float, optional): Determines the initial learning rate via
|
||||
initial_lr = max_lr/div_factor, defaults to 25.0.
|
||||
final_div_factor (float, optional): Determines the minimum learning rate via
|
||||
min_lr = initial_lr/final_div_factor, defaults to 10000.0.
|
||||
last_epoch (int, optional): The index of the last batch. This parameter is used when resuming a training job.
|
||||
Since `step()` should be invoked after each batch instead of after each epoch, this number represents
|
||||
the total number of *batches* computed, not the total number of epochs computed.
|
||||
When last_epoch=-1, the schedule is started from the beginning, defaults to -1.
|
||||
|
||||
The ``kwargs`` for initializing torch.optim.lr_scheduler.OneCycleLR should include parameters below:
|
||||
::
|
||||
|
||||
epochs (int, optional, default=None)
|
||||
steps_per_epoch (int, optional, default=None)
|
||||
three_phase (bool, optional, default=False)
|
||||
verbose (bool, optional, default=False)
|
||||
|
||||
More details about kwargs could be found in
|
||||
`OneCycleLR <https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.OneCycleLR.html#torch.optim.lr_scheduler.OneCycleLR>`_.
|
||||
|
||||
.. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
|
||||
https://arxiv.org/abs/1708.07120
|
||||
|
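Because the wrapper above forwards its kwargs to ``torch.optim.lr_scheduler.OneCycleLR``, a plain PyTorch example shows the documented parameters in action (the values here are arbitrary):

import torch

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.1,            # peak lr reached at the end of the increasing phase
    total_steps=1000,
    pct_start=0.3,         # 30% of the cycle spent increasing the lr
    anneal_strategy='cos',
    div_factor=25.0,       # initial_lr = max_lr / div_factor
    final_div_factor=1e4,  # min_lr = initial_lr / final_div_factor
)

for _ in range(1000):
    optimizer.step()
    scheduler.step()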
@ -8,16 +8,13 @@ from .delayed import WarmupScheduler
|
||||
class PolynomialLR(_LRScheduler):
|
||||
"""Polynomial learning rate scheduler.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param end_lr: Minimum learning rate, defaults to 0.0001
|
||||
:type end_lr: float, optional
|
||||
:param power: The power of polynomial, defaults to 1.0
|
||||
:type power: float, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
|
||||
power (float, optional): The power of polynomial, defaults to 1.0.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, end_lr: float = 0.0001, power: float = 1.0, last_epoch: int = -1,
|
||||
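A polynomial decay of this kind is usually computed with the formula sketched below; this is the common form, not necessarily the exact expression used in the repo.

def polynomial_lr(base_lr, step, total_steps, end_lr=0.0001, power=1.0):
    """Interpolate from base_lr down to end_lr with a polynomial of the given power."""
    step = min(step, total_steps)
    decay = (1 - step / total_steps) ** power
    return (base_lr - end_lr) * decay + end_lr

# with power=1.0 the decay is simply linear
assert abs(polynomial_lr(0.1, 500, 1000, end_lr=0.0, power=1.0) - 0.05) < 1e-12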
@ -44,18 +41,14 @@ class PolynomialLR(_LRScheduler):
|
||||
class PolynomialWarmupLR(WarmupScheduler):
|
||||
"""Polynomial learning rate scheduler with warmup.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param warmup_steps: Number of warmup steps, defaults to 0
|
||||
:type warmup_steps: int, optional
|
||||
:param end_lr: Minimum learning rate, defaults to 0.0001
|
||||
:type end_lr: float, optional
|
||||
:param power: The power of polynomial, defaults to 1.0
|
||||
:type power: float, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
|
||||
end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
|
||||
power (float, optional): The power of polynomial, defaults to 1.0.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning and the initial lr is set to lr.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, end_lr: float = 0.0001, power: float = 1.0,
|
||||
|
@ -11,16 +11,13 @@ class LambdaLR(_LambdaLR):
|
||||
"""Sets the learning rate of each parameter group to the initial lr
|
||||
times a given function. When last_epoch=-1, sets initial lr as lr.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param lr_lambda: A function which computes a multiplicative
|
||||
factor given an integer parameter epoch, or a list of such
|
||||
functions, one for each group in optimizer.param_groups, defaults to None
|
||||
:type lr_lambda: function or list, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
|
||||
factor given an integer parameter epoch, or a list of such functions,
|
||||
one for each group in optimizer.param_groups, defaults to None.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
|
||||
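The ``lr_lambda`` argument works the same way as in PyTorch's own ``LambdaLR``: it returns a multiplicative factor applied to the initial lr. A small runnable example:

import torch

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)

for _ in range(3):
    optimizer.step()
    scheduler.step()
    print(scheduler.get_last_lr())  # [0.095], [0.09025], [0.0857375]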
@ -30,18 +27,15 @@ class LambdaLR(_LambdaLR):
|
||||
@LR_SCHEDULERS.register_module
|
||||
class MultiplicativeLR(_MultiplicativeLR):
|
||||
"""Multiply the learning rate of each parameter group by the factor given
|
||||
in the specified function. When last_epoch=-1, sets initial lr as lr
|
||||
in the specified function. When last_epoch=-1, sets initial lr as lr.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param lr_lambda: A function which computes a multiplicative
|
||||
factor given an integer parameter epoch, or a list of such
|
||||
functions, one for each group in optimizer.param_groups, defaults to None
|
||||
:type lr_lambda: function or list, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
|
||||
factor given an integer parameter epoch, or a list of such functions,
|
||||
one for each group in optimizer.param_groups, defaults to None.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
|
||||
@ -53,18 +47,14 @@ class StepLR(_StepLR):
|
||||
"""Decays the learning rate of each parameter group by gamma every
|
||||
step_size epochs. Notice that such decay can happen simultaneously with
|
||||
other changes to the learning rate from outside this scheduler. When
|
||||
last_epoch=-1, sets initial lr as lr
|
||||
last_epoch=-1, sets initial lr as lr.
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param step_size: Period of learning rate decay, defaults to 1
|
||||
:type step_size: int, optional
|
||||
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
|
||||
:type gamma: float, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
step_size (int, optional): Period of learning rate decay, defaults to 1.
|
||||
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, last_epoch: int = -1) -> None:
|
||||
@ -77,14 +67,11 @@ class ExponentialLR(_ExponentialLR):
|
||||
"""Decays the learning rate of each parameter group by gamma every epoch.
|
||||
When last_epoch=-1, sets initial lr as lr
|
||||
|
||||
:param optimizer: Wrapped optimizer
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param total_steps: Number of total training steps
|
||||
:type total_steps: int
|
||||
:param gamma: Multiplicative factor of learning rate decay, defaults to 1.0
|
||||
:type gamma: float, optional
|
||||
:param last_epoch: The index of last epoch, defaults to -1
|
||||
:type last_epoch: int, optional
|
||||
Args:
|
||||
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Wrapped optimizer.
|
||||
total_steps (int): Number of total training steps.
|
||||
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 1.0.
|
||||
last_epoch (int, optional): The index of last epoch, defaults to -1.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_steps, gamma: float = 1.0,
|
||||
|
@ -14,8 +14,12 @@ class Accuracy2D(nn.Module):
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate the accuracy of predicted labels.
|
||||
|
||||
:param logits: Predicted labels
|
||||
:param targets: True labels from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted labels.
|
||||
targets (:class:`torch.tensor`): True labels from data.
|
||||
|
||||
Returns:
|
||||
float: the accuracy of prediction.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
targets = split_tensor_2d(targets)
|
||||
|
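The accuracy computation behind these ``forward`` methods boils down to comparing argmax predictions with the (possibly partitioned) targets; a minimal single-device sketch that ignores the tensor-parallel splitting:

import torch

def accuracy(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """Fraction of samples whose argmax prediction matches the label."""
    with torch.no_grad():
        preds = logits.argmax(dim=-1)
        return (preds == targets).float().mean()

logits = torch.tensor([[2.0, 0.1], [0.2, 1.5], [3.0, -1.0]])
targets = torch.tensor([0, 1, 1])
print(accuracy(logits, targets))  # tensor(0.6667)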
@ -14,8 +14,12 @@ class Accuracy2p5D(nn.Module):
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate the accuracy of predicted labels.
|
||||
|
||||
:param logits: Predicted labels
|
||||
:param targets: True labels from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted labels.
|
||||
targets (:class:`torch.tensor`): True labels from data.
|
||||
|
||||
Returns:
|
||||
float: the accuracy of prediction.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
targets = split_tensor_2p5d(targets)
|
||||
|
@ -18,8 +18,12 @@ class Accuracy3D(nn.Module):
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate the accuracy of predicted labels.
|
||||
|
||||
:param logits: Predicted labels
|
||||
:param targets: True labels from data
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted labels.
|
||||
targets (:class:`torch.tensor`): True labels from data.
|
||||
|
||||
Returns:
|
||||
float: the accuracy of prediction.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
|
||||
|
@ -9,11 +9,10 @@ class Registry:
|
||||
"""This is a registry class used to register classes and modules so that a universal
|
||||
object builder can be enabled.
|
||||
|
||||
:param name: The name of the registry
|
||||
:type name: str
|
||||
:param third_party_library: List of third party libraries which are used in the
|
||||
initialization of the register module
|
||||
:type third_party_library: list, optional
|
||||
Args:
|
||||
name (str): The name of the registry.
|
||||
third_party_library (list, optional):
|
||||
List of third party libraries which are used in the initialization of the register module.
|
||||
"""
|
||||
|
||||
def __init__(self, name: str, third_party_library: List[ModuleType] = None):
|
||||
@ -28,12 +27,12 @@ class Registry:
|
||||
def register_module(self, module_class):
|
||||
"""Registers a module represented in `module_class`.
|
||||
|
||||
:param module_class: The module to be registered
|
||||
:type module_class: class
|
||||
:raises AssertionError: Raises an AssertionError if the module has already been
|
||||
registered before
|
||||
:return: The module to be registered, so as to use it normally if via importing
|
||||
:rtype: class
|
||||
Args:
|
||||
module_class (class): The module to be registered.
|
||||
Returns:
|
||||
class: The module to be registered, so as to use it normally if via importing.
|
||||
Raises:
|
||||
AssertionError: Raises an AssertionError if the module has already been registered before.
|
||||
"""
|
||||
module_name = module_class.__name__
|
||||
assert module_name not in self._registry
|
||||
@ -46,12 +45,13 @@ class Registry:
|
||||
"""Retrieves a module with name `module_name` and returns the module if it has
|
||||
already been registered before.
|
||||
|
||||
:param module_name: The name of the module to be retrieved
|
||||
:type module_name: str
|
||||
:raises NameError: Raises a NameError if the module to be retrieved has neither been
|
||||
registered directly nor as third party modules before
|
||||
:return: The retrieved module or None
|
||||
:rtype: :class:`object`
|
||||
Args:
|
||||
module_name (str): The name of the module to be retrieved.
|
||||
Returns:
|
||||
:class:`object`: The retrieved module or None.
|
||||
Raises:
|
||||
NameError: Raises a NameError if the module to be retrieved has neither been
|
||||
registered directly nor as third party modules before.
|
||||
"""
|
||||
if module_name in self._registry:
|
||||
return self._registry[module_name]
|
||||
@ -65,11 +65,11 @@ class Registry:
|
||||
"""Searches for a module with name `module_name` and returns a boolean value indicating
|
||||
whether the module has been registered directly or as third party modules before.
|
||||
|
||||
:param module_name: The name of the module to be searched for
|
||||
:type module_name: str
|
||||
:return: A boolean value indicating whether the module has been registered directly or
|
||||
as third party modules before
|
||||
:rtype: bool
|
||||
Args:
|
||||
module_name (str): The name of the module to be searched for.
|
||||
Returns:
|
||||
bool: A boolean value indicating whether the module has been registered directly or
|
||||
as third party modules before.
|
||||
"""
|
||||
found_flag = module_name in self._registry
|
||||
|
||||
|
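A typical use of such a registry, based only on the API documented above (``register_module`` as a class decorator and ``get_module`` for lookup); the registry instance name ``LAYERS`` is made up for illustration and the import path is assumed:

from colossalai.registry import Registry  # import path assumed

LAYERS = Registry('layers')  # hypothetical registry instance

@LAYERS.register_module
class MyLayer:
    def __init__(self, dim: int):
        self.dim = dim

# later, a universal builder can look the class up by name
layer_cls = LAYERS.get_module('MyLayer')
layer = layer_cls(dim=128)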
@ -17,18 +17,46 @@ from colossalai.trainer.hooks import BaseHook
|
||||
|
||||
|
||||
class Trainer:
|
||||
"""This a class tending for easy deployments of users' training and evaluation instead of
|
||||
r"""This is a class tending for easy deployments of users' training and evaluation instead of
|
||||
writing their own scripts. It is similar with ``ignite.engine`` and ``keras.engine``, but is
|
||||
called `Trainer`.
|
||||
|
||||
:param engine: Engine responsible for the process function
|
||||
:type engine: :class:`Engine`
|
||||
:param schedule: Schedule responsible for forward and backward steps
|
||||
:type schedule: :class:`BaseSchedule`, optional
|
||||
:param timer: Timer used to monitor the whole training
|
||||
:type timer: :class:`MultiTimer`, optional
|
||||
:param logger: Logger used to record the whole training
|
||||
:type logger: :class:`colossalai.logging.DistributedLogger`, optional
|
||||
Args:
|
||||
engine (:class:`Engine`): Engine responsible for the process function.
|
||||
schedule (:class:`BaseSchedule`, optional): Schedule responsible for forward and backward steps.
|
||||
timer (:class:`MultiTimer`, optional): Timer used to monitor the whole training.
|
||||
logger (:class:`colossalai.logging.DistributedLogger`, optional): Logger used to record the whole training log.
|
||||
|
||||
Note:
|
||||
When `schedule` is None, ``NonPipelineSchedule`` will be used. If you would like to use pipeline,
you should choose ``PipelineSchedule`` or ``InterleavedPipelineSchedule`` for the `schedule`.
|
||||
|
||||
Examples:
|
||||
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
|
||||
>>> model = ...
|
||||
>>> criterion = ...
|
||||
>>> optimizer = ...
|
||||
>>> train_dataloader = ...
|
||||
>>> # Initialize your engine, train_dataloader, test_dataloader, lr_scheduler
|
||||
>>> engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion)
|
||||
>>> # Beginning training progress
|
||||
>>> timer = ...
>>> logger = ...
>>> trainer = Trainer(engine=engine, logger=logger, schedule=schedule, timer=timer)
|
||||
>>> # add hooks you would like to use here.
|
||||
>>> hook_list = []
|
||||
>>> trainer.fit(
|
||||
>>> train_dataloader=train_dataloader,
|
||||
>>> epochs=gpc.config.NUM_EPOCHS,
|
||||
>>> test_interval=1,
|
||||
>>> hooks=hook_list,
|
||||
>>> display_progress=True,
|
||||
>>> return_output_label=False
|
||||
>>> )
|
||||
|
||||
More examples and details could be found in
|
||||
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_
|
||||
and `ColossalAI-Examples <https://github.com/hpcaitech/ColossalAI-Examples/tree/main>`_.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
@ -108,20 +136,19 @@ class Trainer:
|
||||
def _set_current_step(self, epoch: int):
|
||||
"""Sets current step number.
|
||||
|
||||
:param epoch: Step number to be set
|
||||
:type epoch: int
|
||||
Args:
|
||||
epoch (int): Step number to be set.
|
||||
"""
|
||||
self._cur_step = epoch * self._steps_per_epoch
|
||||
|
||||
def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
|
||||
"""Call timer funciton with a given timer name.
|
||||
|
||||
:param action: Function to be called on timer
|
||||
:type action: str
|
||||
:param item: Name of the timer
|
||||
:type item: str
|
||||
:param args: args used for action function
|
||||
:param kwargs: kwargs used for action function
|
||||
Args:
|
||||
action (str): Function to be called on timer.
|
||||
item (str): Name of the timer.
|
||||
args (list): args used for action function.
|
||||
kwargs (dict): kwargs used for action function.
|
||||
"""
|
||||
|
||||
if self._timer is not None:
|
||||
@ -134,10 +161,9 @@ class Trainer:
|
||||
def _call_hooks(self, func, output=None):
|
||||
"""Calls specific hooks in the current time point.
|
||||
|
||||
:param func: A string represents the time point
|
||||
:param output: Output of the model after running a iteration or None in any other time points
|
||||
:type func: str
|
||||
:type output: optional
|
||||
Args:
|
||||
func (str): A string that represents the time point.
|
||||
output (Any, optional): Output of the model after running an iteration or None in any other time points.
|
||||
"""
|
||||
# Only after iter hook will receive output
|
||||
for hook in self.hooks:
|
||||
@ -273,25 +299,17 @@ class Trainer:
|
||||
display_progress: bool = False,
|
||||
return_output_label: bool = True,
|
||||
):
|
||||
"""Trains the model to fit training data.
|
||||
r"""Trains the model to fit training data.
|
||||
|
||||
:param train_dataloader: DataLoader in training
|
||||
:param epochs: Maximum number of epoches
|
||||
:param max_steps: Maximum number of running iterations
|
||||
:param test_dataloader: DataLoader in testing
|
||||
:param test_interval: Interval of testing
|
||||
:param hooks: A list of hooks used in training
|
||||
:param display_progress: If True, the training progress will be printed
|
||||
:param return_output_label: If True, the output of model and the label will be returned
|
||||
|
||||
:type train_dataloader: DataLoader
|
||||
:type epochs: int
|
||||
:type max_steps: int, optional
|
||||
:type test_dataloader: DataLoader, optional
|
||||
:type test_interval: int, optional
|
||||
:type hooks: list, optional
|
||||
:type display_progress: bool, optional
|
||||
:type return_output_label: bool, optional
|
||||
Args:
|
||||
train_dataloader (:class:`torch.utils.data.DataLoader`): DataLoader for training.
|
||||
epochs (int): Maximum number of epochs.
|
||||
max_steps (int, optional): Maximum number of running iterations.
|
||||
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): DataLoader for validation.
|
||||
test_interval (int, optional): Interval of validation.
|
||||
hooks (list[`BaseHook <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/trainer/hooks>`_],
|
||||
optional): A list of hooks used in training.
|
||||
display_progress (bool, optional): If True, a progress bar will be displayed.
return_output_label (bool, optional): If True, the output of model and the label will be returned.
|
||||
"""
|
||||
|
||||
# set epochs and steps, consider gradient accumulation
|
||||
@ -374,15 +392,12 @@ class Trainer:
|
||||
):
|
||||
"""Evaluates the model with testing data.
|
||||
|
||||
:param test_dataloader: DataLoader in testing
|
||||
:param hooks: A list of hooks used in evaluation
|
||||
:param display_progress: If True, the evaluation progress will be printed
|
||||
:param return_output_label: If True, the output of model and the label will be returned
|
||||
|
||||
:type test_dataloader: DataLoader
|
||||
:type hooks: list, optional
|
||||
:type display_progress: bool, optional
|
||||
:type return_output_label: bool
|
||||
Args:
|
||||
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
|
||||
hooks (list, optional): A list of hooks used in evaluation. Defaults to None.
|
||||
display_progress (bool, optional): If True, the evaluation progress will be printed. Defaults to False.
|
||||
return_output_label (bool, optional): If True, the output of model and the label
|
||||
will be returned. Defaults to True.
|
||||
"""
|
||||
# set display
|
||||
display_progress = self._should_display_progress(display_progress)
|
||||
@ -418,10 +433,11 @@ class Trainer:
|
||||
def predict(self, data: Union[Tensor, List[Tensor]]):
|
||||
"""Uses trained model to make a prediction for a tensor or a tensor list.
|
||||
|
||||
:param data: Data as the input
|
||||
:type data: Union[Tensor, List[Tensor]
|
||||
:return: The output of model as the prediction
|
||||
:rtype: Tensor
|
||||
Args:
|
||||
data (Union[:class:`torch.tensor`, List[:class:`torch.tensor`]]): Data as the input.
|
||||
|
||||
Returns:
|
||||
:class:`torch.tensor`: The output of model as the prediction
|
||||
"""
|
||||
# predict without labels
|
||||
if isinstance(data, (list, tuple)):
|
||||
|
@ -40,14 +40,11 @@ class BaseHook(ABC):
|
||||
def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
|
||||
"""Actions after running a training iteration.
|
||||
|
||||
:param trainer: Trainer which is using this hook
|
||||
:type trainer: :class:`Trainer`
|
||||
:param output: Output of the model
|
||||
:type output: torch.Tensor
|
||||
:param label: Labels of the input data
|
||||
:type label: torch.Tensor
|
||||
:param loss: Loss between the output and input data
|
||||
:type loss: torch.Tensor
|
||||
Args:
|
||||
trainer (:class:`Trainer`): Trainer which is using this hook.
|
||||
output (:class:`torch.Tensor`): Output of the model.
|
||||
label (:class:`torch.Tensor`): Labels of the input data.
|
||||
loss (:class:`torch.Tensor`): Loss between the output and input data.
|
||||
"""
|
||||
pass
|
||||
|
||||
@ -89,24 +86,21 @@ class BaseHook(ABC):
|
||||
def after_test_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
|
||||
"""Actions after running a testing iteration.
|
||||
|
||||
:param trainer: Trainer which is using this hook
|
||||
:type trainer: :class:`Trainer`
|
||||
:param output: Output of the model
|
||||
:type output: Tensor
|
||||
:param label: Labels of the input data
|
||||
:type label: Tensor
|
||||
:param loss: Loss between the output and input data
|
||||
:type loss: Tensor
|
||||
Args:
|
||||
trainer (:class:`Trainer`): Trainer which is using this hook.
output (:class:`torch.Tensor`): Output of the model.
label (:class:`torch.Tensor`): Labels of the input data.
loss (:class:`torch.Tensor`): Loss between the output and input data.
|
||||
"""
|
||||
pass
|
||||
|
||||
def init_runner_states(self, trainer, key, val):
|
||||
"""Initializes trainer's state.
|
||||
|
||||
:param trainer: Trainer which is using this hook
|
||||
:type trainer: :class:`Trainer`
|
||||
:param key: Key of reseting state
|
||||
:param val: Value of reseting state
|
||||
Args:
|
||||
trainer (:class:`Trainer`): Trainer which is using this hook.
key: Key of the state to be reset.
val: Value of the state to be reset.
|
||||
"""
|
||||
if key not in trainer.states:
|
||||
trainer.states[key] = val
|
||||
|
@ -16,14 +16,13 @@ from ._lr_scheduler_hook import LRSchedulerHook
|
||||
class SaveCheckpointHook(BaseHook):
|
||||
"""Saves the model by interval in training process.
|
||||
|
||||
:param interval: Saving interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
|
||||
:type checkpoint_dir: str, optional
|
||||
:param suffix: Saving suffix of the file, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
interval (int, optional): Saving interval, defaults to 1.
|
||||
checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
|
||||
suffix (str, optional): Saving suffix of the file, defaults to ''.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -71,18 +70,17 @@ class SaveCheckpointHook(BaseHook):
|
||||
class LoadCheckpointHook(BaseHook):
|
||||
"""Loads the model before training process.
|
||||
|
||||
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
|
||||
:type checkpoint_dir: str, optional
|
||||
:param epoch: Epoch number to be set, defaults to -1
|
||||
:type epoch: str, optional
|
||||
:param finetune: Whether allows to load a part of the model, defaults to False
|
||||
:type finetune: bool, optional
|
||||
:param strict: Whether loads a model that has the same shape of parameters, defaults to False
|
||||
:type strict: bool, optional
|
||||
:param suffix: Suffic, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
|
||||
epoch (str, optional): Loading checkpoint of setting epoch numbers, defaults to -1.
|
||||
Epoch equals to -1 means choosing the latest checkpoint.
|
||||
finetune (bool, optional): Whether allows to load a part of the model, defaults to False.
|
||||
strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint
|
||||
match the names of parameters and buffers in model, defaults to False.
|
||||
suffix (str, optional): Suffix of checkpoint file path, defaults to ''.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 0. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -25,13 +25,14 @@ def _format_number(val, prec=5):
|
||||
|
||||
|
||||
class LogByEpochHook(BaseHook):
|
||||
"""Hook to log by epoch
|
||||
"""Hook to log by epoch.
|
||||
|
||||
:param logger: Logger for the log
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
|
||||
interval (int, optional): Interval of printing log information, defaults to 1.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 1. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -48,10 +49,12 @@ class LogByEpochHook(BaseHook):
|
||||
|
||||
@HOOKS.register_module
|
||||
class LogMetricByStepHook(BaseHook):
|
||||
"""Hook to log metric by step
|
||||
"""Hook to log metric by step.
|
||||
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self, priority: int = 10):
|
||||
@ -74,11 +77,12 @@ class LogMetricByStepHook(BaseHook):
|
||||
class LogMetricByEpochHook(LogByEpochHook):
|
||||
"""Specialized hook to record the metric to log.
|
||||
|
||||
:param logger: Logger for the log
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
|
||||
interval (int, optional): Interval of printing log information, defaults to 1.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -116,14 +120,14 @@ class LogMetricByEpochHook(LogByEpochHook):
|
||||
class TensorboardHook(BaseHook):
|
||||
"""Specialized hook to record the metric to Tensorboard.
|
||||
|
||||
:param log_dir: Directory of log
|
||||
:type log_dir: str
|
||||
:param ranks: Ranks of processors
|
||||
:type ranks: typing.List
|
||||
:param parallel_mode: Parallel mode, defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL
|
||||
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
log_dir (str): Directory of log.
|
||||
ranks (list): Ranks of processors.
|
||||
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer,
|
||||
defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -200,18 +204,15 @@ class TensorboardHook(BaseHook):
|
||||
class LogTimingByEpochHook(LogByEpochHook):
|
||||
"""Specialized hook to write timing record to log.
|
||||
|
||||
:param timer: Timer for the hook
|
||||
:type timer: :class:`colossalai.utils.MultiTimer`
|
||||
:param logger: Logger for the log
|
||||
:type logger: :class:`colossalai.logging.DistributedLogger`
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param log_eval: Whether writes in evaluation, defaults to True
|
||||
:type log_eval: bool, optional
|
||||
:param ignore_num_train_steps: Number of training steps to ignore, defaults to 0
|
||||
:type ignore_num_train_steps: int, optional
|
||||
Args:
|
||||
timer (:class:`colossalai.utils.MultiTimer`): Timer for the hook.
|
||||
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
|
||||
interval (int, optional): Interval of printing log information, defaults to 1.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
log_eval (bool, optional): Whether writes in evaluation, defaults to True.
|
||||
ignore_num_train_steps (int, optional): Number of training steps to ignore, defaults to 0.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@ -270,14 +271,13 @@ class LogTimingByEpochHook(LogByEpochHook):
|
||||
class LogMemoryByEpochHook(LogByEpochHook):
|
||||
"""Specialized Hook to write memory usage record to log.
|
||||
|
||||
:param logger: Logger for the log
|
||||
:type logger: colossalai.logging.DistributedLogger
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param log_eval: Whether writes in evaluation, defaults to True
|
||||
:type log_eval: bool, optional
|
||||
Args:
|
||||
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
|
||||
interval (int, optional): Interval of printing log information, defaults to 1.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
log_eval (bool, optional): Whether writes in evaluation, defaults to True.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -6,15 +6,17 @@ from ._metric_hook import LearningRateMetric, MetricHook
|
||||
|
||||
@HOOKS.register_module
|
||||
class LRSchedulerHook(MetricHook):
|
||||
"""Build LR scheduler
|
||||
r"""Build LR scheduler for trainer.
|
||||
|
||||
:param lr_scheduler: LR scheduler
|
||||
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch
|
||||
:type by_epoch: bool
|
||||
:param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True`
|
||||
:type store_lr_in_state: bool, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
lr_scheduler (:class:`colossalai.nn.lr_scheduler`): The specific LR scheduler
|
||||
in range of ``colossalai.nn.lr_scheduler``, more details about ``lr_scheduler`` could be found in
|
||||
`lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_.
|
||||
by_epoch (bool): If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch.
|
||||
store_lr_in_state (bool, optional): If `True`, store the learning rate in each state, defaults to `True`.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front
|
||||
defaults to 1. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -17,13 +17,13 @@ from ._base_hook import BaseHook
|
||||
|
||||
class Metric(ABC):
|
||||
"""A basic class of metric collectors. It collects a specific
|
||||
metric during training or evaluation and it's always used with
|
||||
metric during training or evaluation and would always be used with
|
||||
:class:`MetricHook` to help it update its states and show the
|
||||
metric. So please use the corresponding hook class to make the metric
collector work.
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
Args:
|
||||
epoch_only (bool): Whether the metric only read for the full epoch.
|
||||
"""
|
||||
|
||||
def __init__(self, epoch_only: bool):
|
||||
@ -80,8 +80,8 @@ class Metric(ABC):
|
||||
class LossMetric(Metric):
|
||||
"""A metric collector for loss.
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
Args:
|
||||
epoch_only (bool): Whether the metric only read for the full epoch.
|
||||
"""
|
||||
|
||||
def __init__(self, epoch_only):
|
||||
@ -101,7 +101,8 @@ class LossMetric(Metric):
|
||||
"""Updates :attr:`last_step_loss` and :attr:`accum_loss` with current loss.
|
||||
It expects the output has loss.
|
||||
|
||||
:param loss: Current loss of the output
|
||||
Args:
|
||||
loss (:class:`torch.tensor`): Current loss of the output.
|
||||
"""
|
||||
# expect output to be logits, label and loss
|
||||
loss_ = loss.detach()
|
||||
@ -132,10 +133,9 @@ class LossMetric(Metric):
|
||||
class LearningRateMetric(Metric):
|
||||
"""A metric collector for learning rate.
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
:param initial_lr: Initial learning rate, defaults to 0.0
|
||||
:type initial_lr: float, optional
|
||||
Args:
|
||||
epoch_only (bool): Whether the metric only read for the full epoch.
|
||||
initial_lr (float, optional): Initial learning rate, defaults to 0.0.
|
||||
"""
|
||||
|
||||
def __init__(self, epoch_only: bool, initial_lr: float = 0.):
|
||||
@ -163,10 +163,9 @@ class AccuracyMetric(Metric):
|
||||
"""A metric collector for accuracy. It only works for classification
|
||||
tasks.
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
:param accuracy_func: Accuracy function for the classification task
|
||||
:type accuracy_func: :class:`typing.Callable`
|
||||
Args:
|
||||
epoch_only (bool): Whether the metric only read for the full epoch.
|
||||
accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
|
||||
"""
|
||||
|
||||
def __init__(self, epoch_only: bool, accuracy_func: Callable):
|
||||
@ -187,9 +186,10 @@ class AccuracyMetric(Metric):
|
||||
"""Updates last step accuracy and accumulated accuracy with current logits
|
||||
and labels. It expects the output has logits and labels.
|
||||
|
||||
:param logits: The logits output of the model
|
||||
:param targets: Real labels of the dataset
|
||||
:param batch_size: Batch size of the task
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): The logits output of the model.
|
||||
targets (:class:`torch.tensor`): Real labels of the dataset.
|
||||
batch_size (int): Batch size of the task.
|
||||
"""
|
||||
if isinstance(logits, (list, tuple)):
|
||||
logits = logits[0]
|
||||
@ -224,8 +224,10 @@ class MetricHook(BaseHook):
|
||||
update their states. Others are used to display and
|
||||
record the metric.
|
||||
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:type priority: int
|
||||
Args:
|
||||
priority (int): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@ -244,8 +246,10 @@ class MetricHook(BaseHook):
|
||||
class LossHook(MetricHook):
|
||||
"""Specialized hook class for :class:`Loss`.
|
||||
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 0. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self, priority: int = 0):
|
||||
@ -283,10 +287,11 @@ class LossHook(MetricHook):
|
||||
class AccuracyHook(MetricHook):
|
||||
"""Specialized hook class for :class:`Accuracy`.
|
||||
|
||||
:param accuracy_func: Priority in the printing, hooks with small priority will be printed in front
|
||||
:type accuracy_func: typing.Callable
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
|
||||
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 0. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
|
||||
def __init__(self, accuracy_func: Callable, priority: int = 0):
|
||||
@ -314,8 +319,8 @@ class AccuracyHook(MetricHook):
|
||||
class ThroughputMetric(Metric):
|
||||
"""Metric for :class:`Throughput`.
|
||||
|
||||
:param epoch_only: epoch only
|
||||
:type epoch_only: bool
|
||||
Args:
|
||||
epoch_only (bool): Whether the metric only read for the full epoch.
|
||||
"""
|
||||
def __init__(self, epoch_only: bool, ignored_steps: int = 0):
|
||||
super().__init__(epoch_only=epoch_only)
|
||||
@ -360,10 +365,13 @@ class ThroughputMetric(Metric):
|
||||
|
||||
@HOOKS.register_module
|
||||
class ThroughputHook(MetricHook):
|
||||
"""Specialized hook class for :class:`Throughput`.
|
||||
"""Specialized hook class for :class:`Throughput`. Hook to measure execution throughput (samples/sec).
|
||||
|
||||
:param priority: priority of throughput hook, defaults to 10
|
||||
:type priority: int, optional
|
||||
Args:
|
||||
ignored_steps (int, optional): The number of initial training steps to ignore.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
|
||||
defaults to 10. If different hooks share same priority, the order of printing would
|
||||
depend on the hooks order in the hook list.
|
||||
"""
|
||||
def __init__(self, ignored_steps: int = 0, priority: int = 10):
|
||||
super().__init__(priority)
|
||||
|
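A throughput metric of this kind typically divides accumulated sample counts by accumulated time, skipping the first few warm-up steps; a rough sketch, not the library's implementation:

class ThroughputSketch:
    """Accumulate (samples, seconds) pairs and report samples/sec, ignoring warm-up steps."""

    def __init__(self, ignored_steps: int = 0):
        self.ignored_steps = ignored_steps
        self.step = 0
        self.samples = 0
        self.seconds = 0.0

    def update(self, batch_size: int, step_time: float):
        self.step += 1
        if self.step > self.ignored_steps:
            self.samples += batch_size
            self.seconds += step_time

    def value(self) -> float:
        return self.samples / self.seconds if self.seconds > 0 else 0.0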
@ -114,10 +114,13 @@ class CheckpointFunction(torch.autograd.Function):
|
||||
|
||||
|
||||
def checkpoint(function, activation_offload ,*args):
|
||||
"""Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint
|
||||
"""Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint.
|
||||
|
||||
:param function: Describe the forward pass function. It should know how to handle the input tuples.
|
||||
:param args: Tuple containing the parameters of the function
|
||||
:return: Output of running function with provided args
|
||||
Args:
|
||||
function: Describe the forward pass function. It should know how to handle the input tuples.
|
||||
args (tuple): Tuple containing the parameters of the function.
|
||||
|
||||
Returns:
|
||||
Output of running function with provided args.
|
||||
"""
|
||||
return CheckpointFunction.apply(function, activation_offload, *args)
|
||||
|
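For comparison, the same pattern with PyTorch's built-in ``torch.utils.checkpoint`` (which the function above is adapted from); the extra ``activation_offload`` flag is specific to the Colossal-AI version and is not shown here:

import torch
from torch.utils.checkpoint import checkpoint

layer1 = torch.nn.Linear(128, 128)
layer2 = torch.nn.Linear(128, 10)

def block(x):
    # forward pass whose intermediate activations are recomputed in backward
    return layer2(torch.relu(layer1(x)))

x = torch.randn(4, 128, requires_grad=True)
out = checkpoint(block, x)   # activations inside `block` are not stored
out.sum().backward()         # they are recomputed during the backward pass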
@ -50,17 +50,17 @@ def _get_standard_checkpoint_filename(epoch: int, suffix: str = ''):
|
||||
|
||||
|
||||
def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''):
|
||||
"""This is a function to generate the checkpoint path from the (checkpoint_dir, epoch, suffix, gpu_parallel_rank) tuple.
|
||||
"""This is a function to generate the checkpoint path from the tuple
|
||||
(checkpoint_dir, epoch, suffix, gpu_parallel_rank).
|
||||
This is useful during generation and recuperation of the checkpoint.
|
||||
|
||||
:param checkpoint_dir: Set up a directory for saving checkpoints
|
||||
:type checkpoint_dir: str
|
||||
:param epoch: Epoch number (indicate how many epochs have you trained this model)
|
||||
:type epoch: int
|
||||
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:return: Checkpoint path to be generated
|
||||
:rtype: path
|
||||
Args:
|
||||
checkpoint_dir (str): Set up a directory for saving checkpoints.
|
||||
epoch (int): Epoch number (indicate how many epochs have you trained this model).
|
||||
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
|
||||
|
||||
Returns:
|
||||
str: The checkpoint path to be generated.
|
||||
"""
|
||||
ckpt_filename = _get_standard_checkpoint_filename(epoch, suffix)
|
||||
return os.path.join(checkpoint_dir, ckpt_filename)
|
||||
@ -74,12 +74,13 @@ def _ensure_directory_exists(filename: str):
|
||||
|
||||
|
||||
def get_latest_checkpoint_pattern(suffix: str = ''):
|
||||
"""Generate Regular expression of latest checkpoint's pattern
|
||||
"""Generate Regular expression of the latest checkpoint's pattern.
|
||||
|
||||
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:return: Checkpoint pattern
|
||||
:rtype: regular expression
|
||||
Args:
|
||||
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
|
||||
|
||||
Returns:
|
||||
str: The regular expression of checkpoint pattern.
|
||||
"""
|
||||
ranks_name = _get_ranks_name()
|
||||
pattern = r'epoch(\d+)-{}{}\.pt'.format(ranks_name, suffix)
|
||||
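To make the pattern concrete, a quick sketch of how such a regex picks the latest checkpoint out of a directory listing (the file names and ranks name are invented for illustration):

import re

# shape of the pattern built above, with a made-up ranks name
pattern = re.compile(r'epoch(\d+)-rank0\.pt')

files = ['epoch3-rank0.pt', 'epoch12-rank0.pt', 'epoch7-rank0.pt', 'notes.txt']
matches = [(int(m.group(1)), f) for f in files if (m := pattern.fullmatch(f))]
latest_epoch, latest_file = max(matches)
print(latest_file)  # epoch12-rank0.pt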
@ -88,16 +89,19 @@ def get_latest_checkpoint_pattern(suffix: str = ''):
|
||||
|
||||
|
||||
def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''):
|
||||
"""This is a function to retrieve the latest checkpoint path from the (checkpoint_dir, suffix, gpu_parallel_rank) tuple.
|
||||
"""This is a function to retrieve the latest checkpoint path from the tuple
|
||||
(checkpoint_dir, suffix, gpu_parallel_rank).
|
||||
This is useful during recuperation of the checkpoint, especially when you do not know the epoch number.
|
||||
|
||||
:param checkpoint_dir: Directory for saving checkpoints
|
||||
:type checkpoint_dir: str
|
||||
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:raises FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given
|
||||
:return: The latest checkpoint path to be retrieved
|
||||
:rtype: path
|
||||
Args:
|
||||
checkpoint_dir (str): Directory for saving checkpoints.
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
|
||||
|
||||
Returns:
|
||||
str: The latest retrieved checkpoint path.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given.
|
||||
"""
|
||||
CKPT_NAME_PAT = get_latest_checkpoint_pattern(suffix=suffix)
|
||||
|
||||
@ -126,22 +130,19 @@ def save_checkpoint(checkpoint_path: str,
|
||||
optimizer: torch.optim.Optimizer,
|
||||
lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
|
||||
**kwargs):
|
||||
"""Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as model,
|
||||
optimizer, lr_scheduler and etc. into a checkpoint dictionary.
|
||||
"""Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as
|
||||
model, optimizer, lr_scheduler etc. into a checkpoint dictionary.
|
||||
|
||||
This method can be used for both colosalai nn.BaseModel and normal pytorch nn.Module.
|
||||
This method can be used for both :class:`colossalai.nn.BaseModel` and normal :class:`torch.nn.Module`.
|
||||
|
||||
|
||||
:param checkpoint_path: Set up a directory for saving checkpoints
|
||||
:type checkpoint_path: str
|
||||
:param epoch: Epoch number (indicate how many epochs have you trained this model)
|
||||
:type epoch: int
|
||||
:param model: Model to be registered
|
||||
:type model: torch.nn.Module
|
||||
:param optimizer: Optimizer to be registered
|
||||
:type optimizer: torch.optim.Optimizer
|
||||
:param lr_scheduler: lr_scheduler to be registered, defaults to None
|
||||
:type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
|
||||
Args:
|
||||
checkpoint_path (str): Set up a directory for saving checkpoints.
|
||||
epoch (int): Epoch number (indicate how many epochs have you trained this model).
|
||||
model (:class:`torch.nn.Module`): Model to be registered.
|
||||
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to be registered.
|
||||
lr_scheduler (Union[:class:`torch.optim.lr_scheduler`,
|
||||
:class:`colossalai.nn.lr_scheduler`], optional): lr_scheduler to be registered, defaults to None.
|
||||
kwargs (dict): additional parameters to be saved.
|
||||
"""
|
||||
# for compatibility with normal pytorch nn.Module
|
||||
if hasattr(model, 'state_dict_for_save_checkpoint'):
|
||||
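A minimal sketch of what such a save routine assembles; the dictionary key names are assumptions for illustration, not necessarily the exact keys the library writes:

import torch

def save_checkpoint_sketch(path, epoch, model, optimizer, lr_scheduler=None, **extra):
    checkpoint = {
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        **extra,                      # any additional parameters passed as kwargs
    }
    if lr_scheduler is not None:
        checkpoint['lr_scheduler'] = lr_scheduler.state_dict()
    torch.save(checkpoint, path)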
@ -165,31 +166,31 @@ def load_checkpoint(checkpoint_path: str,
                    finetune: bool = False,
                    strict: bool = True) -> Tuple:
    """Loads the checkpoint file.

    If finetune is False, then we intend to continue/resume the training process from the checkpoint given.
    So we copy parameters and buffers from state_dict into these modules(model, optimizer,lr_scheduler)
    and its descendants.
    If finetune is True, then only the weights and buffers of model should be reload.
    If strict is True, then the keys of state_dict must exactly match the keys returned by this module's
    state_dict() function.
    and its descendants.

    :param checkpoint_path: The exact and matched checkpoint_path directory to retrieve appropriate state_dict
    :type checkpoint_path: str
    :param model: Model to reload parameters and buffers
    :type model: torch.nn.Module
    :param optimizer: Optimizer to recuperate
    :type optimizer: torch.optim.Optimizer
    :param lr_scheduler: lr_scheduler to recuperate, defaults to None
    :type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
    :param finetune: Whether to finetune the model with new dataset or continue the pre-training, defaults to False
    :type finetune: bool, optional
    :param strict: Whether to strictly enforce that the keys in
        :attr:`state_dict` of the checkpoint match the names of
        parameters and buffers in model., defaults to True
    :type strict: bool, optional
    :raises ValueError: Raise error if the model/optimizer cannot successfully be recuperated
    :return: (the epoch number of the checkpoint retrieved, the checkpoint retrieved)
    :rtype: Tuple
    If finetune is True, then only the weights and buffers of model should be reloaded.
    If strict is True, then the keys of state_dict must exactly match the keys returned
    by this module's state_dict() function.

    Args:
        checkpoint_path (str): The exact and matched checkpoint_path directory to retrieve appropriate state_dict.
        model (:class:`torch.nn.Module`): Model to reload parameters and buffers.
        optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional):
            lr_scheduler to recuperate, defaults to None.
        finetune (bool, optional): Whether to finetune the model with new dataset or
            continue the pre-training, defaults to False.
        strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict`
            of the checkpoint match the names of parameters and buffers in model, defaults to True.

    Returns:
        Tuple(int, ``checkpoint``): The tuple (the epoch number of the checkpoint retrieved, the checkpoint retrieved).

    Raises:
        ValueError: Raise error if the model/optimizer cannot successfully be recuperated
    """
    # Load the checkpoint.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
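
A minimal usage sketch for the two checkpoint helpers above; the import path `colossalai.utils` is an assumption, and the model/optimizer names are placeholders:

import torch
from colossalai.utils import save_checkpoint, load_checkpoint   # assumed export location

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Save after epoch 10; extra keyword arguments are stored alongside the state dicts.
save_checkpoint('ckpt_epoch_10.pt', 10, model, optimizer)

# Resume later: returns (epoch number of the checkpoint, the raw checkpoint dict).
start_epoch, checkpoint = load_checkpoint('ckpt_epoch_10.pt', model, optimizer, finetune=False, strict=True)
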
@ -27,10 +27,10 @@ from .multi_tensor_apply import multi_tensor_applier
def print_rank_0(msg: str, logger=None):
    """Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.

    :param msg: A string message to output
    :type msg: str
    :param logger: Python logger object, defaults to None
    :type logger: optional
    Args:
        msg (str): A string message to output.
        logger (:class:`colossalai.logging.DistributedLogger`, optional):
            The logger to record the message, defaults to None.
    """
    if gpc.get_global_rank() == 0:
        if logger is None:
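
A one-line usage sketch, assuming `print_rank_0` is re-exported from `colossalai.utils`:

from colossalai.utils import print_rank_0   # assumed export location

print_rank_0('validation finished')   # printed once, by the global rank-0 process only
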
@ -53,12 +53,15 @@ def free_port():


def sync_model_param(model, parallel_mode):
    """Make sure data parameters are consistent during Data Parallel Mode
    r"""Make sure data parameters are consistent during Data Parallel Mode.

    :param model: A pyTorch nn.model on whose parameters you check the consistency
    :param parallel_mode: Parallel mode to be checked
    :type model: torch.nn.Module
    :type parallel_mode: colossalai.context.ParallelMode
    Args:
        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked.

    Note:
        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
    """
    if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
        for param in model.parameters():
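
A short sketch of the call documented above; it assumes distributed training has already been initialized (e.g. via `colossalai.launch`) and that the function is importable from `colossalai.utils`:

from colossalai.context import ParallelMode
from colossalai.utils import sync_model_param   # assumed export location

# Broadcast parameters within the data-parallel group so every replica starts from identical weights.
sync_model_param(model, ParallelMode.DATA)
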
@ -146,18 +149,19 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    """Clips gradient norm of an iterable of parameters whose gradients are in fp32.

    This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.
    added functionality to handle model parallel parameters.

    :param parameters: An iterable of Tensors or a single Tensor that will have gradients normalized
    :type parameters: (Iterable[Tensor] or Tensor)
    :param max_norm: Max norm of the gradients
    :type max_norm: float or int
    :param norm_type: Type of the used p-norm. Can be ``'inf'`` for infinity norm.
    :type norm_type: float or int
    Note:
        the gradients are modified in place.

    :return: Total norm of the parameters (viewed as a single vector).
    :rtype: float
    Args:
        parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`):
            An iterable of Tensors or a single Tensor that will have gradients normalized.
        max_norm (Union[float, int]): Max norm of the gradients.
        norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm.

    Returns:
        float: Total norm of the parameters.
    """

    if isinstance(parameters, torch.Tensor):
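
A minimal training-step sketch using the helper above (`model`, `criterion`, `data` and `target` are placeholders; the import path is assumed):

from colossalai.utils import clip_grad_norm_fp32   # assumed export location

output = model(data)
loss = criterion(output, target)
loss.backward()
# Clip before stepping; gradients are modified in place and the model-parallel-aware total norm is returned.
total_norm = clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=2)
optimizer.step()
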
@ -19,18 +19,15 @@ T_co = TypeVar('T_co', covariant=True)

@DATA_SAMPLERS.register_module
class DataParallelSampler(Sampler):
    """A data sampler for distributed data parallelism
    """A data sampler for distributed data parallelism.

    :param dataset: A Dataset instance
    :type dataset: torch.utils.data.Dataset
    :param shuffle: Whether to shuffle data, defaults to False
    :type shuffle: bool, optional
    :param seed: The random seed, defaults to 0
    :type seed: int, optional
    :param drop_last: Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch
        size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller,
        defaults to False
    :type drop_last: bool, optional
    Args:
        dataset (:class:`torch.utils.data.Dataset`): The Dataset for sampling.
        shuffle (bool, optional): Whether to shuffle data, defaults to False.
        seed (int, optional): The random seed used for sampling, defaults to 0.
        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
            is not divisible by the batch size. If False and the size of dataset is not divisible by
            the batch size, then the last batch will be smaller, defaults to False.
    """

    def __init__(self,
@ -104,8 +101,8 @@ class DataParallelSampler(Sampler):
        use a different random ordering for each epoch. Otherwise, the next iteration of this
        sampler will yield the same ordering.

        :param epoch: Epoch number.
        :type epoch: int
        Args:
            epoch (int): Epoch number.
        """
        self.epoch = epoch
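
A sketch of how the sampler is typically driven; the keyword names follow the docstring above, while `train_set` and `NUM_EPOCHS` are placeholders:

from torch.utils.data import DataLoader

sampler = DataParallelSampler(train_set, shuffle=True, seed=0, drop_last=True)
loader = DataLoader(train_set, batch_size=32, sampler=sampler)

for epoch in range(NUM_EPOCHS):
    sampler.set_epoch(epoch)    # re-seed so every epoch uses a different shuffle order
    for batch in loader:
        ...
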
@ -118,29 +115,27 @@ def get_dataloader(dataset,
                   pin_memory=False,
                   num_workers=0,
                   **kwargs):
    """Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
    r"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)

    .. note:: When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
        on the 1st stage and label on the last stage
    Note:
        When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
        on the 1st stage and label on the last stage.

    :param dataset: A :class:`torch.utils.data.Dataset` object
    :param shuffle: Whether to shuffle the dataset
    :param seed: Random worker seed, defaults to 1024
    :param add_sampler: Add DistributedDataParallelSampelr to the dataset
    :param drop_last: Drop the last incomplete batch of data
    :param pin_memory: Whether to pin memory address in CPU memory
    :param num_workers: Number of worker threads for this dataloader
    Args:
        dataset (:class:`torch.utils.data.Dataset`): The dataset to be loaded.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
        seed (int, optional): Random worker seed for sampling, defaults to 1024.
        add_sampler (bool, optional): Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
            is not divisible by the batch size. If False and the size of dataset is not divisible by
            the batch size, then the last batch will be smaller, defaults to False.
        pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
        num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
        kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
            `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.

    :type dataset: :class:`torch.utils.data.Dataset`
    :type shuffle: bool, optional. Default is False
    :type seed: int, optional. Default is 1024
    :type add_sampler: bool, optional. Default is True
    :type drop_last: bool, optional. Default is False
    :type pin_memory: bool, optional. Default is False
    :type num_workers: int, optional. Default is 0

    :return: A object of :class:`torch.utils.data.DataLoader`
    :rtype: :class:`torch.utils.data.DataLoader`
    Returns:
        :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
    """
    _kwargs = kwargs.copy()
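
A usage sketch built from the parameters documented above; `train_set` is a placeholder and the import path is assumed:

from colossalai.utils import get_dataloader   # assumed export location

train_loader = get_dataloader(train_set,
                              shuffle=True,       # keep False when pipeline parallelism is enabled
                              seed=1024,
                              add_sampler=True,   # attaches the distributed sampler shown earlier
                              drop_last=True,
                              pin_memory=True,
                              num_workers=4,
                              batch_size=32)      # forwarded to torch.utils.data.DataLoader via **kwargs
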
@ -13,19 +13,25 @@ def accumulate_gradient(model: nn.Module,
                        accumulate_size: int,
                        gradient_handlers: List[BaseGradientHandler] = None,
                        lr_scheduler: _LRScheduler = None):
    """
    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param dataloader: your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    :param gradient_handlers: list of gradient handler objects. Default is None
    :type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
    :param lr_scheduler: your lr scheduler object. Default is None
    :type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
    r"""Turn the model, optimizer and dataloader into the corresponding objects for gradient accumulation.

    Args:
        model (:class:`torch.nn.Module`): your model object for gradient accumulation.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object for gradient accumulation.
        dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
            your dataloader object, would be called like iter(dataloader)
        accumulate_size (int): the number of steps to accumulate gradients
        gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
            list of gradient handler objects. Default is None.
        lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
            your ``lr_scheduler`` object for gradient accumulation. Defaults to None.

    More details about `gradient_handlers` could be found in
    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.

    More details about `lr_scheduler` could be found in
    `lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_ and
    `how to adjust learning rate <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
    """
    optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
    dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
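
A hedged sketch of calling the helper above; the hunk shows the optimizer and dataloader being wrapped, and the return order assumed here (optimizer, dataloader, gradient handlers, lr_scheduler) is not visible in this diff:

# 'train_loader' and 'scheduler' are placeholders.
optimizer, train_loader, gradient_handlers, scheduler = accumulate_gradient(
    model=model,
    optimizer=optimizer,
    dataloader=train_loader,
    accumulate_size=4,          # parameters are updated once every 4 steps
    lr_scheduler=scheduler)
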
@ -15,15 +15,13 @@ from colossalai.engine import BaseGradientHandler

class GradAccumOptimizer(ColossalaiOptimizer):
    """A wrapper for the optimizer to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param optim: Your optimizer object
    :type optim: :class:`torch.optim.Optimizer`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    :param model: Your model object to check if it is DDP for special handling of no_sync() context
    :type model: :class:`torch.nn.Module`
    before accumulation size is reached.

    Args:
        optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
        model (:class:`torch.nn.Module`):
            Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
    """

    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
@ -76,18 +74,18 @@ class GradAccumOptimizer(ColossalaiOptimizer):


class GradAccumDataloader:
    """A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps.
    """A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.

    For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will
    be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
    Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
    (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.

    :param dataloader: Your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    Note:
        The dataloader would drop the last incomplete steps for gradient accumulation.
        For example, if a dataloader has 10 batches of data and accumulate size is 4. The model parameters will
        be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
        Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
        (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.

    Args:
        dataloader (``Iterable``): Your dataloader object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
    """

    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
@ -125,13 +123,12 @@ class GradAccumDataloader:

class GradAccumLrSchedulerByStep(_LRScheduler):
    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param lr_scheduler: Your lr scheduler object
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    before accumulation size is reached.

    Args:
        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
            Your ``lr_scheduler`` object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
    """

    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
@ -171,13 +168,16 @@ class GradAccumLrSchedulerByStep(_LRScheduler):


class GradAccumGradientHandler:
    """A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached
    r"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached.

    :param grad_handler: Your gradient handler object
    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    Args:
        grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
            Your ``gradient_handler`` object for gradient accumulation, would be called when reaching `accumulate_size`.
        accumulate_size (int): The number of steps to accumulate gradients.

    More details about ``gradient_handlers`` could be found in
    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.

    """

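
The wrappers can also be sketched directly, using the constructor signatures and docstrings visible in this hunk (`train_loader`, `scheduler` and `grad_handler` are placeholders):

opt = GradAccumOptimizer(optimizer, accumulate_size=4, model=model)
loader = GradAccumDataloader(train_loader, accumulate_size=4)
sched = GradAccumLrSchedulerByStep(scheduler, accumulate_size=4)
handler = GradAccumGradientHandler(grad_handler, accumulate_size=4)   # argument names follow the docstring above
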
@ -14,12 +14,13 @@ from typing import Optional


def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
    """
    Get the free memory info of device.
    :param device: a torch device instance or None
    :type device: Optional[torch.device]
    :return: current memory usage, sized by Byte
    :rtype: int
    """Get the free memory info of device.

    Args:
        device (Optional[``torch.device``]): a torch device instance or None. Defaults None.

    Returns:
        int: current memory usage, sized by Byte.
    """
    if device:
        assert device.type == 'cuda'
@ -34,7 +35,7 @@ def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:


def bytes_to_GB(val, decimal=2):
    """A byte-to-Gigabyte converter, defaultly using binary notation.
    """A byte-to-Gigabyte converter, using binary notation by default.

    :param val: X bytes to convert
    :return: X' GB
@ -43,7 +44,7 @@ def bytes_to_GB(val, decimal=2):


def bytes_to_MB(val, decimal=2):
    """A byte-to-Megabyte converter, defaultly using binary notation.
    """A byte-to-Megabyte converter, using binary notation by default.

    :param val: X bytes to convert
    :return: X' MB
@ -54,13 +55,13 @@ def bytes_to_MB(val, decimal=2):
def report_memory_usage(message, logger=None, report_cpu=False):
    """Calculate and print RAM usage (in GB)

    :param message: A prefix message to add in the log
    :type message: str
    :param logger: An instance of :class:`colossalai.logging.DistributedLogger`
    :type logger: :class:`colossalai.logging.DistributedLogger`, optional
    :param report_cpu: Whether to report CPU memory
    :type report_cpu: bool, optional
    :raises EnvironmentError: Raise error if no distributed environment has been initialized
    Args:
        message (str): A prefix message to add in the log.
        logger (:class:`colossalai.logging.DistributedLogger`): The logger used to record memory information.
        report_cpu (bool, optional): Whether to report CPU memory.

    Raises:
        EnvironmentError: Raise error if no distributed environment has been initialized.
    """
    if not gpc.is_initialized(ParallelMode.GLOBAL):
        raise EnvironmentError("No distributed environment is initialized")
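
A short sketch of the memory helpers above; the import path is assumed:

from colossalai.utils import colo_cuda_memory_used, report_memory_usage   # assumed export location

used_bytes = colo_cuda_memory_used()                      # CUDA memory currently used on the default device, in bytes
report_memory_usage('after forward', report_cpu=False)    # logs GPU usage with this message as the prefix
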
@ -12,8 +12,8 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
    size of every parameter. Since the parameters in data parallelism is replicated
    in each GPU, we set their ep_size to 1.

    :param model: A pyTorch nn.model from which we get dict
    :type model: torch.nn.Module
    Args:
        model (:class:`torch.nn.Module`): A pyTorch `nn.Module` from which we get dict.
    """
    epsize_param_dict = dict()
    for param in model.parameters():
@ -29,10 +29,10 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]


def sync_moe_model_param(model: nn.Module):
    """Make sure model parameters are consistent in MoE parallel context
    """Make sure model parameters are consistent in MoE parallel context.

    :param model: A pyTorch nn.model on whose parameters you check the consistency
    :type model: torch.nn.Module
    Args:
        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
    """
    if is_using_ddp():


@ -3,10 +3,10 @@


class MultiTensorApply(object):
    """
    Apply an operation to a list of tensors efficiently
    Apply an operation to a list of tensors efficiently.

    :param chunk_size: Size of a chunk
    :type chunk_size: int
    Args:
        chunk_size (int): Size of a chunk.
    """

    available = False
@ -9,6 +9,7 @@ from collections import defaultdict
LINE_WIDTH = 108
LINE = '-' * LINE_WIDTH + '\n'


class TensorDetector():
    def __init__(self,
                 show_info: bool = True,
@ -16,17 +17,14 @@ class TensorDetector():
                 include_cpu: bool = False,
                 module: Optional[nn.Module] = None
                 ):
        """This class is an detector to detect tensor on different devices.

        :param show_info: whether to print the info on screen, default True
        :type show_info: bool
        :param log: the file name to save the log
        :type log: str
        :param include_cpu: whether to detect tensor on cpu, default False
        :type include_cpu: bool
        :param module: when sending an `nn.Module` it, the detector can name the tensors detected better
        :type module: Optional[nn.Module]
        """This class is a detector to detect tensor on different devices.

        Args:
            show_info (bool, optional): whether to print the info on screen, default True.
            log (str, optional): the file name to save the log. Defaults to None.
            include_cpu (bool, optional): whether to detect tensor on cpu, default False.
            module (Optional[:class:`nn.Module`]): when sending an ``nn.Module`` object,
                the detector can name the tensors detected better.
        """
        self.show_info = show_info
        self.log = log
@ -48,7 +46,6 @@ class TensorDetector():
            self.tensor_info[id(param)].append(param.requires_grad)
            self.tensor_info[id(param)].append(param.dtype)
            self.tensor_info[id(param)].append(self.get_tensor_mem(param))

    def get_tensor_mem(self, tensor):
        # calculate the memory occupied by a tensor
@ -58,7 +55,6 @@ class TensorDetector():
            memory_size += grad_memory_size
        return self.mem_format(memory_size)

    def mem_format(self, real_memory_size):
        # format the tensor memory into a reasonable magnitude
        if real_memory_size >= 2 ** 30:
@ -68,7 +64,6 @@ class TensorDetector():
        if real_memory_size >= 2 ** 10:
            return str(real_memory_size / (2 ** 10)) + ' KB'
        return str(real_memory_size) + ' B'

    def collect_tensors_state(self):
        for obj in gc.get_objects():
@ -116,7 +111,6 @@ class TensorDetector():
            if obj.device not in self.devices:
                self.devices.append(obj.device)

    def print_tensors_state(self):
        template_format = '{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}'
        self.info += LINE
@ -173,7 +167,6 @@ class TensorDetector():
        if self.log is not None:
            with open(self.log + '.log', 'a') as f:
                f.write(self.info)

    def detect(self, include_cpu = False):
        self.include_cpu = include_cpu
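
A sketch of the detector's intended use, based on the constructor arguments and the `detect()` method shown in this hunk (`model` and `inputs` are placeholders):

detector = TensorDetector(show_info=True, log='tensor_report', include_cpu=False, module=model)
detector.detect()                 # snapshot live tensors and print/log the per-device table
loss = model(inputs).sum()        # run some work
loss.backward()
detector.detect()                 # call again to see which tensors appeared or changed
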
@ -25,7 +25,7 @@ class Timer:
        return time.time()

    def start(self):
        """Fisrtly synchronize cuda, reset the clock and then start the timer.
        """Firstly synchronize cuda, reset the clock and then start the timer.
        """
        self._elapsed = 0
        synchronize()
@ -40,10 +40,11 @@ class Timer:
    def stop(self, keep_in_history: bool = False):
        """Stop the timer and record the start-stop time interval.

        :param keep_in_history: Whether does it record into history each start-stop interval, defaults to False
        :type keep_in_history: bool, optional
        :return: Start-stop interval
        :rtype: int
        Args:
            keep_in_history (bool, optional): Whether to record this start-stop interval
                in the history, defaults to False.
        Returns:
            int: Start-stop interval.
        """
        synchronize()
        end_time = time.time()
@ -57,26 +58,27 @@ class Timer:
    def get_history_mean(self):
        """Mean of all history start-stop time intervals.

        :return: Mean of time intervals
        :rtype: int
        Returns:
            int: Mean of time intervals.
        """
        return sum(self._history) / len(self._history)

    def get_history_sum(self):
        """Add up all the start-stop time intervals.

        :return: Sum of time intervals
        :rtype: int
        Returns:
            int: Sum of time intervals.
        """
        return sum(self._history)

    def get_elapsed_time(self):
        """Return the last start-stop time interval.

        .. note:: Use it only when timer is not in progress
        Returns:
            int: The last time interval.

        :return: The last time interval
        :rtype: int
        Note:
            Use it only when timer is not in progress
        """
        assert not self._started, 'Timer is still in progress'
        return self._elapsed
@ -90,10 +92,10 @@ class Timer:


class MultiTimer:
    """An object contains multiple timers
    """An object that contains multiple timers.

    :param on: Whether the timer is enabled. Default is True
    :type on: bool, optional
    Args:
        on (bool, optional): Whether the timer is enabled. Default is True.
    """

    def __init__(self, on: bool = True):
@ -101,10 +103,10 @@ class MultiTimer:
        self._timers = dict()

    def start(self, name: str):
        """Start namely one of the timers
        """Start namely one of the timers.

        :param name: Timer's key
        :type name: str
        Args:
            name (str): Timer's key.
        """
        if self._on:
            if name not in self._timers:
@ -114,10 +116,9 @@ class MultiTimer:
    def stop(self, name: str, keep_in_history: bool):
        """Stop namely one of the timers.

        :param name: Timer's key
        :type name: str
        :param keep_in_history: Whether does it record into history each start-stop interval
        :type keep_in_history: bool
        Args:
            name (str): Timer's key.
            keep_in_history (bool): Whether to record this start-stop interval in the history.
        """
        if self._on:
            return self._timers[name].stop(keep_in_history)
@ -127,17 +128,19 @@ class MultiTimer:
    def get_timer(self, name):
        """Get timer by its name (from multitimer)

        :param name: Timer's key
        :return: Timer with the name you give correctly
        :rtype: Timer
        Args:
            name (str): Timer's key.
        Returns:
            :class:`colossalai.utils.Timer`: The timer with the given name.
        """
        return self._timers[name]

    def reset(self, name=None):
        """Reset timers.

        :param name: If name is designated, the named timer will be reset and others will not, defaults to None
        :type name: optional
        Args:
            name (str, optional): If name is designated, the named timer will be reset
                and others will not, defaults to None.
        """
        if self._on:
            if name is not None:
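
A usage sketch for the timers documented above, assuming `MultiTimer` is importable from `colossalai.utils` (`model` and `data` are placeholders):

from colossalai.utils import MultiTimer   # assumed export location

timer = MultiTimer(on=True)
timer.start('forward')
output = model(data)
timer.stop('forward', keep_in_history=True)

fwd_timer = timer.get_timer('forward')
print(fwd_timer.get_history_mean())       # mean of the recorded forward intervals
timer.reset('forward')
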
@ -1,74 +1,74 @@
from functools import partial
import pytest
import torch
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
import colossalai
from colossalai.utils import free_port, get_current_device
from colossalai.nn.layer.moe import Top1Router, UniformNoiseGenerator, MoeLayer, Experts
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.utils.moe import sync_moe_model_param
from colossalai.engine.gradient_handler import MoeGradientHandler
from colossalai.testing import assert_equal_in_group
from colossalai.testing import rerun_on_exception

BATCH_SIZE = 4
DIM = 16
CONFIG = dict()


def run_test(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    expert_module = nn.Linear
    expert_factor = dict(in_features=DIM, out_features=DIM, device=get_current_device())

    MOE_CONTEXT.setup(42)    # MOE initialization
    noisy_func = UniformNoiseGenerator()
    router = Top1Router(noisy_func=noisy_func)
    num_experts_list = [1, 2, 4]
    layer_list = []
    for num_experts in num_experts_list:
        exp = Experts(expert_module, num_experts, **expert_factor)
        moe_layer = MoeLayer(DIM, num_experts, router, exp)
        layer_list.append(moe_layer)

    model = nn.Sequential(*layer_list)
    model = model.to(get_current_device())
    sync_moe_model_param(model)

    dist_dict = MOE_CONTEXT.parallel_info_dict
    assert_equal_in_group(layer_list[0].experts.experts[0].weight.data, dist_dict[1].dp_group)
    assert_equal_in_group(layer_list[1].experts.experts[0].weight.data, dist_dict[2].dp_group)
    # MoE model synchronization passed

    grad_handler = MoeGradientHandler(model, 0)

    rank = dist.get_rank()
    torch.cuda.manual_seed(78 + rank)
    data = torch.randn(BATCH_SIZE, DIM, device=get_current_device())
    grad = torch.randn_like(data)

    MOE_CONTEXT.reset_loss()
    outputs = model(data)
    outputs.backward(grad)
    grad_handler.handle_gradient()

    assert_equal_in_group(layer_list[0].experts.experts[0].weight.grad, dist_dict[1].dp_group)
    assert_equal_in_group(layer_list[0].experts.experts[0].bias.grad, dist_dict[1].dp_group)

    assert_equal_in_group(layer_list[1].experts.experts[0].weight.grad, dist_dict[2].dp_group)
    assert_equal_in_group(layer_list[1].experts.experts[0].bias.grad, dist_dict[2].dp_group)
    # MoE grad handler test passed


@pytest.mark.dist
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_grad_handler():
    world_size = 4
    run_func = partial(run_test, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_grad_handler()
@ -1,104 +1,104 @@
from functools import partial
import pytest
import torch
import torch.nn as nn
import torch.multiprocessing as mp
import colossalai
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils import free_port, get_current_device
from colossalai.nn.layer.moe import Top1Router, Top2Router, MoeLayer, Experts
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.testing import rerun_on_exception

BATCH_SIZE = 16
NUM_EXPERTS = 4
CONFIG = dict()


def check_equal(tensor_a, tensor_b, atol=1e-06):
    assert torch.allclose(tensor_a, tensor_b, rtol=0, atol=atol) is True


def run_routing(rank, world_size, port, rs=2, hidden_size=128, data_type=torch.float32, router=Top2Router):
    # Here we do not need TF32, since it brings absolute error on results
    torch.backends.cuda.matmul.allow_tf32 = False

    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    local_rank = gpc.get_local_rank(ParallelMode.GLOBAL)

    MOE_CONTEXT.setup(42)    # MOE environment initialization
    MOE_CONTEXT.reset_loss()
    torch.manual_seed(rs + local_rank)    # set each process has different random seed

    # get randomized data
    tokens = torch.randn(BATCH_SIZE, hidden_size, dtype=data_type, device=get_current_device(), requires_grad=True)

    expert_module = nn.Linear
    expert_factor = dict(in_features=hidden_size, out_features=hidden_size, device=get_current_device())
    expert = Experts(expert_module, NUM_EXPERTS, **expert_factor)
    layer = MoeLayer(hidden_size, NUM_EXPERTS, router(capacity_factor_train=1.0), expert)
    if data_type == torch.float16:
        layer = layer.half()

    # use matrix multiplication instead of COL_MOE_KERNL in MOE dispatch and combine
    layer.use_kernel = False
    old_out = layer(tokens)
    ech = old_out.shape
    grad = torch.randn(ech, device=get_current_device())
    old_out.backward(grad)    # get gradient

    # save all results
    o_tk_grad = tokens.grad.data.clone()
    o_gt_grad = layer.gate.weight.grad.data.clone()

    # reset all gradients
    tokens.grad.zero_()
    layer.gate.weight.grad.zero_()

    layer.use_kernel = True
    new_out = layer(tokens)    # get outputs through colossal kernel

    if data_type == torch.float32:
        check_equal(old_out, new_out)
    else:
        check_equal(old_out, new_out, 1e-2)
    # forward function passed

    new_out.backward(grad)    # get new type gradient
    n_tk_grad = tokens.grad.data.clone()
    n_gt_grad = layer.gate.weight.grad.data.clone()

    if data_type == torch.float32:
        check_equal(o_tk_grad, n_tk_grad)
    else:
        check_equal(o_tk_grad, n_tk_grad, 1e-2)
    # tokens gradient is correct

    if data_type == torch.float32:
        check_equal(o_gt_grad, n_gt_grad, 5e-05)
    else:
        check_equal(o_gt_grad, n_gt_grad, 2e-01)
    # bias gradient is correct


@pytest.mark.dist
@pytest.mark.parametrize("rs", [131])
@pytest.mark.parametrize("hidden_size", [32, 144])
@pytest.mark.parametrize("data_type", [torch.float32, torch.float16])
@pytest.mark.parametrize("router", [Top1Router, Top2Router])
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_moe_kernel(rs, hidden_size, data_type, router):
    world_size = 4
    run_func = partial(run_routing,
                       world_size=world_size,
                       port=free_port(),
                       rs=rs,
                       hidden_size=hidden_size,
                       data_type=data_type,
                       router=router)
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_moe_kernel(2, 256, torch.float16, Top2Router)
@ -1,71 +1,71 @@
from functools import partial
import pytest
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
import colossalai
from colossalai.utils import free_port, get_current_device
from colossalai.nn.layer.moe import Experts
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.utils.moe import sync_moe_model_param
from colossalai.testing import assert_equal_in_group, rerun_on_exception

D_MODEL = 4
D_FF = 8
CONFIG = dict()


def run_test(rank, port):
    world_size = 4
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    expert_module = nn.Linear
    expert_factor = dict(in_features=D_MODEL, out_features=D_FF, device=get_current_device())

    MOE_CONTEXT.setup(42)    # MOE environment initialization
    exp0 = Experts(expert_module, 1, **expert_factor)
    exp1 = Experts(expert_module, 2, **expert_factor)
    exp2 = Experts(expert_module, 4, **expert_factor)
    exp3 = Experts(expert_module, 8, **expert_factor)

    assert exp0.num_local_experts == 1
    assert exp1.num_local_experts == 1
    assert exp2.num_local_experts == 1
    assert exp3.num_local_experts == 2
    # experts deployment passed

    parallel_info_dict = MOE_CONTEXT.parallel_info_dict
    rank = dist.get_rank()

    assert len(parallel_info_dict) == 3
    assert dist.get_rank(parallel_info_dict[4].ep_group) == rank
    assert dist.get_rank(parallel_info_dict[2].ep_group) == rank % 2
    assert dist.get_rank(parallel_info_dict[1].ep_group) == 0

    assert dist.get_rank(parallel_info_dict[4].dp_group) == 0
    assert dist.get_rank(parallel_info_dict[2].dp_group) == rank // 2
    assert dist.get_rank(parallel_info_dict[1].dp_group) == rank
    # group creation passed

    model = nn.ModuleList([exp0, exp1, exp2, exp3])
    model = model.to(get_current_device())
    sync_moe_model_param(model)

    assert_equal_in_group(exp0.experts[0].weight.data, parallel_info_dict[1].dp_group)
    assert_equal_in_group(exp0.experts[0].bias.data, parallel_info_dict[1].dp_group)
    # MOE experts layout success when ep_size = 1

    assert_equal_in_group(exp1.experts[0].weight.data, parallel_info_dict[2].dp_group)
    assert_equal_in_group(exp1.experts[0].bias.data, parallel_info_dict[2].dp_group)
    # MOE experts layout success when ep_size = 2


@pytest.mark.dist
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_moe_initialization():
    world_size = 4
    run_func = partial(run_test, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_moe_initialization()