Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-09 04:50:17 +00:00)
Refactored docstring to google style
@@ -114,10 +114,13 @@ class CheckpointFunction(torch.autograd.Function):
def checkpoint(function, activation_offload ,*args):
    """Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint
    """Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint.

    :param function: Describe the forward pass function. It should know how to handle the input tuples.
    :param args: Tuple containing the parameters of the function
    :return: Output of running function with provided args
    Args:
        function: Describe the forward pass function. It should know how to handle the input tuples.
        args (list): Tuple containing the parameters of the function

    Returns:
        Output of running function with provided args.
    """
    return CheckpointFunction.apply(function, activation_offload, *args)
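A minimal usage sketch of the checkpoint API shown in this hunk, assuming it is importable from colossalai.utils and that Block is a user-defined module (both names are illustrative):

import torch.nn as nn
from colossalai.utils import checkpoint   # assumed import path

class Block(nn.Module):
    def __init__(self, dim=256):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))

    def forward(self, x):
        # Recompute self.net's activations during backward instead of storing them;
        # the second argument is activation_offload (False keeps activations on GPU).
        return checkpoint(self.net, False, x)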
@@ -50,17 +50,17 @@ def _get_standard_checkpoint_filename(epoch: int, suffix: str = ''):
def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''):
    """This is a function to generate the checkpoint path from the (checkpoint_dir, epoch, suffix, gpu_parallel_rank) tuple.
    """This is a function to generate the checkpoint path from the tuple
    (checkpoint_dir, epoch, suffix, gpu_parallel_rank).
    This is useful during generation and recuperation of the checkpoint.

    :param checkpoint_dir: Set up a directory for saving checkpoints
    :type checkpoint_dir: str
    :param epoch: Epoch number (indicate how many epochs have you trained this model)
    :type epoch: int
    :param suffix: Additional notation to specify the model or checkpoint, defaults to ''
    :type suffix: str, optional
    :return: Checkpoint path to be generated
    :rtype: path
    Args:
        checkpoint_dir (str): Set up a directory for saving checkpoints.
        epoch (int): Epoch number (indicate how many epochs have you trained this model).
        suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''

    Returns:
        str: The checkpoint path to be generated.
    """
    ckpt_filename = _get_standard_checkpoint_filename(epoch, suffix)
    return os.path.join(checkpoint_dir, ckpt_filename)
@@ -74,12 +74,13 @@ def _ensure_directory_exists(filename: str):
def get_latest_checkpoint_pattern(suffix: str = ''):
    """Generate Regular expression of latest checkpoint's pattern
    """Generate Regular expression of the latest checkpoint's pattern.

    :param suffix: Additional notation to specify the model or checkpoint, defaults to ''
    :type suffix: str, optional
    :return: Checkpoint pattern
    :rtype: regular expression
    Args:
        suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.

    Returns:
        str: The regular expression of checkpoint pattern.
    """
    ranks_name = _get_ranks_name()
    pattern = r'epoch(\d+)-{}{}\.pt'.format(ranks_name, suffix)
@@ -88,16 +89,19 @@ def get_latest_checkpoint_pattern(suffix: str = ''):
def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''):
    """This is a function to retrieve the latest checkpoint path from the (checkpoint_dir, suffix, gpu_parallel_rank) tuple.
    """This is a function to retrieve the latest checkpoint path from the tuple
    (checkpoint_dir, suffix, gpu_parallel_rank).
    This is useful during recuperation of the checkpoint, especially when you do not know the epoch number.

    :param checkpoint_dir: Directory for saving checkpoints
    :type checkpoint_dir: str
    :param suffix: Additional notation to specify the model or checkpoint, defaults to ''
    :type suffix: str, optional
    :raises FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given
    :return: The latest checkpoint path to be retrieved
    :rtype: path
    Args:
        checkpoint_dir (str): Directory for saving checkpoints
        suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''

    Returns:
        str: The latest retrieved checkpoint path.

    Raises:
        FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given.
    """
    CKPT_NAME_PAT = get_latest_checkpoint_pattern(suffix=suffix)
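A quick usage sketch combining the two path helpers above (directory and suffix values are illustrative):

ckpt_path = get_checkpoint_path('./checkpoints', epoch=10, suffix='_vit')    # e.g. ./checkpoints/epoch10-<ranks>_vit.pt
latest_path = get_latest_checkpoint_path('./checkpoints', suffix='_vit')     # raises FileNotFoundError if nothing matches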
@@ -126,22 +130,19 @@ def save_checkpoint(checkpoint_path: str,
                    optimizer: torch.optim.Optimizer,
                    lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
                    **kwargs):
    """Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as model,
    optimizer, lr_scheduler and etc. into a checkpoint dictionary.
    """Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as
    model, optimizer, lr_scheduler etc. into a checkpoint dictionary.

    This method can be used for both colosalai nn.BaseModel and normal pytorch nn.Module.
    This method can be used for both :class:`colossalai.nn.BaseModel` and normal :class:`torch.nn.Module`.

    :param checkpoint_path: Set up a directory for saving checkpoints
    :type checkpoint_path: str
    :param epoch: Epoch number (indicate how many epochs have you trained this model)
    :type epoch: int
    :param model: Model to be registered
    :type model: torch.nn.Module
    :param optimizer: Optimizer to be registered
    :type optimizer: torch.optim.Optimizer
    :param lr_scheduler: lr_scheduler to be registered, defaults to None
    :type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
    Args:
        checkpoint_path (str): Set up a directory for saving checkpoints.
        epoch (int): Epoch number (indicate how many epochs have you trained this model).
        model (:class:`torch.nn.Module`): Model to be registered.
        optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to be registered.
        lr_scheduler (Union[:class:`torch.optim.lr_scheduler`,
            :class:`colossalai.nn.lr_scheduler`], optional): lr_scheduler to be registered, defaults to None.
        kwargs (dict): additional parameters to be saved.
    """
    # for compatibility with normal pytorch nn.Module
    if hasattr(model, 'state_dict_for_save_checkpoint'):
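A hypothetical end-of-epoch call to the function documented above; epoch, model, optimizer, lr_scheduler and best_acc are assumed to exist in the training script, and best_acc ends up in the checkpoint dictionary via **kwargs:

ckpt_path = get_checkpoint_path('./checkpoints', epoch=epoch)
save_checkpoint(ckpt_path, epoch, model, optimizer, lr_scheduler=lr_scheduler, best_acc=best_acc)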
@@ -165,31 +166,31 @@ def load_checkpoint(checkpoint_path: str,
                    finetune: bool = False,
                    strict: bool = True) -> Tuple:
    """Loads the checkpoint file.

    If finetune is False, then we intend to continue/resume the training process from the checkpoint given.
    So we copy parameters and buffers from state_dict into these modules(model, optimizer,lr_scheduler)
    and its descendants.
    If finetune is True, then only the weights and buffers of model should be reload.
    If strict is True, then the keys of state_dict must exactly match the keys returned by this module’s
    state_dict() function.
    and its descendants.

    :param checkpoint_path: The exact and matched checkpoint_path directory to retrieve appropriate state_dict
    :type checkpoint_path: str
    :param model: Model to reload parameters and buffers
    :type model: torch.nn.Module
    :param optimizer: Optimizer to recuperate
    :type optimizer: torch.optim.Optimizer
    :param lr_scheduler: lr_scheduler to recuperate, defaults to None
    :type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
    :param finetune: Whether to finetune the model with new dataset or continue the pre-training, defaults to False
    :type finetune: bool, optional
    :param strict: Whether to strictly enforce that the keys in
        :attr:`state_dict` of the checkpoint match the names of
        parameters and buffers in model., defaults to True
    :type strict: bool, optional
    :raises ValueError: Raise error if the model/optimizer cannot successfully be recuperated
    :return: (the epoch number of the checkpoint retrieved, the checkpoint retrieved)
    :rtype: Tuple
    If finetune is True, then only the weights and buffers of model should be reloaded.
    If strict is True, then the keys of state_dict must exactly match the keys returned
    by this module’s state_dict() function.

    Args:
        checkpoint_path (str): The exact and matched checkpoint_path directory to retrieve appropriate state_dict.
        model (:class:`torch.nn.Module`): Model to reload parameters and buffers.
        optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional):
            lr_scheduler to recuperate, defaults to None.
        finetune (bool, optional): Whether to finetune the model with new dataset or
            continue the pre-training, defaults to False.
        strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict`
            of the checkpoint match the names of parameters and buffers in model, defaults to True.

    Returns:
        Tuple(int, ``checkpoint``): The tuple (the epoch number of the checkpoint retrieved, the checkpoint retrieved).

    Raises:
        ValueError: Raise error if the model/optimizer cannot successfully be recuperated
    """
    # Load the checkpoint.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
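A sketch of resuming training with the function above; num_epochs and the per-epoch training step are assumptions made for illustration:

last_epoch, ckpt = load_checkpoint(get_latest_checkpoint_path('./checkpoints'),
                                   model, optimizer, lr_scheduler=lr_scheduler,
                                   finetune=False, strict=True)
for epoch in range(last_epoch + 1, num_epochs):
    train_one_epoch(model, optimizer)    # hypothetical training step defined elsewhere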
@@ -27,10 +27,10 @@ from .multi_tensor_apply import multi_tensor_applier
def print_rank_0(msg: str, logger=None):
    """Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.

    :param msg: A string message to output
    :type msg: str
    :param logger: Python logger object, defaults to None
    :type logger: optional
    Args:
        msg (str): A string message to output.
        logger (:class:`colossalai.logging.DistributedLogger`, optional):
            The logger to record the message, defaults to None.
    """
    if gpc.get_global_rank() == 0:
        if logger is None:
@@ -53,12 +53,15 @@ def free_port():
def sync_model_param(model, parallel_mode):
    """Make sure data parameters are consistent during Data Parallel Mode
    r"""Make sure data parameters are consistent during Data Parallel Mode.

    :param model: A pyTorch nn.model on whose parameters you check the consistency
    :param parallel_mode: Parallel mode to be checked
    :type model: torch.nn.Module
    :type parallel_mode: colossalai.context.ParallelMode
    Args:
        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked.

    Note:
        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
    """
    if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
        for param in model.parameters():
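A rough sketch of the idea behind the synchronization described above, under the assumption that consistency is achieved by broadcasting one rank's copy of each parameter across the process group (this is not the library's exact code):

import torch.distributed as dist

def _sync_params_sketch(model, group, src_rank=0):
    # src_rank is assumed to be the global rank that holds the reference copy.
    for param in model.parameters():
        dist.broadcast(param.data, src=src_rank, group=group)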
@@ -146,18 +149,19 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    """Clips gradient norm of an iterable of parameters whose gradients are in fp32.

    This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.
    added functionality to handle model parallel parameters.

    :param parameters: An iterable of Tensors or a single Tensor that will have gradients normalized
    :type parameters: (Iterable[Tensor] or Tensor)
    :param max_norm: Max norm of the gradients
    :type max_norm: float or int
    :param norm_type: Type of the used p-norm. Can be ``'inf'`` for infinity norm.
    :type norm_type: float or int
    Note:
        the gradients are modified in place.

    :return: Total norm of the parameters (viewed as a single vector).
    :rtype: float
    Args:
        parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`):
            An iterable of Tensors or a single Tensor that will have gradients normalized.
        max_norm (Union[float, int]): Max norm of the gradients.
        norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm.

    Returns:
        float: Total norm of the parameters.
    """

    if isinstance(parameters, torch.Tensor):
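For reference, a simplified sketch of gradient clipping by total 2-norm; the model-parallel reduction that the real function adds is omitted here:

import torch

def _clip_grad_norm_sketch(parameters, max_norm, norm_type=2.0):
    grads = [p.grad for p in parameters if p.grad is not None]
    # Total norm of all gradients viewed as a single vector.
    total_norm = torch.norm(torch.stack([torch.norm(g, norm_type) for g in grads]), norm_type)
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for g in grads:
            g.mul_(clip_coef)    # gradients are modified in place, as the Note above states
    return float(total_norm)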
@@ -19,18 +19,15 @@ T_co = TypeVar('T_co', covariant=True)
@DATA_SAMPLERS.register_module
class DataParallelSampler(Sampler):
    """A data sampler for distributed data parallelism
    """A data sampler for distributed data parallelism.

    :param dataset: A Dataset instance
    :type dataset: torch.utils.data.Dataset
    :param shuffle: Whether to shuffle data, defaults to False
    :type shuffle: bool, optional
    :param seed: The random seed, defaults to 0
    :type seed: int, optional
    :param drop_last: Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch
        size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller,
        defaults to False
    :type drop_last: bool, optional
    Args:
        dataset (:class:`torch.utils.data.Dataset`): The Dataset for sampling.
        shuffle (bool, optional): Whether to shuffle data, defaults to False.
        seed (int, optional): The random seed used for sampling, defaults to 0.
        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
            is not divisible by the batch size. If False and the size of dataset is not divisible by
            the batch size, then the last batch will be smaller, defaults to False.
    """

    def __init__(self,
@@ -104,8 +101,8 @@ class DataParallelSampler(Sampler):
        use a different random ordering for each epoch. Otherwise, the next iteration of this
        sampler will yield the same ordering.

        :param epoch: Epoch number.
        :type epoch: int
        Args:
            epoch (int): Epoch number.
        """
        self.epoch = epoch
@@ -118,29 +115,27 @@ def get_dataloader(dataset,
                   pin_memory=False,
                   num_workers=0,
                   **kwargs):
    """Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
    r"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)

    .. note:: When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
        on the 1st stage and label on the last stage
    Note:
        When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
        on the 1st stage and label on the last stage.

    :param dataset: A :class:`torch.utils.data.Dataset` object
    :param shuffle: Whether to shuffle the dataset
    :param seed: Random worker seed, defaults to 1024
    :param add_sampler: Add DistributedDataParallelSampelr to the dataset
    :param drop_last: Drop the last incomplete batch of data
    :param pin_memory: Whether to pin memory address in CPU memory
    :param num_workers: Number of worker threads for this dataloader
    Args:
        dataset (:class:`torch.utils.data.Dataset`): The dataset to be loaded.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
        seed (int, optional): Random worker seed for sampling, defaults to 1024.
        add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
            is not divisible by the batch size. If False and the size of dataset is not divisible by
            the batch size, then the last batch will be smaller, defaults to False.
        pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
        num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
        kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
            `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.

    :type dataset: :class:`torch.utils.data.Dataset`
    :type shuffle: bool, optional. Default is False
    :type seed: int, optional. Default is 1024
    :type add_sampler: bool, optional. Default is True
    :type drop_last: bool, optional. Default is False
    :type pin_memory: bool, optional. Default is False
    :type num_workers: int, optional. Default is 0

    :return: A object of :class:`torch.utils.data.DataLoader`
    :rtype: :class:`torch.utils.data.DataLoader`
    Returns:
        :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
    """
    _kwargs = kwargs.copy()
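A hypothetical call to the function above; the dataset and batch size are only illustrative, and batch_size is assumed to pass through **kwargs to torch.utils.data.DataLoader:

from torchvision import transforms
from torchvision.datasets import CIFAR10

train_set = CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())
train_loader = get_dataloader(train_set, shuffle=True, seed=1024, add_sampler=True,
                              drop_last=True, pin_memory=True, num_workers=4, batch_size=128)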
@@ -13,19 +13,25 @@ def accumulate_gradient(model: nn.Module,
                        accumulate_size: int,
                        gradient_handlers: List[BaseGradientHandler] = None,
                        lr_scheduler: _LRScheduler = None):
    """
    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param dataloader: your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    :param gradient_handlers: list of gradient handler objects. Default is None
    :type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
    :param lr_scheduler: your lr scheduler object. Default is None
    :type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
    r"""Turning model, optimizer, dataloader into corresponding object for gradient accumulation.

    Args:
        model (:class:`torch.nn.Module`): your model object for gradient accumulation.
        optimizer (:class:`torch.optim.Optimizer`): your optimizer object for gradient accumulation.
        dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
            your dataloader object, would be called like iter(dataloader)
        accumulate_size (int): the number of steps to accumulate gradients
        gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
            list of gradient handler objects. Default is None.
        lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
            your ``lr_scheduler`` object for gradient accumulation. Defaults to None.

    More details about `gradient_handlers` could be found in
    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.

    More details about `lr_scheduler` could be found
    `lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_. and
    `how to adjust learning rate <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
    """
    optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
    dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
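A sketch of how the wrappers created above might be used in a training loop; the return order, the criterion, and the pre-existing model/optimizer/train_loader are all assumptions:

optimizer, dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(
    model, optimizer, train_loader, accumulate_size=4, lr_scheduler=lr_scheduler)

for img, label in dataloader:            # the wrapper drops trailing batches that cannot form a full cycle
    loss = criterion(model(img), label)  # criterion assumed defined elsewhere
    loss.backward()
    optimizer.step()                     # the wrapper is assumed to defer the real update to every 4th call
    optimizer.zero_grad()                # likewise assumed to be deferred to the accumulation boundary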
@@ -15,15 +15,13 @@ from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer):
    """A wrapper for the optimizer to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param optim: Your optimizer object
    :type optim: :class:`torch.optim.Optimizer`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    :param model: Your model object to check if it is DDP for special handling of no_sync() context
    :type model: :class:`torch.nn.Module`
    before accumulation size is reached.

    Args:
        optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
        model (:class:`torch.nn.Module`):
            Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
    """

    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
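A minimal sketch of the skip-until-threshold behaviour this wrapper documents (not the library's actual implementation):

class _AccumOptimizerSketch:
    def __init__(self, optim, accumulate_size):
        self.optim = optim
        self.accumulate_size = accumulate_size
        self.count = 0

    def step(self):
        self.count += 1
        if self.count % self.accumulate_size == 0:
            self.optim.step()        # real parameter update at the accumulation boundary
            self.optim.zero_grad()   # clear the accumulated gradients
        # otherwise do nothing: gradients from backward() keep accumulating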
@@ -76,18 +74,18 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumDataloader:
    """A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps.
    """A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.

    For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will
    be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
    Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
    (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.

    :param dataloader: Your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    Note:
        The dataloader would drop the last incomplete steps for gradient accumulation.
        For example, if a dataloader has 10 batches of data and accumulate size is 4. The model parameters will
        be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
        Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
        (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.

    Args:
        optim (``Iterable``): Your dataloader object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
    """

    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
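The arithmetic behind the 10-batch example in the docstring above, shown explicitly:

num_batches, accumulate_size = 10, 4
usable_batches = num_batches - num_batches % accumulate_size   # 8 batches are yielded
num_updates = usable_batches // accumulate_size                # 2 parameter updates (steps 4 and 8)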
@@ -125,13 +123,12 @@ class GradAccumDataloader:
class GradAccumLrSchedulerByStep(_LRScheduler):
    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param lr_scheduler: Your lr scheduler object
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    before accumulation size is reached.

    Args:
        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
            Your ``lr_scheduler`` object for gradient accumulation.
        accumulate_size (int): The number of steps to accumulate gradients.
    """

    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
@@ -171,13 +168,16 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class GradAccumGradientHandler:
    """A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached
    r"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached.

    :param grad_handler: Your gradient handler object
    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
    :param accumulate_size: The number of steps to accumulate gradients
    :type accumulate_size: int
    Args:
        grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
            Your ``gradient_handler`` object for gradient accumulation, would be called when achieving `accumulate_size`.
        accumulate_size (int): The number of steps to accumulate gradients.

    More details about ``gradient_handlers`` could be found in
    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.

    """
@@ -14,12 +14,13 @@ from typing import Optional
def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
    """
    Get the free memory info of device.
    :param device: a torch device instance or None
    :type device: Optional[torch.device]
    :return: current memory usage, sized by Byte
    :rtype: int
    """Get the free memory info of device.

    Args:
        device (Optional[``torch.device``]): a torch device instance or None. Defaults None.

    Returns:
        int: current memory usage, sized by Byte.
    """
    if device:
        assert device.type == 'cuda'
@@ -34,7 +35,7 @@ def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
def bytes_to_GB(val, decimal=2):
    """A byte-to-Gigabyte converter, defaultly using binary notation.
    """A byte-to-Gigabyte converter, default using binary notation.

    :param val: X bytes to convert
    :return: X' GB
@@ -43,7 +44,7 @@ def bytes_to_GB(val, decimal=2):
def bytes_to_MB(val, decimal=2):
    """A byte-to-Megabyte converter, defaultly using binary notation.
    """A byte-to-Megabyte converter, default using binary notation.

    :param val: X bytes to convert
    :return: X' MB
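Binary notation here means powers of two, so the converters above presumably divide by 2**30 and 2**20 respectively; a small sketch of the GB case:

def _bytes_to_gb_sketch(val, decimal=2):
    # 1 GB = 2 ** 30 bytes in binary notation
    return round(val / 2 ** 30, decimal)

_bytes_to_gb_sketch(3 * 2 ** 30)    # -> 3.0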
@@ -54,13 +55,13 @@ def bytes_to_MB(val, decimal=2):
def report_memory_usage(message, logger=None, report_cpu=False):
    """Calculate and print RAM usage (in GB)

    :param message: A prefix message to add in the log
    :type message: str
    :param logger: An instance of :class:`colossalai.logging.DistributedLogger`
    :type logger: :class:`colossalai.logging.DistributedLogger`, optional
    :param report_cpu: Whether to report CPU memory
    :type report_cpu: bool, optional
    :raises EnvironmentError: Raise error if no distributed environment has been initialized
    Args:
        message (str): A prefix message to add in the log.
        logger (:class:`colossalai.logging.DistributedLogger`): The logger used to record memory information.
        report_cpu (bool, optional): Whether to report CPU memory.

    Raises:
        EnvironmentError: Raise error if no distributed environment has been initialized.
    """
    if not gpc.is_initialized(ParallelMode.GLOBAL):
        raise EnvironmentError("No distributed environment is initialized")
@@ -12,8 +12,8 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
    size of every parameter. Since the parameters in data parallelism is replicated
    in each GPU, we set their ep_size to 1.

    :param model: A pyTorch nn.model from which we get dict
    :type model: torch.nn.Module
    Args:
        model (:class:`torch.nn.Module`): A pyTorch `nn.Module` from which we get dict.
    """
    epsize_param_dict = dict()
    for param in model.parameters():
@@ -29,10 +29,10 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
def sync_moe_model_param(model: nn.Module):
    """Make sure model parameters are consistent in MoE parallel context
    """Make sure model parameters are consistent in MoE parallel context.

    :param model: A pyTorch nn.model on whose parameters you check the consistency
    :type model: torch.nn.Module
    Args:
        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
    """
    if is_using_ddp():
@@ -3,10 +3,10 @@
class MultiTensorApply(object):
    """
    Apply an operation to a list of tensors efficiently
    Apply an operation to a list of tensors efficiently.

    :param chunk_size: Size of a chunk
    :type chunk_size: int
    Args:
        chunk_size (int): Size of a chunk.
    """

    available = False
@@ -9,6 +9,7 @@ from collections import defaultdict
LINE_WIDTH = 108
LINE = '-' * LINE_WIDTH + '\n'


class TensorDetector():
    def __init__(self,
                 show_info: bool = True,
@@ -16,17 +17,14 @@ class TensorDetector():
                 include_cpu: bool = False,
                 module: Optional[nn.Module] = None
                 ):
        """This class is an detector to detect tensor on different devices.

        :param show_info: whether to print the info on screen, default True
        :type show_info: bool
        :param log: the file name to save the log
        :type log: str
        :param include_cpu: whether to detect tensor on cpu, default False
        :type include_cpu: bool
        :param module: when sending an `nn.Module` it, the detector can name the tensors detected better
        :type module: Optional[nn.Module]
        """This class is a detector to detect tensor on different devices.

        Args:
            show_info (bool, optional): whether to print the info on screen, default True.
            log (str, optional): the file name to save the log. Defaults to None.
            include_cpu (bool, optional): whether to detect tensor on cpu, default False.
            module (Optional[:class:`nn.Module`]): when sending an ``nn.Module`` object,
                the detector can name the tensors detected better.
        """
        self.show_info = show_info
        self.log = log
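A hypothetical usage of the detector configured above; the model is only illustrative, and detect() is the method shown later in this diff:

import torch.nn as nn

model = nn.Linear(1024, 1024).cuda()
detector = TensorDetector(show_info=True, log='tensor_log', include_cpu=False, module=model)
detector.detect()    # prints the live-tensor table and appends it to tensor_log.log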
@@ -48,7 +46,6 @@ class TensorDetector():
            self.tensor_info[id(param)].append(param.requires_grad)
            self.tensor_info[id(param)].append(param.dtype)
            self.tensor_info[id(param)].append(self.get_tensor_mem(param))

    def get_tensor_mem(self, tensor):
        # calculate the memory occupied by a tensor
@@ -58,7 +55,6 @@ class TensorDetector():
            memory_size += grad_memory_size
        return self.mem_format(memory_size)

    def mem_format(self, real_memory_size):
        # format the tensor memory into a reasonal magnitude
        if real_memory_size >= 2 ** 30:
@@ -68,7 +64,6 @@ class TensorDetector():
        if real_memory_size >= 2 ** 10:
            return str(real_memory_size / (2 ** 10)) + ' KB'
        return str(real_memory_size) + ' B'

    def collect_tensors_state(self):
        for obj in gc.get_objects():
@@ -116,7 +111,6 @@ class TensorDetector():
            if obj.device not in self.devices:
                self.devices.append(obj.device)

    def print_tensors_state(self):
        template_format = '{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}'
        self.info += LINE
@@ -173,7 +167,6 @@ class TensorDetector():
        if self.log is not None:
            with open(self.log + '.log', 'a') as f:
                f.write(self.info)

    def detect(self, include_cpu = False):
        self.include_cpu = include_cpu
@@ -25,7 +25,7 @@ class Timer:
        return time.time()

    def start(self):
        """Fisrtly synchronize cuda, reset the clock and then start the timer.
        """Firstly synchronize cuda, reset the clock and then start the timer.
        """
        self._elapsed = 0
        synchronize()
@@ -40,10 +40,11 @@ class Timer:
    def stop(self, keep_in_history: bool = False):
        """Stop the timer and record the start-stop time interval.

        :param keep_in_history: Whether does it record into history each start-stop interval, defaults to False
        :type keep_in_history: bool, optional
        :return: Start-stop interval
        :rtype: int
        Args:
            keep_in_history (bool, optional): Whether does it record into history
                each start-stop interval, defaults to False.

        Returns:
            int: Start-stop interval.
        """
        synchronize()
        end_time = time.time()
@@ -57,26 +58,27 @@ class Timer:
    def get_history_mean(self):
        """Mean of all history start-stop time intervals.

        :return: Mean of time intervals
        :rtype: int
        Returns:
            int: Mean of time intervals
        """
        return sum(self._history) / len(self._history)

    def get_history_sum(self):
        """Add up all the start-stop time intervals.

        :return: Sum of time intervals
        :rtype: int
        Returns:
            int: Sum of time intervals.
        """
        return sum(self._history)

    def get_elapsed_time(self):
        """Return the last start-stop time interval.

        .. note:: Use it only when timer is not in progress
        Returns:
            int: The last time interval.

        :return: The last time interval
        :rtype: int
        Note:
            Use it only when timer is not in progress
        """
        assert not self._started, 'Timer is still in progress'
        return self._elapsed
@@ -90,10 +92,10 @@ class Timer:
class MultiTimer:
    """An object contains multiple timers
    """An object contains multiple timers.

    :param on: Whether the timer is enabled. Default is True
    :type on: bool, optional
    Args:
        on (bool, optional): Whether the timer is enabled. Default is True.
    """

    def __init__(self, on: bool = True):
@@ -101,10 +103,10 @@ class MultiTimer:
        self._timers = dict()

    def start(self, name: str):
        """Start namely one of the timers
        """Start namely one of the timers.

        :param name: Timer's key
        :type name: str
        Args:
            name (str): Timer's key.
        """
        if self._on:
            if name not in self._timers:
@@ -114,10 +116,9 @@ class MultiTimer:
    def stop(self, name: str, keep_in_history: bool):
        """Stop namely one of the timers.

        :param name: Timer's key
        :type name: str
        :param keep_in_history: Whether does it record into history each start-stop interval
        :type keep_in_history: bool
        Args:
            name (str): Timer's key.
            keep_in_history (bool): Whether does it record into history each start-stop interval.
        """
        if self._on:
            return self._timers[name].stop(keep_in_history)
@@ -127,17 +128,19 @@ class MultiTimer:
    def get_timer(self, name):
        """Get timer by its name (from multitimer)

        :param name: Timer's key
        :return: Timer with the name you give correctly
        :rtype: Timer
        Args:
            name (str): Timer's key.

        Returns:
            :class:`colossalai.utils.Timer`: Timer with the name you give correctly.
        """
        return self._timers[name]

    def reset(self, name=None):
        """Reset timers.

        :param name: If name is designated, the named timer will be reset and others will not, defaults to None
        :type name: optional
        Args:
            name (str, optional): If name is designated, the named timer will be reset
                and others will not, defaults to None.
        """
        if self._on:
            if name is not None:
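A hypothetical usage of MultiTimer built from the methods documented above; model and data are assumed to exist elsewhere:

timer = MultiTimer(on=True)
timer.start('forward')
output = model(data)                                       # model/data assumed defined elsewhere
fwd_time = timer.stop('forward', keep_in_history=True)     # interval for this start-stop pair
mean_fwd = timer.get_timer('forward').get_history_mean()   # average over recorded intervals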