mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-15 22:19:38 +00:00

update examples and sphinx docs for the new api (#63)
@@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
        added functionality to handle model parallel parameters. Note that
        the gradients are modified in place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.
    :param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
    :type parameters: (Iterable[Tensor] or Tensor)
    :param max_norm: max norm of the gradients
    :type max_norm: float or int
    :param norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm.
    :type norm_type: float or int

    Returns:
        Total norm of the parameters (viewed as a single vector).
    :return: Total norm of the parameters (viewed as a single vector).
    :rtype: float
    """

    if isinstance(parameters, torch.Tensor):

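A minimal usage sketch for this function. The import path and the model/optimizer/criterion objects are assumptions supplied by the surrounding training script, not part of this diff:

    # Sketch only: clip gradients in fp32 before the optimizer step.
    from colossalai.utils import clip_grad_norm_fp32

    def train_step(model, optimizer, criterion, data, target, max_norm=1.0):
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        # Gradients are modified in place; the total norm is returned as a float.
        total_norm = clip_grad_norm_fp32(model.parameters(), max_norm=max_norm, norm_type=2)
        optimizer.step()
        return loss.item(), total_norm
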
@@ -123,12 +123,23 @@ def get_dataloader(dataset,
        stage and label on the last stage

    :param dataset: a :class:utils.data.dataset dataset
    :param shuffle: whether to shuffle the dataset
    :param seed: random worker seed, defaults to 1024
    :type seed: int, optional
    :param add_sampler_if_possible: [description], defaults to False
    :type add_sampler_if_possible: bool, optional
    :return: a :class:utils.data.dataset dataloader
    :rtype: torch.utils.data.dataset
    :param add_sampler: add DistributedDataParallelSampler to the dataset
    :param drop_last: drop the last incomplete batch of data
    :param pin_memory: whether to pin memory address in CPU memory
    :param num_workers: number of worker threads for this dataloader

    :type dataset: :class:`torch.utils.data.Dataset`
    :type shuffle: bool, optional. Default is False
    :type seed: int, optional. Default is 1024
    :type add_sampler: bool, optional. Default is True
    :type drop_last: bool, optional. Default is False
    :type pin_memory: bool, optional. Default is False
    :type num_workers: int, optional. Default is 0

    :return: an object of :class:`torch.utils.data.DataLoader`
    :rtype: :class:`torch.utils.data.DataLoader`
    '''
    _kwargs = kwargs.copy()

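A usage sketch, assuming get_dataloader is exposed from colossalai.utils and that extra keyword arguments (such as batch_size) are forwarded to torch.utils.data.DataLoader; the dataset below is a placeholder:

    from torchvision import datasets, transforms
    from colossalai.utils import get_dataloader

    train_dataset = datasets.CIFAR10(root='./data', train=True, download=True,
                                     transform=transforms.ToTensor())
    # add_sampler=True attaches a distributed sampler so each rank sees a distinct shard.
    train_dataloader = get_dataloader(train_dataset,
                                      shuffle=True,
                                      add_sampler=True,
                                      drop_last=True,
                                      pin_memory=True,
                                      num_workers=2,
                                      batch_size=64)
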
@@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module,
                        accumulate_size: int,
                        gradient_handlers: List[BaseGradientHandler] = None,
                        lr_scheduler: _LRScheduler = None):
    """
    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param dataloader: your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    :param gradient_handlers: list of gradient handler objects. Default is None
    :type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
    :param lr_scheduler: your lr scheduler object. Default is None
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    """
    optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
    dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)

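A sketch of how this helper might be wired into a training loop. The import path and the return signature (a tuple of the wrapped optimizer, dataloader, gradient handlers and LR scheduler) are assumptions, not confirmed by this diff:

    from colossalai.engine.gradient_accumulation import accumulate_gradient

    optimizer, dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(
        model=model,
        optimizer=optimizer,
        dataloader=train_dataloader,
        accumulate_size=4,
        lr_scheduler=lr_scheduler)

    for data, label in dataloader:       # the last incomplete cycle of batches is dropped
        loss = criterion(model(data), label)
        optimizer.backward(loss)         # assumed ColossalaiOptimizer-style backward
        optimizer.step()                 # parameters update only every 4th call
        lr_scheduler.step()              # LR advances only every 4th call
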
@@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler


class GradAccumOptimizer(ColossalaiOptimizer):
    """A wrapper for the optimizer to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param optim: your optimizer object
    :type optim: :class:`torch.optim.Optimizer`
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    :param model: your model object to check if it is DDP for special handling of no_sync() context
    :type model: :class:`torch.nn.Module`

    """

    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
        super().__init__(optim)

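The step-skipping pattern behind this wrapper, sketched in plain PyTorch as an illustration of the idea (not the actual GradAccumOptimizer implementation):

    class StepSkippingOptimizer:
        """Accumulate gradients and only apply them every `accumulate_size` calls."""

        def __init__(self, optim, accumulate_size):
            self.optim = optim
            self.accumulate_size = accumulate_size
            self.accumulate_step = 0

        def backward(self, loss):
            # Scale the loss so the accumulated gradient matches a large-batch gradient.
            (loss / self.accumulate_size).backward()

        def step(self):
            self.accumulate_step += 1
            if self.accumulate_step % self.accumulate_size == 0:
                self.optim.step()        # real parameter update
                self.optim.zero_grad()   # reset for the next accumulation cycle
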
@@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer):


class GradAccumDataloader():
    """A wrapper for the dataloader to enable gradient accumulation by dropping the last incomplete steps.

    For example, if a dataloader has 10 batches of data and the accumulate size is 4, the model parameters will
    be updated only twice, at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
    Thus, they will be automatically skipped by this class. If the dataloader is not a standard PyTorch dataloader
    (e.g. a DALI dataloader), this class will automatically consume (load and discard) the remaining 2 batches.

    :param dataloader: your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int

    """

    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
        self.dataloader = dataloader

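The number of usable batches is simply the largest multiple of accumulate_size that fits, as in this small sketch (an illustration of the arithmetic, not the class's actual code):

    def usable_batches(num_batches: int, accumulate_size: int) -> int:
        # Drop the trailing batches that cannot complete an accumulation cycle.
        return (num_batches // accumulate_size) * accumulate_size

    usable_batches(10, 4)  # -> 8: updates happen at steps 4 and 8, batches 9 and 10 are skipped
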
@@ -99,6 +123,15 @@ class GradAccumDataloader():


class GradAccumLrSchedulerByStep(_LRScheduler):
    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param lr_scheduler: your lr scheduler object
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int

    """

    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
        self.lr_scheduler = lr_scheduler

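A short usage sketch; the optimizer is a placeholder and the wrapper is assumed to forward step() to the inner scheduler only once per accumulation cycle, as the class description states:

    from torch.optim.lr_scheduler import CosineAnnealingLR

    base_scheduler = CosineAnnealingLR(optimizer, T_max=100)
    scheduler = GradAccumLrSchedulerByStep(base_scheduler, accumulate_size=4)

    for step in range(8):
        scheduler.step()  # the underlying learning rate only advances at calls 4 and 8
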
@@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler):


class GradAccumGradientHandler():
    """A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param grad_handler: your gradient handler object
    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int

    """

    def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
        assert isinstance(grad_handler, BaseGradientHandler), \

@@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False):

    :param message: a prefix message to add in the log
    :type message: str
    :param logger: an instance of :class:`colossalai.logging.DistributedLogger`
    :type logger: :class:`colossalai.logging.DistributedLogger`
    :param report_cpu: whether to report CPU memory
    :type report_cpu: bool
    :raises EnvironmentError: raised if no distributed environment has been initialized
    '''
    if not gpc.is_initialized(ParallelMode.GLOBAL):

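A minimal usage sketch, assuming report_memory_usage is importable from colossalai.utils and that a distributed environment has already been launched (otherwise it raises EnvironmentError):

    from colossalai.utils import report_memory_usage

    # Log GPU (and optionally CPU) memory usage after the model is built.
    report_memory_usage("after model initialization", report_cpu=True)
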
@@ -2,6 +2,13 @@


class MultiTensorApply(object):
    """
    Apply an operation to a list of tensors efficiently

    :param chunk_size: size of a chunk
    :type chunk_size: int
    """

    available = False
    warned = False

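The idea behind a multi-tensor apply is to process many tensors in fixed-size chunks instead of launching one operation per tensor. A rough illustration in plain Python (not the actual fused-kernel implementation used here):

    import torch

    def chunked_apply(op, tensors, chunk_size):
        """Apply `op` to `tensors` one chunk at a time."""
        for start in range(0, len(tensors), chunk_size):
            chunk = tensors[start:start + chunk_size]
            op(chunk)  # e.g. a fused kernel acting on the whole chunk at once

    # Example: scale a list of gradients in chunks of 4 tensors.
    grads = [torch.randn(10) for _ in range(10)]
    chunked_apply(lambda ts: [t.mul_(0.5) for t in ts], grads, chunk_size=4)
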
@@ -74,6 +74,9 @@ class Timer:


class MultiTimer:
    '''An object containing multiple timers

    :param on: whether the timer is enabled. Default is True
    :type on: bool
    '''

    def __init__(self, on: bool = True):

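A short usage sketch; the start/stop method names and their arguments are assumptions based on typical timer utilities, not confirmed by this diff:

    timer = MultiTimer(on=True)

    timer.start('forward')    # begin timing a named section
    output = model(data)      # placeholder for the code being timed
    timer.stop('forward', keep_in_history=True)
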