mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-22 01:48:07 +00:00
Refactored docstring to google style
This commit is contained in:
@@ -19,20 +19,37 @@ class Engine:
|
||||
:meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.
|
||||
It controls an iteration in training.
|
||||
|
||||
:param model: The neural network model
|
||||
:type model: ``torch.nn.Module``
|
||||
:param optimizer: Optimizer for updating the parameters
|
||||
:type optimizer: ``torch.optim.Optimizer``
|
||||
:param criterion: Loss function for calculating loss
|
||||
:type criterion: ``torch.nn.modules.loss._Loss``, optional
|
||||
:param gradient_handlers: A list of gradient handler used in backward
|
||||
:type gradient_handlers: a list of ``BaseGradientHandler``, optional
|
||||
:param clip_grad_norm: The norm of gradient clipping
|
||||
:type clip_grad_norm: float, optional
|
||||
:param ophook_list: List of ophook
|
||||
:type ophook_list: list
|
||||
:param verbose: whether to display log info
|
||||
:type verbose: bool
|
||||
Args:
|
||||
model (``torch.nn.Module``): The neural network model.
|
||||
optimizer (``torch.optim.Optimizer``): Optimizer for updating the parameters.
|
||||
criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss.
|
||||
gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handler used in backward.
|
||||
clip_grad_norm (float, optional): The norm of gradient clipping.
|
||||
ophook_list (list): List of ophook.
|
||||
verbose (bool): whether to display log info.
|
||||
|
||||
Examples:
|
||||
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
|
||||
>>> model = ...
|
||||
>>> criterion = ...
|
||||
>>> optimizer = ...
|
||||
>>> train_dataloader = ...
|
||||
>>> engine, _, _, _ = colossalai.initialize(model, optimizer, criterion)
|
||||
>>> engine.train()
|
||||
>>> for inputs, labels in train_dataloader:
|
||||
>>> # set gradients to zero
|
||||
>>> engine.zero_grad()
|
||||
>>> # run forward pass
|
||||
>>> outputs = engine(inputs)
|
||||
>>> # compute loss value and run backward pass
|
||||
>>> loss = engine.criterion(outputs, labels)
|
||||
>>> engine.backward(loss)
|
||||
>>> # update parameters
|
||||
>>> engine.step()
|
||||
|
||||
The example of using Engine in training can be found in
|
||||
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_ and
|
||||
`Run resnet cifar10 with engine <https://github.com/hpcaitech/ColossalAI-Examples/blob/main/image/resnet/run_resnet_cifar10_with_engine.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@@ -113,10 +130,10 @@ class Engine:
|
||||
return self.optimizer.step()
|
||||
|
||||
def backward(self, loss: Tensor):
|
||||
"""Start backward propagation given the loss value computed by a loss function
|
||||
"""Start backward propagation given the loss value computed by a loss function.
|
||||
|
||||
:param loss: Loss value computed by a loss function
|
||||
:type loss: :class:`torch.Tensor`
|
||||
Args:
|
||||
loss (:class:`torch.Tensor`): Loss value computed by a loss function.
|
||||
"""
|
||||
ret = self.optimizer.backward(loss)
|
||||
for ophook in self._ophook_list:
|
||||
@@ -124,34 +141,22 @@ class Engine:
|
||||
return ret
|
||||
|
||||
def backward_by_grad(self, tensor, grad):
|
||||
"""Start backward propagation given the gradient of the output tensor
|
||||
"""Start backward propagation given the gradient of the output tensor.
|
||||
|
||||
:param tensor: Output tensor
|
||||
:type tensor: :class:`torch.Tensor`
|
||||
:param grad: Gradient passed back to the output
|
||||
:type grad: :class:`torch.Tensor`
|
||||
Args:
|
||||
tensor (:class:`torch.Tensor`): Output tensor.
|
||||
grad (:class:`torch.Tensor`): Gradient passed back to the output.
|
||||
"""
|
||||
ret = self.optimizer.backward_by_grad(tensor, grad)
|
||||
for ophook in self._ophook_list:
|
||||
ophook.post_iter()
|
||||
return ret
|
||||
|
||||
def calc_loss(self, *args, **kwargs):
|
||||
"""Compute the loss value
|
||||
|
||||
:param args: Args used in criterion function
|
||||
:param kwargs: Kwargs used in criterion function
|
||||
|
||||
:return: The loss value
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
return self.criterion(*args, **kwargs)
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
"""Run the forward step for the model
|
||||
"""Run the forward step for the model.
|
||||
|
||||
:return: Output the model
|
||||
:rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`
|
||||
Returns:
|
||||
Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`: Output of the model.
|
||||
"""
|
||||
return self.model(*args, **kwargs)
|
||||
|
||||
|
@@ -8,10 +8,9 @@ class BaseGradientHandler(ABC):
|
||||
"""A basic helper class to handle all-reduce operations of gradients across different parallel groups
|
||||
before optimization.
|
||||
|
||||
:param model: Model where the gradients accumulate
|
||||
:param optimizer: Optimizer for updating the parameters
|
||||
:type model: Module
|
||||
:type optimizer: Optimizer
|
||||
Args:
|
||||
model (Module): Model where the gradients accumulate.
|
||||
optimizer (Optimizer): Optimizer for updating the parameters.
|
||||
"""
|
||||
def __init__(self, model, optimizer):
|
||||
self._model = model
|
||||
|
@@ -17,12 +17,11 @@ import math
|
||||
class MemTracerOpHook(BaseOpHook):
|
||||
"""
|
||||
Collect GPU memory usage information
|
||||
:param warmup: This parameter indicates how many iterations to truncate before profiling, defaults to 50
|
||||
:type warmup: int
|
||||
:param refreshrate: This parameter decides the frequency of write file, defaults to 10
|
||||
:type refreshrate: int
|
||||
:param data_prefix: The prefix of the stats data file, defaults to "memstats"
|
||||
:type data_prefix: string
|
||||
|
||||
Args:
|
||||
warmup (int): This parameter indicates how many iterations to truncate before profiling, defaults to 50.
|
||||
refreshrate (int): This parameter decides the frequency of write file, defaults to 10.
|
||||
data_prefix (string): The prefix of the stats data file, defaults to "memstats".
|
||||
"""
|
||||
|
||||
def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"):
|
||||
|
@@ -15,8 +15,12 @@ class BaseSchedule(ABC):
|
||||
"""A basic helper class to control the process of training or evaluation.
|
||||
It is mainly composed of forward_backward_step for gradient backward and
|
||||
optimizer_step for parameters update.
|
||||
For the convenience to enable FP16, we aggreate all codes that contain the
|
||||
For the convenience to enable FP16, we aggregate all codes that contain the
|
||||
control of FP16 in class schedule.
|
||||
|
||||
Args:
|
||||
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
|
||||
and it will be executed in load_batch.
|
||||
"""
|
||||
|
||||
def __init__(self, batch_data_process_func: Callable = None):
|
||||
@@ -46,13 +50,12 @@ class BaseSchedule(ABC):
|
||||
"""Loads a batch from data iterator. It returns the data and labels which are
|
||||
already on the same GPU as the model.
|
||||
|
||||
:param data_iter: Data iterator from which get a batch of data
|
||||
:type data_iter: DataIter
|
||||
:param to_gpu: Whether the data should be moved to GPU
|
||||
:type to_gpu: bool, optional
|
||||
Args:
|
||||
data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
|
||||
to_gpu (bool, optional): Whether the data should be moved to GPU.
|
||||
|
||||
:return: (data, label)
|
||||
:rtype: (:class:`Tensor`, :class:`torch.Tensor`)
|
||||
Returns:
|
||||
Tuple (:class:`Tensor`, :class:`torch.Tensor`): A tuple of (data, label).
|
||||
"""
|
||||
if data_iter is None:
|
||||
raise RuntimeError('Dataloader is not defined.')
|
||||
@@ -87,16 +90,12 @@ class BaseSchedule(ABC):
|
||||
):
|
||||
"""The process function over a batch of dataset for training or evaluation.
|
||||
|
||||
:param engine: Colossalai training engine
|
||||
:type engine: colossalai.engine.Engine
|
||||
:param data_iter: Data iterator from which get a batch of data
|
||||
:type data_iter: DataIter
|
||||
:param forward_only: If True, the process won't include backward
|
||||
:type forward_only: bool
|
||||
:param return_loss: If False, the loss won't be returned
|
||||
:type return_loss: bool, optional
|
||||
:param return_output_label: If False, the output and label won't be returned
|
||||
:type return_output_label: bool, optional
|
||||
Args:
|
||||
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
|
||||
data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
|
||||
forward_only (bool): If True, the process won't include backward.
|
||||
return_loss (bool, optional): If False, the loss won't be returned.
|
||||
return_output_label (bool, optional): If False, the output and label won't be returned.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
@@ -15,6 +15,10 @@ class NonPipelineSchedule(BaseSchedule):
|
||||
During one process, it loads a batch of dataset and feeds it to the model.
|
||||
After getting the output and calculating the loss, it will use :meth:`step`
|
||||
to update the parameters if it is in training mode.
|
||||
|
||||
Args:
|
||||
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
|
||||
and it will be executed in load_batch.
|
||||
"""
|
||||
|
||||
def forward_backward_step(self,
|
||||
@@ -23,22 +27,19 @@ class NonPipelineSchedule(BaseSchedule):
|
||||
forward_only: bool = False,
|
||||
return_loss: bool = True,
|
||||
return_output_label: bool = True):
|
||||
"""The process function that loads loads a batch of dataset and feeds it to the model.
|
||||
"""The process function that loads a batch of dataset and feeds it to the model.
|
||||
The returned labels and loss will be None if :attr:`return_loss` is False.
|
||||
|
||||
:param engine: Model for training and inference
|
||||
:param data_iter: Data iterator of the dataloader, e.g. iter(dataloader)
|
||||
:param forward_only: If True, the model is run for the forward pass, else back propagation will be executed
|
||||
:param return_loss: Loss will be returned if True
|
||||
:param return_output_label: Output and label will be returned if True
|
||||
:type engine: Iterator
|
||||
:type data_iter: Iterator
|
||||
:type forward_only: bool, optional
|
||||
:type return_loss: bool, optional
|
||||
:type return_output_label: bool, optional
|
||||
Args:
|
||||
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
|
||||
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
|
||||
forward_only (bool, optional):
|
||||
If True, the model is run for the forward pass, else back propagation will be executed.
|
||||
return_loss (bool, optional): Loss will be returned if True.
|
||||
return_output_label (bool, optional): Output and label will be returned if True.
|
||||
|
||||
:return: (output, label, loss)
|
||||
:rtype: Tuple[:class:`torch.Tensor`]
|
||||
Returns:
|
||||
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
|
||||
"""
|
||||
assert forward_only or return_loss, \
|
||||
"The argument 'return_loss' has to be True when 'forward_only' is False, but got False."
|
||||
|
@@ -41,14 +41,13 @@ class PipelineSchedule(BaseSchedule):
|
||||
It uses non-interleaved 1F1B strategy. Other properties are similar to
|
||||
:class:`NonPipelineSchedule`.
|
||||
|
||||
:param num_microbatches: The number of microbatches
|
||||
:type num_microbatches: int
|
||||
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch`
|
||||
:type batch_data_process_func: Callable, optional
|
||||
:param tensor_shape: Specified shape in pipeline communication
|
||||
:type tensor_shape: torch.Size, optional
|
||||
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization
|
||||
:type scatter_gather_tensors: bool, optional
|
||||
Args:
|
||||
num_microbatches (int): The number of microbatches.
|
||||
batch_data_process_func (Callable, optional):
|
||||
The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
|
||||
tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
|
||||
scatter_gather_tensors (bool, optional):
|
||||
If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@@ -131,19 +130,14 @@ class PipelineSchedule(BaseSchedule):
|
||||
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
|
||||
Returns output tensor. This is a helper function and can be ignored by users.
|
||||
|
||||
:param engine: Your engine object
|
||||
:type engine: colossalai.engine.Engine
|
||||
:param input_tensor: Input tensor for this pipeline stage
|
||||
:type input_tensor: :class:`torch.Tensor`
|
||||
:param return_tensors: A list of tensors to return
|
||||
:type return_tensors: List[:class:`torch.Tensor`]
|
||||
:param return_output_label: Whether returns output labels
|
||||
:type return_output_label: bool, optional
|
||||
:param accum_loss: Where accumulated loss stores
|
||||
:type accum_loss: optional
|
||||
|
||||
:return: output or the loss value of the current pipeline stage
|
||||
:rtype: :class:`torch.Tensor`
|
||||
Args:
|
||||
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
|
||||
input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
|
||||
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
|
||||
return_output_label (bool, optional): Whether returns output labels.
|
||||
accum_loss (optional): Where the accumulated loss is stored.
|
||||
Returns:
|
||||
:class:`torch.Tensor`: output or the loss value of the current pipeline stage.
|
||||
"""
|
||||
data, label = self.load_micro_batch()
|
||||
output_tensor = self._call_engine(engine.model, input_tensor, data)
|
||||
@@ -173,17 +167,14 @@ class PipelineSchedule(BaseSchedule):
|
||||
Returns the gradients with respect to the input tensor (None if first stage).
|
||||
This is a helper function and can be ignored by users.
|
||||
|
||||
:param engine: your engine object
|
||||
:type engine: colossalai.engine.Engine
|
||||
:param input_tensor: input tensor for this pipeline stage
|
||||
:type input_tensor: :class:`torch.Tensor`
|
||||
:param output_tensor: output tensor for this pipeline stage
|
||||
:type output_tensor: :class:`torch.Tensor`
|
||||
:param output_tensor_grad: gradient of output tensor for this pipeline stage
|
||||
:type output_tensor_grad: :class:`torch.Tensor`
|
||||
Args:
|
||||
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
|
||||
input_tensor (:class:`torch.Tensor`): input tensor for this pipeline stage.
|
||||
output_tensor (:class:`torch.Tensor`): output tensor for this pipeline stage.
|
||||
output_tensor_grad (:class:`torch.Tensor`): gradient of output tensor for this pipeline stage.
|
||||
|
||||
:return: gradient of input tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
Returns:
|
||||
:class:`torch.Tensor`: gradient of input tensor.
|
||||
"""
|
||||
|
||||
# Retain the grad on the input_tensor.
|
||||
@@ -207,19 +198,16 @@ class PipelineSchedule(BaseSchedule):
|
||||
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
|
||||
Returns a tuple with losses if the last stage, an empty tuple otherwise.
|
||||
|
||||
:param engine: Your engine object
|
||||
:type engine: colossalai.engine.Engine
|
||||
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader)
|
||||
:type data_iter: Iterable
|
||||
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run.
|
||||
:type forward_only: bool
|
||||
:param return_loss: Whether returns the loss value. Default is true.
|
||||
:type return_loss: bool
|
||||
:param return_output_label: If False, the output and label won't be returned
|
||||
:type return_output_label: bool
|
||||
Args:
|
||||
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
|
||||
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
|
||||
forward_only (bool, optional):
|
||||
Whether run forward step only. Default is false. If true, no backward will be run.
|
||||
return_loss (bool, optional): Whether returns the loss value. Default is true.
|
||||
return_output_label (bool, optional): If False, the output and label won't be returned.
|
||||
|
||||
:return: (output, label, loss)
|
||||
:rtype: Tuple[:class:`torch.Tensor`]
|
||||
Returns:
|
||||
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
|
||||
"""
|
||||
|
||||
assert forward_only or return_loss, \
|
||||
@@ -354,16 +342,14 @@ class InterleavedPipelineSchedule(PipelineSchedule):
|
||||
It uses interleaved 1F1B strategy. Other properties are similar to
|
||||
:class:`NonPipelineSchedule`.
|
||||
|
||||
:param num_microbatches: The number of microbatches
|
||||
:type num_microbatches: int
|
||||
:param num_model_chunks: The number of model chunks
|
||||
:type num_model_chunks: int
|
||||
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch`
|
||||
:type batch_data_process_func: Callable, optional
|
||||
:param tensor_shape: Specified shape in pipeline communication
|
||||
:type tensor_shape: torch.Size, optional
|
||||
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization
|
||||
:type scatter_gather_tensors: bool, optional
|
||||
Args:
|
||||
num_microbatches (int): The number of microbatches.
|
||||
num_model_chunks (int): The number of model chunks.
|
||||
batch_data_process_func (Callable, optional):
|
||||
The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
|
||||
tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
|
||||
scatter_gather_tensors (bool, optional):
|
||||
If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
|
||||
"""
|
||||
assert num_microbatches % gpc.get_world_size(ParallelMode.PIPELINE) == 0, \
|
||||
'num_microbatches must be an integer multiple of pipeline parallel world size'
|
||||
@@ -408,6 +394,16 @@ class InterleavedPipelineSchedule(PipelineSchedule):
|
||||
"""Forward step for passed-in model. If it is the first stage, the input tensor
|
||||
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
|
||||
Returns output tensor. This is a helper function and can be ignored by users.
|
||||
|
||||
Args:
|
||||
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
|
||||
model_chunk_id (int): The id of model chunks.
|
||||
input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
|
||||
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
|
||||
return_output_label (bool, optional): Whether returns output labels.
|
||||
accum_loss (optional): Where the accumulated loss is stored.
|
||||
Returns:
|
||||
:class:`torch.Tensor`: output or the loss value of the current pipeline stage.
|
||||
"""
|
||||
data, label = self.load_micro_batch(model_chunk_id)
|
||||
output_tensor = self._call_engine(engine.model[model_chunk_id], input_tensor, data)
|
||||
@@ -435,18 +431,17 @@ class InterleavedPipelineSchedule(PipelineSchedule):
|
||||
"""Run interleaved 1F1B schedule (model split into model chunks), with
|
||||
communication between pipeline stages as needed.
|
||||
|
||||
Returns dictionary with losses if the last stage, empty dict otherwise.
|
||||
Args:
|
||||
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
|
||||
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
|
||||
forward_only (bool, optional):
|
||||
Whether run forward step only. Default is false. If true, no backward will be run.
|
||||
return_loss (bool, optional): Whether returns the loss value. Default is true.
|
||||
return_output_label (bool, optional): If False, the output and label won't be returned.
|
||||
|
||||
:param engine: Your engine object
|
||||
:type engine: colossalai.engine.Engine
|
||||
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader)
|
||||
:type data_iter: Iterable
|
||||
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run.
|
||||
:type forward_only: bool
|
||||
:param return_loss: Whether returns the loss value. Default is true.
|
||||
:type return_loss: bool
|
||||
:param return_output_label: If False, the output and label won't be returned
|
||||
:type return_output_label: bool
|
||||
Returns:
|
||||
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
|
||||
The loss would be returned only in the last stage.
|
||||
"""
|
||||
assert forward_only or return_loss, \
|
||||
'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
|
||||
|
Reference in New Issue
Block a user