Refactored docstring to google style

This commit is contained in:
Liang Bowen
2022-03-25 13:02:39 +08:00
committed by アマデウス
parent 53b1b6e340
commit ec5086c49c
94 changed files with 3389 additions and 2982 deletions

View File

@@ -19,20 +19,37 @@ class Engine:
:meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.
It controls a iteration in training.
:param model: The neural network model
:type model: ``torch.nn.Module``
:param optimizer: Optimizer for updating the parameters
:type optimizer: ``torch.optim.Optimizer``
:param criterion: Loss function for calculating loss
:type criterion: ``torch.nn.modules.loss._Loss``, optional
:param gradient_handlers: A list of gradient handler used in backward
:type gradient_handlers: a list of ``BaseGradientHandler``, optional
:param clip_grad_norm: The norm of gradient clipping
:type clip_grad_norm: float, optional
:param ophook_list: List of ophook
:type ophook_list: list
:param verbose: whether to display log info
:type verbose: bool
Args:
model (``torch.nn.Module``): The neural network model.
optimizer (``torch.optim.Optimizer``): Optimizer for updating the parameters.
criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss.
gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handler used in backward.
clip_grad_norm (float, optional): The norm of gradient clipping.
ophook_list (list): List of ophook.
verbose (bool): whether to display log info.
Examples:
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
>>> model = ...
>>> criterion = ...
>>> optimizer = ...
>>> train_dataloader = ...
>>> engine, _, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> engine.train()
>>> for inputs, labels in train_dataloader
>>> # set gradients to zero
>>> engine.zero_grad()
>>> # run forward pass
>>> outputs = engine(inputs)
>>> # compute loss value and run backward pass
>>> loss = engine.criterion(outputs, labels)
>>> engine.backward(loss)
>>> # update parameters
>>> engine.step()
The example of using Engine in training could be find in
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_. and
`Run resnet cifar10 with engine <https://github.com/hpcaitech/ColossalAI-Examples/blob/main/image/resnet/run_resnet_cifar10_with_engine.py>`_.
"""
def __init__(self,
@@ -113,10 +130,10 @@ class Engine:
return self.optimizer.step()
def backward(self, loss: Tensor):
"""Start backward propagation given the loss value computed by a loss function
"""Start backward propagation given the loss value computed by a loss function.
:param loss: Loss value computed by a loss function
:type loss: :class:`torch.Tensor`
Args:
loss (:class:`torch.Tensor`): Loss value computed by a loss function.
"""
ret = self.optimizer.backward(loss)
for ophook in self._ophook_list:
@@ -124,34 +141,22 @@ class Engine:
return ret
def backward_by_grad(self, tensor, grad):
"""Start backward propagation given the gradient of the output tensor
"""Start backward propagation given the gradient of the output tensor.
:param tensor: Output tensor
:type tensor: :class:`torch.Tensor`
:param grad: Gradient passed back to the output
:type grad: :class:`torch.Tensor`
Args:
tensor (:class:`torch.Tensor`): Output tensor.
grad (:class:`torch.Tensor`): Gradient passed back to the output.
"""
ret = self.optimizer.backward_by_grad(tensor, grad)
for ophook in self._ophook_list:
ophook.post_iter()
return ret
def calc_loss(self, *args, **kwargs):
"""Compute the loss value
:param args: Args used in criterion function
:param kwargs: Kwargs used in criterion function
:return: The loss value
:rtype: :class:`torch.Tensor`
"""
return self.criterion(*args, **kwargs)
def __call__(self, *args, **kwargs):
"""Run the forward step for the model
"""Run the forward step for the model.
:return: Output the model
:rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`
Returns:
Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`: Output of the model.
"""
return self.model(*args, **kwargs)

View File

@@ -8,10 +8,9 @@ class BaseGradientHandler(ABC):
"""A basic helper class to handle all-reduce operations of gradients across different parallel groups
before optimization.
:param model: Model where the gradients accumulate
:param optimizer: Optimizer for updating the parameters
:type model: Module
:type optimizer: Optimizer
Args:
model (Module): Model where the gradients accumulate.
optimizer (Optimizer): Optimizer for updating the parameters.
"""
def __init__(self, model, optimizer):
self._model = model

View File

@@ -17,12 +17,11 @@ import math
class MemTracerOpHook(BaseOpHook):
"""
Collect GPU memory usage information
:param warmup: This parameter indicates how many iterations to truncate before profiling, defaults to 50
:type warmup: int
:param refreshrate: This parameter decides the frequency of write file, defaults to 10
:type refreshrate: int
:param data_prefix: The prefix of the stats data file, defaults to "memstats"
:type data_prefix: string
Args:
warmup (int): This parameter indicates how many iterations to truncate before profiling, defaults to 50.
refreshrate (int): This parameter decides the frequency of write file, defaults to 10.
data_prefix (string): The prefix of the stats data file, defaults to "memstats".
"""
def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"):

View File

@@ -15,8 +15,12 @@ class BaseSchedule(ABC):
"""A basic helper class to control the process of training or evaluation.
It mainly composes of forward_backward_step for gradient backward and
optimizer_step for parameters update.
For the convenience to enable FP16, we aggreate all codes that contain the
For the convenience to enable FP16, we aggregate all codes that contain the
control of FP16 in class schedule.
Args:
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
and it will be executed in load_batch.
"""
def __init__(self, batch_data_process_func: Callable = None):
@@ -46,13 +50,12 @@ class BaseSchedule(ABC):
"""Loads a batch from data iterator. It returns the data and labels which are
already in the same GPU as where the model's.
:param data_iter: Data iterator from which get a batch of data
:type data_iter: DataIter
:param to_gpu: Whether the data should be moved to GPU
:type to_gpu: bool, optional
Args:
data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
to_gpu (bool, optional): Whether the data should be moved to GPU
:return: (data, label)
:rtype: (:class:`Tensor`, :class:`torch.Tensor`)
Returns:
Tuple (:class:`Tensor`, :class:`torch.Tensor`): A tuple of (data, label).
"""
if data_iter is None:
raise RuntimeError('Dataloader is not defined.')
@@ -87,16 +90,12 @@ class BaseSchedule(ABC):
):
"""The process function over a batch of dataset for training or evaluation.
:param engine: Colossalai training engine
:type engine: colossalai.engine.Engine
:param data_iter: Data iterator from which get a batch of data
:type data_iter: DataIter
:param forward_only: If True, the process won't include backward
:type forward_only: bool
:param return_loss: If False, the loss won't be returned
:type return_loss: bool, optional
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool, optional
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
forward_only (bool): If True, the process won't include backward.
return_loss (bool, optional): If False, the loss won't be returned.
return_output_label (bool, optional): If False, the output and label won't be returned.
"""
pass

View File

@@ -15,6 +15,10 @@ class NonPipelineSchedule(BaseSchedule):
During one process, it loads a batch of dataset and feeds it to the model.
After getting the output and calculating the loss, it will use :meth:`step`
to update the parameters if it is in training mode.
Args:
batch_data_process_func (Callable, optional): The preprocessing function which receives a batch of data,
and it will be executed in load_batch.
"""
def forward_backward_step(self,
@@ -23,22 +27,19 @@ class NonPipelineSchedule(BaseSchedule):
forward_only: bool = False,
return_loss: bool = True,
return_output_label: bool = True):
"""The process function that loads loads a batch of dataset and feeds it to the model.
"""The process function that loads a batch of dataset and feeds it to the model.
The returned labels and loss will None if :attr:`return_loss` is False.
:param engine: Model for training and inference
:param data_iter: Data iterator of the dataloader, e.g. iter(dataloader)
:param forward_only: If True, the model is run for the forward pass, else back propagation will be executed
:param return_loss: Loss will be returned if True
:param return_output_label: Output and label will be returned if True
:type engine: Iterator
:type data_iter: Iterator
:type forward_only: bool, optional
:type return_loss: bool, optional
:type return_output_label: bool, optional
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
If True, the model is run for the forward pass, else back propagation will be executed.
return_loss (bool, optional): Loss will be returned if True.
return_output_label (bool, optional): Output and label will be returned if True.
:return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
Returns:
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
"""
assert forward_only or return_loss, \
"The argument 'return_loss' has to be True when 'forward_only' is False, but got False."

View File

@@ -41,14 +41,13 @@ class PipelineSchedule(BaseSchedule):
It uses non-interleaved 1F1B strategy. Other properties are similar as
:class:`NonPipelineSchedule`.
:param num_microbatches: The number of microbatches
:type num_microbatches: int
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch`
:type batch_data_process_func: Callable, optional
:param tensor_shape: Specified shape in pipeline communication
:type tensor_shape: torch.Size, optional
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization
:type scatter_gather_tensors: bool, optional
Args:
num_microbatches (int): The number of microbatches.
batch_data_process_func (Callable, optional):
The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
scatter_gather_tensors (bool, optional):
If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
"""
def __init__(self,
@@ -131,19 +130,14 @@ class PipelineSchedule(BaseSchedule):
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users.
:param engine: Your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: Input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param return_tensors: A list of tensors to return
:type return_tensors: List[:class:`torch.Tensor`]
:param return_output_label: Whether returns output labels
:type return_output_label: bool, optional
:param accum_loss: Where accumulated loss stores
:type accum_loss: optional
:return: output or the loss value of the current pipeline stage
:rtype: :class:`torch.Tensor`
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
return_output_label (bool, optional): Whether returns output labels.
accum_loss (optional): Where accumulated loss stores.
Returns:
:class:`torch.Tensor`: output or the loss value of the current pipeline stage.
"""
data, label = self.load_micro_batch()
output_tensor = self._call_engine(engine.model, input_tensor, data)
@@ -173,17 +167,14 @@ class PipelineSchedule(BaseSchedule):
Returns the gradients with respect to the input tensor (None if first stage).
This is a helper function and can be ignored by users.
:param engine: your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param output_tensor: output tensor for this pipeline stage
:type output_tensor: :class:`torch.Tensor`
:param output_tensor_grad: gradient of output tensor for this pipeline stage
:type output_tensor_grad: :class:`torch.Tensor`
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
input_tensor (:class:`torch.Tensor`): input tensor for this pipeline stage.
output_tensor (:class:`torch.Tensor`): output tensor for this pipeline stage.
output_tensor_grad (:class:`torch.Tensor`): gradient of output tensor for this pipeline stage.
:return: gradient of input tensor
:rtype: :class:`torch.Tensor`
Returns:
:class:`torch.Tensor`: gradient of input tensor.
"""
# Retain the grad on the input_tensor.
@@ -207,19 +198,16 @@ class PipelineSchedule(BaseSchedule):
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
Returns a tuple with losses if the last stage, an empty tuple otherwise.
:param engine: Your engine object
:type engine: colossalai.engine.Engine
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader)
:type data_iter: Iterable
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run.
:type forward_only: bool
:param return_loss: Whether returns the loss value. Default is true.
:type return_loss: bool
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether run forward step only. Default is false. If true, no backward will be run.
return_loss (bool, optional): Whether returns the loss value. Default is true.
return_output_label (bool, optional): If False, the output and label won't be returned.
:return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
Returns:
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
"""
assert forward_only or return_loss, \
@@ -354,16 +342,14 @@ class InterleavedPipelineSchedule(PipelineSchedule):
It uses interleaved 1F1B strategy. Other properties are similar as
:class:`NonPipelineSchedule`.
:param num_microbatches: The number of microbatches
:type num_microbatches: int
:param num_model_chunks: The number of model chunks
:type num_model_chunks: int
:param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch`
:type batch_data_process_func: Callable, optional
:param tensor_shape: Specified shape in pipeline communication
:type tensor_shape: torch.Size, optional
:param scatter_gather_tensors: If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization
:type scatter_gather_tensors: bool, optional
Args:
num_microbatches (int): The number of microbatches.
num_model_chunks (int): The number of model chunks.
batch_data_process_func (Callable, optional):
The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
scatter_gather_tensors (bool, optional):
If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
"""
assert num_microbatches % gpc.get_world_size(ParallelMode.PIPELINE) == 0, \
'num_microbatches must be an integer multiple of pipeline parallel world size'
@@ -408,6 +394,16 @@ class InterleavedPipelineSchedule(PipelineSchedule):
"""Forward step for passed-in model. If it is the first stage, the input tensor
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users.
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
model_chunk_id (int): The id of model chunks.
input_tensor (:class:`torch.Tensor`): Input tensor for this pipeline stage.
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
return_output_label (bool, optional): Whether returns output labels.
accum_loss (optional): Where accumulated loss stores.
Returns:
:class:`torch.Tensor`: output or the loss value of the current pipeline stage.
"""
data, label = self.load_micro_batch(model_chunk_id)
output_tensor = self._call_engine(engine.model[model_chunk_id], input_tensor, data)
@@ -435,18 +431,17 @@ class InterleavedPipelineSchedule(PipelineSchedule):
"""Run interleaved 1F1B schedule (model split into model chunks), with
communication between pipeline stages as needed.
Returns dictionary with losses if the last stage, empty dict otherwise.
Args:
engine (colossalai.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether run forward step only. Default is false. If true, no backward will be run.
return_loss (bool, optional): Whether returns the loss value. Default is true.
return_output_label (bool, optional): If False, the output and label won't be returned.
:param engine: Your engine object
:type engine: colossalai.engine.Engine
:param data_iter: Dataloader as the form of an iterator, obtained by calling iter(dataloader)
:type data_iter: Iterable
:param forward_only: Whether run forward step only. Default is false. If true, no backward will be run.
:type forward_only: bool
:param return_loss: Whether returns the loss value. Default is true.
:type return_loss: bool
:param return_output_label: If False, the output and label won't be returned
:type return_output_label: bool
Returns:
Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
The loss would be returned only in the last stage.
"""
assert forward_only or return_loss, \
'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'