Refactored docstrings to Google style

Liang Bowen
2022-03-25 13:02:39 +08:00
committed by アマデウス
parent 53b1b6e340
commit ec5086c49c
94 changed files with 3389 additions and 2982 deletions

View File

@@ -17,18 +17,46 @@ from colossalai.trainer.hooks import BaseHook
class Trainer:
"""This a class tending for easy deployments of users' training and evaluation instead of
r"""This is a class tending for easy deployments of users' training and evaluation instead of
writing their own scripts. It is similar with ``ignite.engine`` and ``keras.engine``, but is
called `Trainer`.
:param engine: Engine responsible for the process function
:type engine: :class:`Engine`
:param schedule: Schedule responsible for forward and backward steps
:type schedule: :class:`BaseSchedule`, optional
:param timer: Timer used to monitor the whole training
:type timer: :class:`MultiTimer`, optional
:param logger: Logger used to record the whole training
:type logger: :class:`colossalai.logging.DistributedLogger`, optional
Args:
engine (:class:`Engine`): Engine responsible for the process function.
schedule (:class:`BaseSchedule`, optional): Schedule responsible for forward and backward steps.
timer (:class:`MultiTimer`, optional): Timer used to monitor the whole training.
logger (:class:`colossalai.logging.DistributedLogger`, optional): Logger used to record the training log.
Note:
When `schedule` is None, ``NonPipelineSchedule`` will be used. If you would like to use pipeline parallelism,
you should set `schedule` to ``PipelineSchedule`` or ``InterleavedPipelineSchedule``.
Examples:
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
>>> model = ...
>>> criterion = ...
>>> optimizer = ...
>>> train_dataloader = ...
>>> # Initialize your engine, train_dataloader, test_dataloader, lr_scheduler
>>> engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> # Begin your training process
>>> timer = ...
>>> logger = ...
>>> trainer = Trainer(engine=engine, logger=logger, schedule=schedule, timer=timer)
>>> # add hooks you would like to use here.
>>> hook_list = []
>>> trainer.fit(
>>> train_dataloader=train_dataloader,
>>> epochs=gpc.config.NUM_EPOCHS,
>>> test_interval=1,
>>> hooks=hook_list,
>>> display_progress=True,
>>> return_output_label=False
>>> )
More examples and details can be found in
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_
and `ColossalAI-Examples <https://github.com/hpcaitech/ColossalAI-Examples/tree/main>`_.
"""
def __init__(
self,
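As a hedged companion to the docstring example above, the validation side of the same API; `test_dataloader` is an assumption, built like `train_dataloader`:

# continuing the docstring example; test_dataloader is assumed to exist
trainer.fit(train_dataloader=train_dataloader,
            test_dataloader=test_dataloader,
            test_interval=1,
            epochs=gpc.config.NUM_EPOCHS,
            hooks=hook_list,
            display_progress=True)

# evaluation can also be run on its own after training
trainer.evaluate(test_dataloader=test_dataloader, hooks=hook_list, display_progress=True)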
@@ -108,20 +136,19 @@ class Trainer:
def _set_current_step(self, epoch: int):
"""Sets current step number.
:param epoch: Step number to be set
:type epoch: int
Args:
epoch (int): Epoch number; the current step is set to ``epoch * steps_per_epoch``.
"""
self._cur_step = epoch * self._steps_per_epoch
def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
"""Call timer funciton with a given timer name.
:param action: Function to be called on timer
:type action: str
:param item: Name of the timer
:type item: str
:param args: args used for action function
:param kwargs: kwargs used for action function
Args:
action (str): Function to be called on timer.
item (str): Name of the timer.
args (list): Positional arguments used by the action function.
kwargs (dict): Keyword arguments used by the action function.
"""
if self._timer is not None:
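The hunk cuts the body off here; a minimal sketch of the dispatch-by-name pattern the docstring implies, assuming ``MultiTimer`` exposes methods such as ``start`` and ``stop``:

def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
    if self._timer is not None:
        # e.g. action='start' resolves to self._timer.start(item)
        getattr(self._timer, action)(item, *args, **kwargs)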
@@ -134,10 +161,9 @@ class Trainer:
def _call_hooks(self, func, output=None):
"""Calls specific hooks in the current time point.
:param func: A string represents the time point
:param output: Output of the model after running a iteration or None in any other time points
:type func: str
:type output: optional
Args:
func (str): A string that represents the time point.
output (Any, optional): Output of the model after running an iteration or None in any other time points.
"""
# Only after iter hook will receive output
for hook in self.hooks:
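`func` names a method on :class:`BaseHook`; a hedged sketch of the loop body this docstring describes:

for hook in self.hooks:
    if output is None:
        getattr(hook, func)(self)
    else:
        # only the after-iteration time points receive the model output
        getattr(hook, func)(self, *output)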
@@ -273,25 +299,17 @@ class Trainer:
display_progress: bool = False,
return_output_label: bool = True,
):
"""Trains the model to fit training data.
r"""Trains the model to fit training data.
:param train_dataloader: DataLoader in training
:param epochs: Maximum number of epoches
:param max_steps: Maximum number of running iterations
:param test_dataloader: DataLoader in testing
:param test_interval: Interval of testing
:param hooks: A list of hooks used in training
:param display_progress: If True, the training progress will be printed
:param return_output_label: If True, the output of model and the label will be returned
:type train_dataloader: DataLoader
:type epochs: int
:type max_steps: int, optional
:type test_dataloader: DataLoader, optional
:type test_interval: int, optional
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool, optional
Args:
train_dataloader (:class:`torch.utils.data.DataLoader`): DataLoader for training.
epochs (int): Maximum number of epochs.
max_steps (int, optional): Maximum number of running iterations.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): DataLoader for validation.
test_interval (int, optional): Interval of validation.
hooks (list[`BaseHook <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/trainer/hooks>`_],
optional): A list of hooks used in training.
display_progress (bool, optional): If True, a progress bar will be displayed.
return_output_label (bool, optional): If True, the output of the model and the label will be returned.
"""
# set epochs and steps, consider gradient accumulation
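A hedged sketch of the accounting that comment refers to; the ``gradient_accumulation`` config field and the rounding are assumptions, not code from this diff:

# several dataloader batches form one effective optimizer step
grad_accum_size = getattr(gpc.config, 'gradient_accumulation', 1)
steps_per_epoch = len(train_dataloader) // grad_accum_size
if max_steps is not None:
    # cap the epoch budget by the step budget, rounding up
    epochs = min(epochs, (max_steps + steps_per_epoch - 1) // steps_per_epoch)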
@@ -374,15 +392,12 @@ class Trainer:
):
"""Evaluates the model with testing data.
:param test_dataloader: DataLoader in testing
:param hooks: A list of hooks used in evaluation
:param display_progress: If True, the evaluation progress will be printed
:param return_output_label: If True, the output of model and the label will be returned
:type test_dataloader: DataLoader
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool
Args:
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): DataLoader for testing.
hooks (list, optional): A list of hooks used in evaluation. Defaults to None.
display_progress (bool, optional): If True, the evaluation progress will be printed. Defaults to False.
return_output_label (bool, optional): If True, the output of model and the label
will be returned. Defaults to True.
"""
# set display
display_progress = self._should_display_progress(display_progress)
@@ -418,10 +433,11 @@ class Trainer:
def predict(self, data: Union[Tensor, List[Tensor]]):
"""Uses trained model to make a prediction for a tensor or a tensor list.
:param data: Data as the input
:type data: Union[Tensor, List[Tensor]
:return: The output of model as the prediction
:rtype: Tensor
Args:
data (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Data as the input.
Returns:
:class:`torch.Tensor`: The output of the model as the prediction.
"""
# predict without labels
if isinstance(data, (list, tuple)):
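Usage is a one-liner once training has finished; the input shape below is illustrative only:

import torch

sample = torch.randn(1, 3, 224, 224)   # hypothetical image batch
prediction = trainer.predict(sample)   # forwards the data through the engine without labels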

View File

@@ -40,14 +40,11 @@ class BaseHook(ABC):
def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
"""Actions after running a training iteration.
:param trainer: Trainer which is using this hook
:type trainer: :class:`Trainer`
:param output: Output of the model
:type output: torch.Tensor
:param label: Labels of the input data
:type label: torch.Tensor
:param loss: Loss between the output and input data
:type loss: torch.Tensor
Args:
trainer (:class:`Trainer`): Trainer which is using this hook.
output (:class:`torch.Tensor`): Output of the model.
label (:class:`torch.Tensor`): Labels of the input data.
loss (:class:`torch.Tensor`): Loss between the output and input data.
"""
pass
@@ -89,24 +86,21 @@ class BaseHook(ABC):
def after_test_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
"""Actions after running a testing iteration.
:param trainer: Trainer which is using this hook
:type trainer: :class:`Trainer`
:param output: Output of the model
:type output: Tensor
:param label: Labels of the input data
:type label: Tensor
:param loss: Loss between the output and input data
:type loss: Tensor
Args:
trainer (:class:`Trainer`): Trainer which is using this hook.
output (:class:`torch.Tensor`): Output of the model.
label (:class:`torch.Tensor`): Labels of the input data.
loss (:class:`torch.Tensor`): Loss between the output and input data.
"""
pass
def init_runner_states(self, trainer, key, val):
"""Initializes trainer's state.
:param trainer: Trainer which is using this hook
:type trainer: :class:`Trainer`
:param key: Key of reseting state
:param val: Value of reseting state
Args:
trainer (:class:`Trainer`): Trainer which is using this hook.
key: Key of the state to be reset.
val: Value of the state to be reset.
"""
if key not in trainer.states:
trainer.states[key] = val
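To illustrate the intended use, a hedged sketch of a custom hook that seeds a counter in the shared state; the hook class, state key, and the use of ``after_hook_is_attached`` as the attach-time callback are assumptions:

from colossalai.trainer.hooks import BaseHook

class IterationCounterHook(BaseHook):
    def __init__(self, priority: int = 10):
        super().__init__(priority)

    def after_hook_is_attached(self, trainer):
        # only seeds the value if no other hook has claimed the key
        self.init_runner_states(trainer, 'iteration_count', 0)

    def after_train_iter(self, trainer, output, label, loss):
        trainer.states['iteration_count'] += 1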

View File

@@ -16,14 +16,13 @@ from ._lr_scheduler_hook import LRSchedulerHook
class SaveCheckpointHook(BaseHook):
"""Saves the model by interval in training process.
:param interval: Saving interval, defaults to 1
:type interval: int, optional
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
:type checkpoint_dir: str, optional
:param suffix: Saving suffix of the file, defaults to ''
:type suffix: str, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
interval (int, optional): Saving interval, defaults to 1.
checkpoint_dir (str, optional): Directory for saving the checkpoint, defaults to None.
suffix (str, optional): Saving suffix of the file, defaults to ''.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(self,
@@ -71,18 +70,17 @@ class SaveCheckpointHook(BaseHook):
class LoadCheckpointHook(BaseHook):
"""Loads the model before training process.
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
:type checkpoint_dir: str, optional
:param epoch: Epoch number to be set, defaults to -1
:type epoch: str, optional
:param finetune: Whether allows to load a part of the model, defaults to False
:type finetune: bool, optional
:param strict: Whether loads a model that has the same shape of parameters, defaults to False
:type strict: bool, optional
:param suffix: Suffic, defaults to ''
:type suffix: str, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
Args:
checkpoint_dir (str, optional): Directory of the checkpoint to load, defaults to None.
epoch (int, optional): Epoch number of the checkpoint to load, defaults to -1.
An epoch of -1 means loading the latest checkpoint.
finetune (bool, optional): Whether to allow loading only part of the model, defaults to False.
strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint
match the names of parameters and buffers in the model, defaults to False.
suffix (str, optional): Suffix of the checkpoint file path, defaults to ''.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(self,
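Taken together, the two hooks cover a save-and-resume workflow; a hedged usage sketch in which the directory and interval are illustrative:

from colossalai.trainer import hooks

hook_list = [
    hooks.LoadCheckpointHook(checkpoint_dir='./ckpt', epoch=-1),  # -1 loads the latest checkpoint
    hooks.SaveCheckpointHook(interval=5, checkpoint_dir='./ckpt'),
]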

View File

@@ -25,13 +25,14 @@ def _format_number(val, prec=5):
class LogByEpochHook(BaseHook):
"""Hook to log by epoch
"""Hook to log by epoch.
:param logger: Logger for the log
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
:type priority: int, optional
Args:
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(self,
@@ -48,10 +49,12 @@ class LogByEpochHook(BaseHook):
@HOOKS.register_module
class LogMetricByStepHook(BaseHook):
"""Hook to log metric by step
"""Hook to log metric by step.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(self, priority: int = 10):
@@ -74,11 +77,12 @@ class LogMetricByStepHook(BaseHook):
class LogMetricByEpochHook(LogByEpochHook):
"""Specialized hook to record the metric to log.
:param logger: Logger for the log
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(self,
@@ -116,14 +120,14 @@ class LogMetricByEpochHook(LogByEpochHook):
class TensorboardHook(BaseHook):
"""Specialized hook to record the metric to Tensorboard.
:param log_dir: Directory of log
:type log_dir: str
:param ranks: Ranks of processors
:type ranks: typing.List
:param parallel_mode: Parallel mode, defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
log_dir (str): Directory of log.
ranks (list): Ranks of processors.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer,
defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(self,
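A hedged sketch combining the logging hooks documented in this file; the log directory and rank list are illustrative:

from colossalai.trainer import hooks

hook_list += [
    hooks.LogMetricByEpochHook(logger=logger, interval=1),
    hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),  # write TensorBoard events from rank 0 only
]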
@@ -200,18 +204,15 @@ class TensorboardHook(BaseHook):
class LogTimingByEpochHook(LogByEpochHook):
"""Specialized hook to write timing record to log.
:param timer: Timer for the hook
:type timer: :class:`colossalai.utils.MultiTimer`
:param logger: Logger for the log
:type logger: :class:`colossalai.logging.DistributedLogger`
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param log_eval: Whether writes in evaluation, defaults to True
:type log_eval: bool, optional
:param ignore_num_train_steps: Number of training steps to ignore, defaults to 0
:type ignore_num_train_steps: int, optional
Args:
timer (:class:`colossalai.utils.MultiTimer`): Timer for the hook.
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
log_eval (bool, optional): Whether to write logs during evaluation, defaults to True.
ignore_num_train_steps (int, optional): Number of training steps to ignore, defaults to 0.
"""
def __init__(self,
@@ -270,14 +271,13 @@ class LogTimingByEpochHook(LogByEpochHook):
class LogMemoryByEpochHook(LogByEpochHook):
"""Specialized Hook to write memory usage record to log.
:param logger: Logger for the log
:type logger: colossalai.logging.DistributedLogger
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param log_eval: Whether writes in evaluation, defaults to True
:type log_eval: bool, optional
Args:
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
log_eval (bool, optional): Whether to write logs during evaluation, defaults to True.
"""
def __init__(self,

View File

@@ -6,15 +6,17 @@ from ._metric_hook import LearningRateMetric, MetricHook
@HOOKS.register_module
class LRSchedulerHook(MetricHook):
"""Build LR scheduler
r"""Build LR scheduler for trainer.
:param lr_scheduler: LR scheduler
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch
:type by_epoch: bool
:param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True`
:type store_lr_in_state: bool, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
:type priority: int, optional
Args:
lr_scheduler (:class:`colossalai.nn.lr_scheduler`): A learning rate scheduler from
``colossalai.nn.lr_scheduler``; more details can be found in
`lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_.
by_epoch (bool): If `True`, the LR will be scheduled every epoch. Otherwise, the LR will be scheduled every batch.
store_lr_in_state (bool, optional): If `True`, store the learning rate in each state, defaults to `True`.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(
self,
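A hedged sketch of wiring a scheduler into the hook; ``CosineAnnealingLR`` and its ``total_steps`` argument are assumptions about ``colossalai.nn.lr_scheduler``:

from colossalai.nn.lr_scheduler import CosineAnnealingLR
from colossalai.trainer import hooks

lr_scheduler = CosineAnnealingLR(optimizer, total_steps=gpc.config.NUM_EPOCHS)
hook_list.append(hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True))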

View File

@@ -17,13 +17,13 @@ from ._base_hook import BaseHook
class Metric(ABC):
"""A basic class of metric collectors. It collects a specific
metric during training or evaluation and it's always used with
metric during training or evaluation and would always be used with
:class:`MetricHook` to help it update its states and show the
metric. So please use the corresponding hook class to make the metric
collector work.
:param epoch_only: Whether the metric only read for the full epoch
:type epoch_only: bool
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
"""
def __init__(self, epoch_only: bool):
@@ -80,8 +80,8 @@ class Metric(ABC):
class LossMetric(Metric):
"""A metric collector for loss.
:param epoch_only: Whether the metric only read for the full epoch
:type epoch_only: bool
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
"""
def __init__(self, epoch_only):
@@ -101,7 +101,8 @@ class LossMetric(Metric):
"""Updates :attr:`last_step_loss` and :attr:`accum_loss` with current loss.
It expects the output has loss.
:param loss: Current loss of the output
Args:
loss (:class:`torch.Tensor`): Current loss of the output.
"""
# expect output to be logits, label and loss
loss_ = loss.detach()
@@ -132,10 +133,9 @@ class LossMetric(Metric):
class LearningRateMetric(Metric):
"""A metric collector for learning rate.
:param epoch_only: Whether the metric only read for the full epoch
:type epoch_only: bool
:param initial_lr: Initial learning rate, defaults to 0.0
:type initial_lr: float, optional
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
initial_lr (float, optional): Initial learning rate, defaults to 0.0.
"""
def __init__(self, epoch_only: bool, initial_lr: float = 0.):
@@ -163,10 +163,9 @@ class AccuracyMetric(Metric):
"""A metric collector for accuracy. It only works for classification
tasks.
:param epoch_only: Whether the metric only read for the full epoch
:type epoch_only: bool
:param accuracy_func: Accuracy function for the classification task
:type accuracy_func: :class:`typing.Callable`
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
"""
def __init__(self, epoch_only: bool, accuracy_func: Callable):
@@ -187,9 +186,10 @@ class AccuracyMetric(Metric):
"""Updates last step accuracy and accumulated accuracy with current logits
and labels. It expects the output has logits and labels.
:param logits: The logits output of the model
:param targets: Real labels of the dataset
:param batch_size: Batch size of the task
Args:
logits (:class:`torch.Tensor`): The logits output of the model.
targets (:class:`torch.Tensor`): Real labels of the dataset.
batch_size (int): Batch size of the task.
"""
if isinstance(logits, (list, tuple)):
logits = logits[0]
@@ -224,8 +224,10 @@ class MetricHook(BaseHook):
update their states. Others are used to display and
record the metric.
:param priority: Priority in the printing, hooks with small priority will be printed in front
:type priority: int
Args:
priority (int): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(
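The division of labour described above, as a hedged sketch; the ``trainer.states['metrics']`` layout and the attach-time callback are assumptions modelled on the loss and accuracy hooks below:

class MyLossHook(MetricHook):
    def __init__(self, priority: int = 0):
        super().__init__(priority)

    def after_hook_is_attached(self, trainer):
        # the hook owns a Metric instance and exposes it through the shared state
        self.metric = LossMetric(epoch_only=False)
        trainer.states['metrics']['train']['Loss'] = self.metric

    def after_train_iter(self, trainer, output, label, loss):
        self.metric.update(loss)  # keep the collector in sync each iteration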
@@ -244,8 +246,10 @@ class MetricHook(BaseHook):
class LossHook(MetricHook):
"""Specialized hook class for :class:`Loss`.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
Args:
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(self, priority: int = 0):
@@ -283,10 +287,11 @@ class LossHook(MetricHook):
class AccuracyHook(MetricHook):
"""Specialized hook class for :class:`Accuracy`.
:param accuracy_func: Priority in the printing, hooks with small priority will be printed in front
:type accuracy_func: typing.Callable
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
Args:
accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(self, accuracy_func: Callable, priority: int = 0):
@@ -314,8 +319,8 @@ class AccuracyHook(MetricHook):
class ThroughputMetric(Metric):
"""Metric for :class:`Throughput`.
:param epoch_only: epoch only
:type epoch_only: bool
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
ignored_steps (int, optional): The number of initial training steps to ignore, defaults to 0.
"""
def __init__(self, epoch_only: bool, ignored_steps: int = 0):
super().__init__(epoch_only=epoch_only)
@@ -360,10 +365,13 @@ class ThroughputMetric(Metric):
@HOOKS.register_module
class ThroughputHook(MetricHook):
"""Specialized hook class for :class:`Throughput`.
"""Specialized hook class for :class:`Throughput`. Hook to measure execution throughput (samples/sec).
:param priority: priority of throughput hook, defaults to 10
:type priority: int, optional
Args:
ignored_steps (int, optional): The number of initial training steps to ignore, defaults to 0.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share the same priority, the order of printing
depends on their order in the hook list.
"""
def __init__(self, ignored_steps: int = 0, priority: int = 10):
super().__init__(priority)
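Finally, a one-line hedged usage sketch; the warm-up count is illustrative:

hook_list.append(hooks.ThroughputHook(ignored_steps=10))  # skip the first 10 warm-up iterations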