Update layer integration documentation (#108)

Update the documentation of layer integration

Update _log_hook.py

Update _operation.py
Authored by BoxiangW on 2022-01-10 18:05:58 +08:00, committed by GitHub
parent 3a61d785b5
commit 4a3d3446b0
25 changed files with 1020 additions and 98 deletions

View File

@@ -10,10 +10,9 @@ class BaseHook(ABC):
"""This class allows users to add desired actions in specific time points
during training or evaluation.
:param trainer: Trainer attached with current hook
:param priority: Priority in the printing, hooks with small priority will be printed in front
:type trainer: Trainer
:type priority: int
:param trainer: Trainer attached with current hook
"""
def __init__(self, priority: int) -> None:
@@ -43,11 +42,11 @@ class BaseHook(ABC):
"""Actions after running a training iteration.
:param output: Output of the model
:type output: torch.Tensor
:param label: Labels of the input data
:type label: torch.Tensor
:param loss: Loss between the output and input data
:type output: Tensor
:type label: Tensor
:type loss: Tensor
:type loss: torch.Tensor
"""
pass
@@ -90,10 +89,10 @@ class BaseHook(ABC):
"""Actions after running a testing iteration.
:param output: Output of the model
:param label: Labels of the input data
:param loss: Loss between the output and input data
:type output: Tensor
:param label: Labels of the input data
:type label: Tensor
:param loss: Loss between the output and input data
:type loss: Tensor
"""
pass
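For context, a rough sketch of a custom hook built on this interface. The import paths, the exact after_train_iter signature (whether the trainer is also passed in) and the SimpleLossLogHook name are assumptions, not part of this commit:

from colossalai.logging import get_dist_logger
from colossalai.trainer.hooks import BaseHook


class SimpleLossLogHook(BaseHook):
    """Hypothetical hook that prints the training loss every `interval` iterations."""

    def __init__(self, interval: int = 100, priority: int = 10) -> None:
        super().__init__(priority)
        self._interval = interval
        self._step = 0
        self._logger = get_dist_logger()

    def after_train_iter(self, output, label, loss):
        # output, label and loss are the tensors described in the docstring above
        self._step += 1
        if self._step % self._interval == 0:
            self._logger.info(f'step {self._step}: loss = {loss.item():.4f}')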

View File

@@ -16,14 +16,15 @@ from ._lr_scheduler_hook import LRSchedulerHook
class SaveCheckpointHook(BaseHook):
"""Saves the model by interval in training process.
:param interval: Saving interval
:param checkpoint_dir: Directory of saving checkpoint
:param suffix: Saving suffix of the file
:param priority: Priority in the printing, hooks with small priority will be printed in front
:param interval: Saving interval, defaults to 1
:type interval: int, optional
:type checkpoint_dir: int, optional
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
:type checkpoint_dir: str, optional
:param suffix: Saving suffix of the file, defaults to ''
:type suffix: str, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param trainer: Trainer attached with current hook
"""
def __init__(self,
@@ -71,16 +72,19 @@ class SaveCheckpointHook(BaseHook):
class LoadCheckpointHook(BaseHook):
"""Loads the model before training process.
:param checkpoint_dir: Directory of saving checkpoint
:param epoch: Epoch number to be set
:param finetune: Whether allows to load a part of the model
:param strict: Whether loads a model that has the same shape of parameters
:param priority: Priority in the printing, hooks with small priority will be printed in front
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
:type checkpoint_dir: str, optional
:param epoch: Epoch number to be set, defaults to -1
:type epoch: str, optional
:param finetune: Whether allows to load a part of the model, defaults to False
:type finetune: bool, optional
:param strict: Whether loads a model that has the same shape of parameters, defaults to False
:type strict: bool, optional
:param suffix: Suffic, defaults to ''
:type suffix: str, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
:param trainer: Trainer attached with current hook
"""
def __init__(self,
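Taken together, a rough usage sketch for the two checkpoint hooks. The hooks import path and the idea of collecting them in a hook list for the trainer are assumptions based on the surrounding API; './ckpt' and the interval are placeholders:

from colossalai.trainer import hooks

# Hypothetical hook list; './ckpt' and the interval are placeholder values.
ckpt_hooks = [
    hooks.LoadCheckpointHook(checkpoint_dir='./ckpt', finetune=False, strict=False),
    hooks.SaveCheckpointHook(interval=1, checkpoint_dir='./ckpt'),
]
# ckpt_hooks would then be merged into the hook list handed to the trainer.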

View File

@@ -25,6 +25,15 @@ def _format_number(val, prec=5):
class LogByEpochHook(BaseHook):
"""hook to log by epoch
:param logger: logger for the log
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
:type priority: int, optional
:param trainer: Trainer attached with current hook
"""
def __init__(self,
logger,
interval: int = 1,
@@ -39,6 +48,12 @@ class LogByEpochHook(BaseHook):
@HOOKS.register_module
class LogMetricByStepHook(BaseHook):
"""hook to log metric by step
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param trainer: Trainer attached with current hook
"""
def __init__(self, priority: int = 10):
super().__init__(priority)
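As a sketch, the step-wise logger can be instantiated alongside the epoch-level metric logger documented just below; the import path and keyword usage are assumptions, and the priorities follow the documented defaults:

from colossalai.logging import get_dist_logger
from colossalai.trainer import hooks

logger = get_dist_logger()
log_hooks = [
    hooks.LogMetricByStepHook(),                            # per-step metric logging, priority 10
    hooks.LogMetricByEpochHook(logger=logger, interval=1),  # per-epoch metric logging
]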
@@ -59,12 +74,13 @@ class LogMetricByStepHook(BaseHook):
class LogMetricByEpochHook(LogByEpochHook):
"""Specialized Hook to record the metric to log.
:param logger: Logger for the log
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing; hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param trainer: Trainer attached with current hook
:type trainer: Trainer
:param mode: Mode of metrics, 'train' or 'test'
"""
def __init__(self,
@@ -102,12 +118,17 @@ class LogMetricByEpochHook(LogByEpochHook):
class TensorboardHook(BaseHook):
"""Specialized Hook to record the metric to Tensorboard.
:param log_dir: Directory of log
:type log_dir: str, optional
:param ranks: Ranks of processors
:type ranks: typing.List
:param parallel_mode: Parallel mode, defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
:param priority: Priority in the printing; hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param trainer: Trainer attached with current hook
:type trainer: Trainer
:param mode: Mode of metrics, 'train' or 'test'
:type mode: str
"""
def __init__(self,
@@ -184,14 +205,20 @@ class TensorboardHook(BaseHook):
class LogTimingByEpochHook(LogByEpochHook):
"""Specialized Hook to write timing record to log.
:param timer: Timer for the hook
:type timer: colossalai.utils.MultiTimer
:param logger: Logger for the log
:type logger: colossalai.logging.DistributedLogger
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing; hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param log_eval: Whether to write during evaluation, defaults to True
:type log_eval: bool, optional
:param ignore_num_train_steps: Number of training steps to ignore, defaults to 0
:type ignore_num_train_steps: int, optional
:param mode: Mode of metrics, 'train' or 'test'
:param trainer: Trainer attached with current hook
:type trainer: Trainer
"""
def __init__(self,
timer: MultiTimer,
@@ -249,13 +276,13 @@ class LogTimingByEpochHook(LogByEpochHook):
class LogMemoryByEpochHook(LogByEpochHook):
"""Specialized Hook to write memory usage record to log.
:param logger: Logger for the log
:type logger: colossalai.logging.DistributedLogger
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing; hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param log_eval: Whether to write during evaluation, defaults to True
:type log_eval: bool, optional
:param trainer: Trainer attached with current hook
:type trainer: Trainer
"""
def __init__(self,
@@ -263,7 +290,8 @@ class LogMemoryByEpochHook(LogByEpochHook):
interval: int = 1,
priority: int = 10,
log_eval: bool = True,
report_cpu: bool = False,  # no reference
) -> None:
super().__init__(logger=logger, interval=interval, priority=priority)
self._log_eval = log_eval
self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0()
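Putting the remaining logging hooks together, a rough sketch; the import paths, the ranks value and './tb_logs' are assumptions or placeholders, and keyword arguments are used to avoid guessing positional order:

from colossalai.logging import get_dist_logger
from colossalai.trainer import hooks
from colossalai.utils import MultiTimer

logger = get_dist_logger()
timer = MultiTimer()

extra_log_hooks = [
    hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),               # placeholder log dir
    hooks.LogTimingByEpochHook(timer=timer, logger=logger, log_eval=True),
    hooks.LogMemoryByEpochHook(logger=logger, interval=1, log_eval=True),
]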

View File

@@ -8,14 +8,14 @@ from ._metric_hook import LearningRateMetric, MetricHook
class LRSchedulerHook(MetricHook):
"""Build LR scheduler
:param trainer: Trainer attached with current hook
:type trainer: Trainer
:param lr_scheduler_cfg: The config of LR scheduler
:type lr_scheduler_cfg: dict
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch. Defaults to `True`.
:param lr_scheduler: LR scheduler
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch
:type by_epoch: bool
:param priority: Priority in the printing, hooks with small priority will be printed in front
:param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True`
:type store_lr_in_state: bool, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
:type priority: int, optional
:param trainer: Trainer attached with current hook
"""
def __init__(
self,
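A small sketch of wiring this hook to a standard PyTorch scheduler; the toy parameter/optimizer/scheduler exist only to make the call concrete, and the keyword names follow the docstring above:

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from colossalai.trainer import hooks

# Toy parameter/optimizer/scheduler purely to illustrate the hook's arguments.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.1)
scheduler = CosineAnnealingLR(optimizer, T_max=10)

lr_hook = hooks.LRSchedulerHook(lr_scheduler=scheduler, by_epoch=True)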

View File

@@ -133,6 +133,8 @@ class LearningRateMetric(Metric):
:param epoch_only: Whether the metric is only read for the full epoch
:type epoch_only: bool
:param initial_lr: Initial learning rate, defaults to 0.0
:type initial_lr: float, optional
"""
def __init__(self, epoch_only: bool, initial_lr: float = 0.):
@@ -161,6 +163,8 @@ class AccuracyMetric(Metric):
:param epoch_only: Whether the metric is only read for the full epoch
:type epoch_only: bool
:param accuracy_func: Accuracy function for the classification task
:type accuracy_func: typing.Callable
"""
def __init__(self, epoch_only: bool, accuracy_func: Callable):
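The accuracy_func above is documented only as a callable; below is a plausible sketch of such a callable for classification. The assumption that it takes (logits, targets) and returns the count of correct top-1 predictions is not taken from the source:

import torch

def top1_correct_count(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """Hypothetical accuracy callable: number of correct top-1 predictions."""
    preds = logits.argmax(dim=-1)
    return (preds == targets).sum()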
@@ -182,7 +186,8 @@ class AccuracyMetric(Metric):
and labels. It expects the output to have logits and labels.
:param logits: The logits output of the model
:param targets: Real labels of the dataset
:param batch_size: Batch size of the task
"""
if isinstance(logits, (list, tuple)):
logits = logits[0]
@@ -216,10 +221,10 @@ class MetricHook(BaseHook):
update their states. Others are used to display and
record the metric.
:param priority: Priority in the printing; hooks with small priority will be printed in front
:type priority: int
:param trainer: Trainer attached with current hook
:type trainer: Trainer
"""
def __init__(
@@ -238,10 +243,10 @@ class MetricHook(BaseHook):
class LossHook(MetricHook):
"""Specialized hook class for :class:`Loss`.
:param priority: Priority in the printing; hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
:param trainer: Trainer attached with current hook
:type trainer: Trainer
"""
def __init__(self, priority: int = 0):
@@ -279,10 +284,12 @@ class LossHook(MetricHook):
class AccuracyHook(MetricHook):
"""Specialized hook class for :class:`Accuracy`.
:param accuracy_func: Accuracy function for the classification task
:type accuracy_func: typing.Callable
:param priority: Priority in the printing; hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
:param trainer: Trainer attached with current hook
:type trainer: Trainer
"""
def __init__(self, accuracy_func: Callable, priority: int = 0):
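A short sketch combining the two metric hooks above; colossalai.nn.metric.Accuracy is assumed to provide a suitable accuracy callable (otherwise a plain function like the earlier sketch would do):

from colossalai.nn.metric import Accuracy
from colossalai.trainer import hooks

metric_hooks = [
    hooks.LossHook(),                              # loss metric, default priority 0
    hooks.AccuracyHook(accuracy_func=Accuracy()),  # accuracy metric, default priority 0
]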
@@ -308,6 +315,13 @@ class AccuracyHook(MetricHook):
class ThroughputMetric(Metric):
"""Metric for :class:`Throughput`.
:param epoch_only: Whether the metric is only read for the full epoch
:type epoch_only: bool
:param num_samples: Number of samples processed
:param time: Time used to process the samples
"""
def __init__(self, epoch_only: bool):
super().__init__(epoch_only=epoch_only)
self.accumulated_num_samples = torch.zeros(1, device=get_current_device())
@@ -345,6 +359,13 @@ class ThroughputMetric(Metric):
@HOOKS.register_module
class ThroughputHook(MetricHook):
"""Specialized hook class for :class:`Throughput`.
:param priority: Priority of throughput hook, defaults to 10
:type priority: int, optional
:param trainer: Trainer attached with current hook
:type trainer: Trainer
"""
def __init__(self, priority: int = 10):
super().__init__(priority)
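Finally, a rough end-to-end sketch of handing the hooks touched by this commit to the trainer; engine and the dataloaders are assumed to come from colossalai.initialize earlier in the script, and the Trainer/Trainer.fit keyword names are assumptions based on the usual Colossal-AI training loop:

from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer, hooks

logger = get_dist_logger()

# `engine`, `train_dataloader` and `test_dataloader` are assumed to have been
# produced by colossalai.initialize(...) earlier in the script.
trainer = Trainer(engine=engine, logger=logger)

hook_list = [
    hooks.LossHook(),
    hooks.LogMetricByEpochHook(logger=logger),
    hooks.ThroughputHook(),
    hooks.SaveCheckpointHook(interval=1, checkpoint_dir='./ckpt'),
]

trainer.fit(train_dataloader=train_dataloader,
            test_dataloader=test_dataloader,
            epochs=10,
            hooks=hook_list,
            display_progress=True)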