mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-25 03:31:56 +00:00
Update layer integration documentations (#108)
Update the documentations of layer integration Update _log_hook.py Update _operation.py
This commit is contained in:
@@ -10,10 +10,9 @@ class BaseHook(ABC):
|
||||
"""This class allows users to add desired actions in specific time points
|
||||
during training or evaluation.
|
||||
|
||||
:param trainer: Trainer attached with current hook
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:type trainer: Trainer
|
||||
:type priority: int
|
||||
:param trainer: Trainer attached with current hook
|
||||
"""
|
||||
|
||||
def __init__(self, priority: int) -> None:
|
||||
@@ -43,11 +42,11 @@ class BaseHook(ABC):
|
||||
"""Actions after running a training iteration.
|
||||
|
||||
:param output: Output of the model
|
||||
:type output: torch.Tensor
|
||||
:param label: Labels of the input data
|
||||
:type label: torch.Tensor
|
||||
:param loss: Loss between the output and input data
|
||||
:type output: Tensor
|
||||
:type label: Tensor
|
||||
:type loss: Tensor
|
||||
:type loss: torch.Tensor
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -90,10 +89,10 @@ class BaseHook(ABC):
|
||||
"""Actions after running a testing iteration.
|
||||
|
||||
:param output: Output of the model
|
||||
:param label: Labels of the input data
|
||||
:param loss: Loss between the output and input data
|
||||
:type output: Tensor
|
||||
:param label: Labels of the input data
|
||||
:type label: Tensor
|
||||
:param loss: Loss between the output and input data
|
||||
:type loss: Tensor
|
||||
"""
|
||||
pass
|
||||
|
@@ -16,14 +16,15 @@ from ._lr_scheduler_hook import LRSchedulerHook
|
||||
class SaveCheckpointHook(BaseHook):
|
||||
"""Saves the model by interval in training process.
|
||||
|
||||
:param interval: Saving interval
|
||||
:param checkpoint_dir: Directory of saving checkpoint
|
||||
:param suffix: Saving suffix of the file
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:param interval: Saving interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:type checkpoint_dir: int, optional
|
||||
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
|
||||
:type checkpoint_dir: str, optional
|
||||
:param suffix: Saving suffix of the file, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param trainer: Trainer attached with current hook
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@@ -71,16 +72,19 @@ class SaveCheckpointHook(BaseHook):
|
||||
class LoadCheckpointHook(BaseHook):
|
||||
"""Loads the model before training process.
|
||||
|
||||
:param checkpoint_dir: Directory of saving checkpoint
|
||||
:param epoch: Epoch number to be set
|
||||
:param finetune: Whether allows to load a part of the model
|
||||
:param strict: Whether loads a model that has the same shape of parameters
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
|
||||
:type checkpoint_dir: str, optional
|
||||
:param epoch: Epoch number to be set, defaults to -1
|
||||
:type epoch: int, optional
|
||||
:param finetune: Whether allows to load a part of the model, defaults to False
|
||||
:type finetune: bool, optional
|
||||
:param strict: Whether loads a model that has the same shape of parameters, defaults to False
|
||||
:type strict: bool, optional
|
||||
:param suffix: Suffix of the checkpoint file, defaults to ''
|
||||
:type suffix: str, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
|
||||
:type priority: int, optional
|
||||
:param trainer: Trainer attached with current hook
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@@ -25,6 +25,15 @@ def _format_number(val, prec=5):
|
||||
|
||||
|
||||
class LogByEpochHook(BaseHook):
|
||||
"""hook to log by epoch
|
||||
|
||||
:param logger: logger for the log
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
|
||||
:type priority: int, optional
|
||||
:param trainer: Trainer attached with current hook
|
||||
"""
|
||||
def __init__(self,
|
||||
logger,
|
||||
interval: int = 1,
|
||||
@@ -39,6 +48,12 @@ class LogByEpochHook(BaseHook):
|
||||
|
||||
@HOOKS.register_module
|
||||
class LogMetricByStepHook(BaseHook):
|
||||
"""hook to log metric by step
|
||||
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param trainer: Trainer attached with current hook
|
||||
"""
|
||||
def __init__(self, priority: int = 10):
|
||||
super().__init__(priority)
|
||||
|
||||
@@ -59,12 +74,13 @@ class LogMetricByStepHook(BaseHook):
|
||||
class LogMetricByEpochHook(LogByEpochHook):
|
||||
"""Specialized Hook to record the metric to log.
|
||||
|
||||
:param trainer: Trainer attached with current hook
|
||||
:type trainer: Trainer
|
||||
:param interval: Recording interval
|
||||
:param logger: logger for the log
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param trainer: Trainer attached with current hook
|
||||
:param mode: Mode of metrics, 'train' and 'test'
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@@ -102,12 +118,17 @@ class LogMetricByEpochHook(LogByEpochHook):
|
||||
class TensorboardHook(BaseHook):
|
||||
"""Specialized Hook to record the metric to Tensorboard.
|
||||
|
||||
:param trainer: Trainer attached with current hook
|
||||
:type trainer: Trainer
|
||||
:param log_dir: Directory of log
|
||||
:type log_dir: str, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:type log_dir: str
|
||||
:param ranks: ranks of processors
|
||||
:type ranks: typing.List
|
||||
:param parallel_mode: Parallel mode, defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL
|
||||
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param trainer: Trainer attached with current hook
|
||||
:param mode: Mode of metrics, 'train' and 'test'
|
||||
:type mode: str
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@@ -184,14 +205,20 @@ class TensorboardHook(BaseHook):
|
||||
class LogTimingByEpochHook(LogByEpochHook):
|
||||
"""Specialized Hook to write timing record to log.
|
||||
|
||||
:param trainer: Trainer attached with current hook
|
||||
:type trainer: Trainer
|
||||
:param interval: Recording interval
|
||||
:param timer: Timer for the hook
|
||||
:type timer: colossalai.utils.MultiTimer
|
||||
:param logger: Logger for the log
|
||||
:type logger: colossalai.logging.DistributedLogger
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param log_eval: Whether writes in evaluation
|
||||
:param log_eval: Whether writes in evaluation, defaults to True
|
||||
:type log_eval: bool, optional
|
||||
:param ignore_num_train_steps: Number of training steps to ignore, defaults to 0
|
||||
:type ignore_num_train_steps: int, optional
|
||||
:param mode: Mode of metrics, 'train' and 'test'
|
||||
:param trainer: Trainer attached with current hook
|
||||
"""
|
||||
def __init__(self,
|
||||
timer: MultiTimer,
|
||||
@@ -249,13 +276,13 @@ class LogTimingByEpochHook(LogByEpochHook):
|
||||
class LogMemoryByEpochHook(LogByEpochHook):
|
||||
"""Specialized Hook to write memory usage record to log.
|
||||
|
||||
:param trainer: Trainer attached with current hook
|
||||
:type trainer: Trainer
|
||||
:param interval: Recording interval
|
||||
:param logger: Logger for the log
|
||||
:type logger: colossalai.logging.DistributedLogger
|
||||
:param interval: Recording interval, defaults to 1
|
||||
:type interval: int, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param log_eval: Whether writes in evaluation
|
||||
:param log_eval: Whether writes in evaluation, defaults to True
|
||||
:type log_eval: bool, optional
|
||||
"""
|
||||
def __init__(self,
|
||||
@@ -263,7 +290,8 @@ class LogMemoryByEpochHook(LogByEpochHook):
|
||||
interval: int = 1,
|
||||
priority: int = 10,
|
||||
log_eval: bool = True,
|
||||
report_cpu: bool = False) -> None:
|
||||
report_cpu: bool = False, # no reference
|
||||
) -> None:
|
||||
super().__init__(logger=logger, interval=interval, priority=priority)
|
||||
self._log_eval = log_eval
|
||||
self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0()
|
||||
|
@@ -8,14 +8,14 @@ from ._metric_hook import LearningRateMetric, MetricHook
|
||||
class LRSchedulerHook(MetricHook):
|
||||
"""Build LR scheduler
|
||||
|
||||
:param trainer: Trainer attached with current hook
|
||||
:type trainer: Trainer
|
||||
:param lr_scheduler_cfg: The config of LR scheduler
|
||||
:type lr_scheduler_cfg: dict
|
||||
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch. Defaults to `True`.
|
||||
:param lr_scheduler: LR scheduler
|
||||
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch
|
||||
:type by_epoch: bool
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True`
|
||||
:type store_lr_in_state: bool, optional
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
|
||||
:type priority: int, optional
|
||||
:param trainer: Trainer attached with current hook
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
|
@@ -133,6 +133,8 @@ class LearningRateMetric(Metric):
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
:param initial_lr: initial learning rate, defaults to 0.0
|
||||
:type initial_lr: float, optional
|
||||
"""
|
||||
|
||||
def __init__(self, epoch_only: bool, initial_lr: float = 0.):
|
||||
@@ -161,6 +163,8 @@ class AccuracyMetric(Metric):
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
:param accuracy_func: accuracy function for the classification task
|
||||
:type accuracy_func: typing.Callable
|
||||
"""
|
||||
|
||||
def __init__(self, epoch_only: bool, accuracy_func: Callable):
|
||||
@@ -182,7 +186,8 @@ class AccuracyMetric(Metric):
|
||||
and labels. It expects the output has logits and labels.
|
||||
|
||||
:param logits: The logits output of the model
|
||||
:param label: The labels of the input data
|
||||
:param targets: real labels of the dataset
|
||||
:param batch_size: batch size of the task
|
||||
"""
|
||||
if isinstance(logits, (list, tuple)):
|
||||
logits = logits[0]
|
||||
@@ -216,10 +221,10 @@ class MetricHook(BaseHook):
|
||||
update their states. Others are used to display and
|
||||
record the metric.
|
||||
|
||||
:param trainer: Trainer attached with current hook
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:type trainer: Trainer
|
||||
:type priority: int
|
||||
:param trainer: Trainer attached with current hook
|
||||
:type trainer: Trainer
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -238,10 +243,10 @@ class MetricHook(BaseHook):
|
||||
class LossHook(MetricHook):
|
||||
"""Specialized hook class for :class:`Loss`.
|
||||
|
||||
:param trainer: Trainer attached with current hook
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:type trainer: Trainer
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
|
||||
:type priority: int, optional
|
||||
:param trainer: Trainer attached with current hook
|
||||
:type trainer: Trainer
|
||||
"""
|
||||
|
||||
def __init__(self, priority: int = 0):
|
||||
@@ -279,10 +284,12 @@ class LossHook(MetricHook):
|
||||
class AccuracyHook(MetricHook):
|
||||
"""Specialized hook class for :class:`Accuracy`.
|
||||
|
||||
:param accuracy_func: Accuracy function for the classification task
|
||||
:type accuracy_func: typing.Callable
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
|
||||
:type priority: int, optional
|
||||
:param trainer: Trainer attached with current hook
|
||||
:param priority: Priority in the printing, hooks with small priority will be printed in front
|
||||
:type trainer: Trainer
|
||||
:type priority: int
|
||||
"""
|
||||
|
||||
def __init__(self, accuracy_func: Callable, priority: int = 0):
|
||||
@@ -308,6 +315,13 @@ class AccuracyHook(MetricHook):
|
||||
|
||||
|
||||
class ThroughputMetric(Metric):
|
||||
"""Metric for :class:`Throughput`.
|
||||
|
||||
:param epoch_only: Whether the metric only read for the full epoch
|
||||
:type epoch_only: bool
|
||||
:param num_samples: number of samples
|
||||
:param time: time
|
||||
"""
|
||||
def __init__(self, epoch_only: bool):
|
||||
super().__init__(epoch_only=epoch_only)
|
||||
self.accumulated_num_samples = torch.zeros(1, device=get_current_device())
|
||||
@@ -345,6 +359,13 @@ class ThroughputMetric(Metric):
|
||||
|
||||
@HOOKS.register_module
|
||||
class ThroughputHook(MetricHook):
|
||||
"""Specialized hook class for :class:`Throughput`.
|
||||
|
||||
:param priority: priority of throughput hook, defaults to 10
|
||||
:type priority: int, optional
|
||||
:param trainer: Trainer attached with current hook
|
||||
:type trainer: Trainer
|
||||
"""
|
||||
def __init__(self, priority: int = 10):
|
||||
super().__init__(priority)
|
||||
|
||||
|
Reference in New Issue
Block a user