Update layer integration documentations (#108)

Update the documentations of layer integration Update _log_hook.py Update _operation.py
2025-09-25 03:31:56 +00:00 · 2022-01-10 18:05:58 +08:00
parent 3a61d785b5
commit 4a3d3446b0
25 changed files with 1020 additions and 98 deletions
--- a/colossalai/trainer/hooks/_base_hook.py
+++ b/colossalai/trainer/hooks/_base_hook.py
@@ -10,10 +10,9 @@ class BaseHook(ABC):
    """This class allows users to add desired actions in specific time points
    during training or evaluation.

-    :param trainer: Trainer attached with current hook
    :param priority: Priority in the printing, hooks with small priority will be printed in front
-    :type trainer: Trainer
    :type priority: int
+    :param trainer: Trainer attached with current hook
    """

    def __init__(self, priority: int) -> None:
@@ -43,11 +42,11 @@ class BaseHook(ABC):
        """Actions after running a training iteration.

        :param output: Output of the model
+        :type output: torch.Tensor
        :param label: Labels of the input data
+        :type label: torch.Tensor
        :param loss: Loss between the output and input data
-        :type output: Tensor
-        :type label: Tensor
-        :type loss: Tensor
+        :type loss: torch.Tensor
        """
        pass

@@ -90,10 +89,10 @@ class BaseHook(ABC):
        """Actions after running a testing iteration.

        :param output: Output of the model
-        :param label: Labels of the input data
-        :param loss: Loss between the output and input data
        :type output: Tensor
+        :param label: Labels of the input data
        :type label: Tensor
+        :param loss: Loss between the output and input data
        :type loss: Tensor
        """
        pass
--- a/colossalai/trainer/hooks/_checkpoint_hook.py
+++ b/colossalai/trainer/hooks/_checkpoint_hook.py
@@ -16,14 +16,15 @@ from ._lr_scheduler_hook import LRSchedulerHook
 class SaveCheckpointHook(BaseHook):
    """Saves the model by interval in training process.

-    :param interval: Saving interval 
-    :param checkpoint_dir: Directory of saving checkpoint 
-    :param suffix: Saving suffix of the file
-    :param priority: Priority in the printing, hooks with small priority will be printed in front
+    :param interval: Saving interval, defaults to 1
    :type interval: int, optional
-    :type checkpoint_dir: int, optional
+    :param checkpoint_dir: Directory of saving checkpoint, defaults to None
+    :type checkpoint_dir: str, optional
+    :param suffix: Saving suffix of the file, defaults to ''
    :type suffix: str, optional
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
    :type priority: int, optional
+    :param trainer: Trainer attached with current hook
    """

    def __init__(self,
@@ -71,16 +72,19 @@ class SaveCheckpointHook(BaseHook):
 class LoadCheckpointHook(BaseHook):
    """Loads the model before training process.

-    :param checkpoint_dir: Directory of saving checkpoint 
-    :param epoch: Epoch number to be set
-    :param finetune: Whether allows to load a part of the model
-    :param strict: Whether loads a model that has the same shape of parameters 
-    :param priority: Priority in the printing, hooks with small priority will be printed in front
+    :param checkpoint_dir: Directory of saving checkpoint, defaults to None
    :type checkpoint_dir: str, optional
+    :param epoch: Epoch number to be set, defaults to -1
    :type epoch: str, optional
+    :param finetune: Whether allows to load a part of the model, defaults to False
    :type finetune: bool, optional
+    :param strict: Whether loads a model that has the same shape of parameters, defaults to False
    :type strict: bool, optional
+    :param suffix: Suffix of the checkpoint file, defaults to ''
+    :type suffix: str, optional
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
    :type priority: int, optional
+    :param trainer: Trainer attached with current hook
    """

    def __init__(self,
--- a/colossalai/trainer/hooks/_log_hook.py
+++ b/colossalai/trainer/hooks/_log_hook.py
@@ -25,6 +25,15 @@ def _format_number(val, prec=5):


 class LogByEpochHook(BaseHook):
+    """Hook to write logs by epoch.
+
+    :param logger: logger for the log
+    :param interval: Recording interval, defaults to 1
+    :type interval: int, optional
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
+    :type priority: int, optional
+    :param trainer: Trainer attached with current hook
+    """
    def __init__(self,
                 logger,
                 interval: int = 1,
@@ -39,6 +48,12 @@ class LogByEpochHook(BaseHook):

@HOOKS.register_module
 class LogMetricByStepHook(BaseHook):
+    """Hook to log metrics by step.
+
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
+    :type priority: int, optional
+    :param trainer: Trainer attached with current hook
+    """
    def __init__(self, priority: int = 10):
        super().__init__(priority)

@@ -59,12 +74,13 @@ class LogMetricByStepHook(BaseHook):
 class LogMetricByEpochHook(LogByEpochHook):
    """Specialized Hook to record the metric to log.

-    :param trainer: Trainer attached with current hook
-    :type trainer: Trainer
-    :param interval: Recording interval
+    :param logger: logger for the log
+    :param interval: Recording interval, defaults to 1
    :type interval: int, optional
-    :param priority: Priority in the printing, hooks with small priority will be printed in front
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
    :type priority: int, optional
+    :param trainer: Trainer attached with current hook
+    :param mode: Mode of metrics, 'train' and 'test'
    """

    def __init__(self,
@@ -102,12 +118,17 @@ class LogMetricByEpochHook(LogByEpochHook):
 class TensorboardHook(BaseHook):
    """Specialized Hook to record the metric to Tensorboard.

-    :param trainer: Trainer attached with current hook
-    :type trainer: Trainer
    :param log_dir: Directory of log
-    :type log_dir: str, optional
-    :param priority: Priority in the printing, hooks with small priority will be printed in front
+    :type log_dir: str
+    :param ranks: ranks of processors
+    :type ranks: typing.List
+    :param parallel_mode: Parallel mode, defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL
+    :type parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
    :type priority: int, optional
+    :param trainer: Trainer attached with current hook
+    :param mode: Mode of metrics, 'train' and 'test'
+    :type mode: str
    """

    def __init__(self,
@@ -184,14 +205,20 @@ class TensorboardHook(BaseHook):
 class LogTimingByEpochHook(LogByEpochHook):
    """Specialized Hook to write timing record to log.

-    :param trainer: Trainer attached with current hook
-    :type trainer: Trainer
-    :param interval: Recording interval
+    :param timer: Timer for the hook
+    :type timer: colossalai.utils.MultiTimer
+    :param logger: Logger for the log
+    :type logger: colossalai.logging.DistributedLogger
+    :param interval: Recording interval, defaults to 1
    :type interval: int, optional
-    :param priority: Priority in the printing, hooks with small priority will be printed in front
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
    :type priority: int, optional
-    :param log_eval: Whether writes in evaluation
+    :param log_eval: Whether writes in evaluation, defaults to True
    :type log_eval: bool, optional
+    :param ignore_num_train_steps: Number of training steps to ignore, defaults to 0
+    :type ignore_num_train_steps: int, optional
+    :param mode: Mode of metrics, 'train' and 'test'
+    :param trainer: Trainer attached with current hook
    """
    def __init__(self,
                 timer: MultiTimer,
@@ -249,13 +276,13 @@ class LogTimingByEpochHook(LogByEpochHook):
 class LogMemoryByEpochHook(LogByEpochHook):
    """Specialized Hook to write memory usage record to log.

-    :param trainer: Trainer attached with current hook
-    :type trainer: Trainer
-    :param interval: Recording interval
+    :param logger: Logger for the log
+    :type logger: colossalai.logging.DistributedLogger
+    :param interval: Recording interval, defaults to 1
    :type interval: int, optional
-    :param priority: Priority in the printing, hooks with small priority will be printed in front
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
    :type priority: int, optional
-    :param log_eval: Whether writes in evaluation
+    :param log_eval: Whether writes in evaluation, defaults to True
    :type log_eval: bool, optional
    """
    def __init__(self,
@@ -263,7 +290,8 @@ class LogMemoryByEpochHook(LogByEpochHook):
                 interval: int = 1,
                 priority: int = 10,
                 log_eval: bool = True,
-                 report_cpu: bool = False) -> None:
+                 report_cpu: bool = False, # no reference
+                 ) -> None:
        super().__init__(logger=logger, interval=interval, priority=priority)
        self._log_eval = log_eval
        self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0()
--- a/colossalai/trainer/hooks/_lr_scheduler_hook.py
+++ b/colossalai/trainer/hooks/_lr_scheduler_hook.py
@@ -8,14 +8,14 @@ from ._metric_hook import LearningRateMetric, MetricHook
 class LRSchedulerHook(MetricHook):
    """Build LR scheduler

-    :param trainer: Trainer attached with current hook
-    :type trainer: Trainer
-    :param lr_scheduler_cfg: The config of LR scheduler
-    :type lr_scheduler_cfg: dict
-    :param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch. Defaults to `True`.
+    :param lr_scheduler: LR scheduler
+    :param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch
    :type by_epoch: bool
-    :param priority: Priority in the printing, hooks with small priority will be printed in front
+    :param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True`
+    :type store_lr_in_state: bool, optional
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
    :type priority: int, optional
+    :param trainer: Trainer attached with current hook
    """
    def __init__(
        self,
--- a/colossalai/trainer/hooks/_metric_hook.py
+++ b/colossalai/trainer/hooks/_metric_hook.py
@@ -133,6 +133,8 @@ class LearningRateMetric(Metric):

    :param epoch_only: Whether the metric only read for the full epoch
    :type epoch_only: bool
+    :param initial_lr: initial learning rate, defaults to 0.0
+    :type initial_lr: float, optional
    """

    def __init__(self, epoch_only: bool, initial_lr: float = 0.):
@@ -161,6 +163,8 @@ class AccuracyMetric(Metric):

    :param epoch_only: Whether the metric only read for the full epoch
    :type epoch_only: bool
+    :param accuracy_func: accuracy function for the classification task
+    :type accuracy_func: typing.Callable
    """

    def __init__(self, epoch_only: bool, accuracy_func: Callable):
@@ -182,7 +186,8 @@ class AccuracyMetric(Metric):
        and labels. It expects the output has logits and labels.

        :param logits: The logits output of the model
-        :param label: The labels of the input data
+        :param targets: real labels of the dataset
+        :param batch_size: batch size of the task
        """
        if isinstance(logits, (list, tuple)):
            logits = logits[0]
@@ -216,10 +221,10 @@ class MetricHook(BaseHook):
    update their states. Others are used to display and 
    record the metric.

-    :param trainer: Trainer attached with current hook
    :param priority: Priority in the printing, hooks with small priority will be printed in front
-    :type trainer: Trainer
    :type priority: int
+    :param trainer: Trainer attached with current hook
+    :type trainer: Trainer
    """

    def __init__(
@@ -238,10 +243,10 @@ class MetricHook(BaseHook):
 class LossHook(MetricHook):
    """Specialized hook class for :class:`Loss`.

-    :param trainer: Trainer attached with current hook
-    :param priority: Priority in the printing, hooks with small priority will be printed in front
-    :type trainer: Trainer
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
    :type priority: int, optional
+    :param trainer: Trainer attached with current hook
+    :type trainer: Trainer
    """

    def __init__(self, priority: int = 0):
@@ -279,10 +284,12 @@ class LossHook(MetricHook):
 class AccuracyHook(MetricHook):
    """Specialized hook class for :class:`Accuracy`.

+    :param accuracy_func: Accuracy function for the classification task
+    :type accuracy_func: typing.Callable
+    :param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
+    :type priority: int, optional
    :param trainer: Trainer attached with current hook
-    :param priority: Priority in the printing, hooks with small priority will be printed in front
    :type trainer: Trainer
-    :type priority: int
    """

    def __init__(self, accuracy_func: Callable, priority: int = 0):
@@ -308,6 +315,13 @@ class AccuracyHook(MetricHook):


 class ThroughputMetric(Metric):
+    """Metric for :class:`Throughput`.
+
+    :param epoch_only: Whether the metric is only read for the full epoch
+    :type epoch_only: bool
+    :param num_samples: number of samples
+    :param time: time
+    """
    def __init__(self, epoch_only: bool):
        super().__init__(epoch_only=epoch_only)
        self.accumulated_num_samples = torch.zeros(1, device=get_current_device())
@@ -345,6 +359,13 @@ class ThroughputMetric(Metric):

@HOOKS.register_module
 class ThroughputHook(MetricHook):
+    """Specialized hook class for :class:`Throughput`.
+
+    :param priority: priority of throughput hook, defaults to 10
+    :type priority: int, optional
+    :param trainer: Trainer attached with current hook
+    :type trainer: Trainer
+    """
    def __init__(self, priority: int = 10):
        super().__init__(priority)