Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-21 17:40:33 +00:00)
moved env variables to global variables (#215)

added branch context; added vocab parallel layers; moved split_batch from load_batch to the tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed a few collective communicator bugs
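Among these, the vocab parallel layers follow the Megatron-LM partitioning scheme: each tensor-parallel rank stores a contiguous slice of the embedding table, masks out token ids that fall outside its slice, and the partial lookups are summed (all-reduced) across ranks. Below is a minimal single-process sketch of that partition-and-mask idea; the function name and shapes are illustrative, not the actual ColossalAI layer API.

    import torch
    import torch.nn.functional as F

    def vocab_parallel_lookup(token_ids, weight_shard, vocab_start, vocab_end):
        # weight_shard holds rows [vocab_start, vocab_end) of the full table;
        # ids outside the shard contribute zero rows, so summing the partial
        # outputs of all ranks reconstructs the full embedding lookup.
        mask = (token_ids < vocab_start) | (token_ids >= vocab_end)
        local_ids = token_ids - vocab_start
        local_ids[mask] = 0                 # any in-range dummy index
        out = F.embedding(local_ids, weight_shard)
        out[mask] = 0.0                     # zero the out-of-shard rows
        return out

    # Two "ranks" splitting a 10-token vocabulary; their summed partial
    # lookups (an all-reduce in a real run) match the unsplit lookup.
    torch.manual_seed(0)
    full_table = torch.randn(10, 4)
    ids = torch.tensor([1, 7, 3, 9])
    partial_sum = sum(vocab_parallel_lookup(ids, full_table[s:e], s, e)
                      for (s, e) in [(0, 5), (5, 10)])
    assert torch.equal(partial_sum, F.embedding(ids, full_table))

In the real layers the summation runs as a collective over the tensor-parallel group, which is also why the commit can move split_batch out of load_batch: the embedding layer itself is now the natural place to hand each rank its share of the input.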
@@ -224,7 +224,7 @@ class LogTimingByEpochHook(LogByEpochHook):
         super().__init__(logger=logger, interval=interval, priority=priority)
         self._timer = timer
         self._log_eval = log_eval
-        self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0()
+        self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0() and is_no_pp_or_last_stage()
 
         # extra handling to avoid the unstable readings of the first
         # few training steps to affect the history mean time
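The extra is_no_pp_or_last_stage() condition keeps timing logs from being printed once per pipeline stage: with pipeline parallelism the last stage is the one that completes each step, so only it should log. A small sketch of the same gating predicate (a hypothetical helper, not the ColossalAI function itself):

    def should_log(dp_rank: int, tp_rank: int, pp_rank: int, pp_world_size: int) -> bool:
        # One representative rank per job: first data-parallel replica,
        # first tensor-parallel shard, and the last (or only) pipeline stage.
        is_no_pp_or_last_stage = pp_world_size <= 1 or pp_rank == pp_world_size - 1
        return dp_rank == 0 and tp_rank == 0 and is_no_pp_or_last_stage

    # With two pipeline stages, only stage 1 of the (dp=0, tp=0) group logs.
    assert should_log(0, 0, 1, 2) and not should_log(0, 0, 0, 2)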
@@ -256,7 +256,7 @@ class LogTimingByEpochHook(LogByEpochHook):
         """
         if self._is_epoch_to_log(trainer) and self._is_rank_to_log:
             msg = self._get_message('Train')
-            self.logger.info(f'[Epoch {trainer.cur_epoch} / Train]: {msg}, #steps/epoch = {trainer.steps_per_epoch}')
+            self.logger.info(f'[Epoch {trainer.cur_epoch} / Train]: {msg} | #steps/epoch = {trainer.steps_per_epoch}')
 
     def after_test_epoch(self, trainer):
         """Writes log after finishing a testing epoch.
@@ -317,24 +317,29 @@ class ThroughputMetric(Metric):
     :param epoch_only: epoch only
     :type epoch_only: bool
     """
-    def __init__(self, epoch_only: bool):
+    def __init__(self, epoch_only: bool, ignored_steps: int = 0):
         super().__init__(epoch_only=epoch_only)
+        self.ignored_steps = ignored_steps
+        self.cur_steps = 0
         self.accumulated_num_samples = torch.zeros(1, device=get_current_device())
         self.accumulated_used_time = torch.zeros(1, device=get_current_device())
         self.last_step_num_samples = torch.zeros(1, device=get_current_device())
         self.last_step_used_time = torch.zeros(1, device=get_current_device())
 
     def reset(self) -> None:
+        # self.cur_steps = 0
         self.accumulated_num_samples.zero_()
         self.accumulated_used_time.zero_()
         self.last_step_num_samples.zero_()
         self.last_step_used_time.zero_()
 
     def update(self, num_samples, time) -> None:
+        self.cur_steps += 1
         self.last_step_num_samples.fill_(num_samples)
         self.last_step_used_time.fill_(time)
-        self.accumulated_num_samples += self.last_step_num_samples
-        self.accumulated_used_time += self.last_step_used_time
+        if self.cur_steps >= self.ignored_steps:
+            self.accumulated_num_samples += self.last_step_num_samples
+            self.accumulated_used_time += self.last_step_used_time
 
     def get_last_step_value(self):
         self.last_step_used_time = all_reduce(self.last_step_used_time, ParallelMode.DATA) / \
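ignored_steps exists because the first iterations are dominated by one-time costs (CUDA context creation, allocator growth, kernel autotuning), which would otherwise drag down the accumulated mean throughput; this is the "unstable readings of the first few training steps" the earlier comment refers to. A stripped-down, single-process sketch of the same accounting (plain floats, no all_reduce):

    class SimpleThroughput:
        def __init__(self, ignored_steps: int = 0):
            self.ignored_steps = ignored_steps
            self.cur_steps = 0            # never reset: warm-up is skipped once per run
            self.accumulated_num_samples = 0.0
            self.accumulated_used_time = 0.0

        def update(self, num_samples: int, used_time: float) -> None:
            self.cur_steps += 1
            # Note the >= comparison: exactly as in the diff, this drops the
            # first ignored_steps - 1 updates, not the first ignored_steps.
            if self.cur_steps >= self.ignored_steps:
                self.accumulated_num_samples += num_samples
                self.accumulated_used_time += used_time

        def mean_throughput(self) -> float:
            return self.accumulated_num_samples / max(self.accumulated_used_time, 1e-12)

    m = SimpleThroughput(ignored_steps=3)
    for used_time in (10.0, 10.0, 1.0, 1.0):   # two slow warm-up steps, then steady state
        m.update(num_samples=32, used_time=used_time)
    print(m.mean_throughput())   # 32.0 samples/s, vs ~5.8 with warm-up included

The commented-out # self.cur_steps = 0 in reset() looks deliberate: cur_steps survives epoch resets, so only the very first steps of training count as warm-up rather than the start of every epoch.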
@@ -360,13 +365,14 @@ class ThroughputHook(MetricHook):
     :param priority: priority of throughput hook, defaults to 10
     :type priority: int, optional
     """
-    def __init__(self, priority: int = 10):
+    def __init__(self, ignored_steps: int = 0, priority: int = 10):
         super().__init__(priority)
+        self.ignored_steps = ignored_steps
 
     def after_hook_is_attached(self, trainer):
         self._check_metric_states_initialization(trainer)
         if self._is_stage_to_compute:
-            self.metric = ThroughputMetric(epoch_only=True)
+            self.metric = ThroughputMetric(epoch_only=True, ignored_steps=self.ignored_steps)
 
             # register the metric
             trainer.states['metrics']['train']['Throughput'] = self.metric
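The hook simply forwards the setting to the metric, so excluding warm-up becomes a one-argument change at hook-construction time. A usage sketch; the trainer.fit wiring is assumed from ColossalAI's trainer API of this period, and trainer, train_dataloader, and num_epochs are placeholders defined elsewhere:

    from colossalai.trainer import hooks

    hook_list = [
        # Skip the first few steps when accumulating throughput;
        # priority keeps its default of 10.
        hooks.ThroughputHook(ignored_steps=5),
    ]
    trainer.fit(train_dataloader=train_dataloader,
                epochs=num_epochs,
                hooks=hook_list,
                display_progress=True)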