moved env variables to global variables (#215);

added branch context;
added vocab parallel layers (a generic sketch follows the commit metadata below);
moved split_batch from load_batch to tensor parallel embedding layers;
updated gpt model;
updated unit test cases;
fixed a few collective communicator bugs
This commit is contained in:
アマデウス
2022-02-14 11:15:02 +08:00
committed by Frank Lee
parent b82d60be02
commit 9ee197d0e9
63 changed files with 4304 additions and 1040 deletions
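For context on the "vocab parallel layers" item above: a vocabulary-parallel embedding shards the embedding table along the vocabulary dimension across tensor-parallel ranks, masks out token ids owned by other ranks, and all-reduces the partial lookups so every rank ends up with the full embedding. The sketch below is a minimal, generic illustration, not the code added in this commit; the class name `VocabParallelEmbedding`, the explicit `tp_rank`/`tp_world_size` arguments, and the bare `dist.all_reduce` call are simplifying assumptions (the real layers resolve ranks from the parallel context and also absorb the `split_batch` logic mentioned in the commit message).

```python
import torch
import torch.nn as nn
import torch.distributed as dist


class VocabParallelEmbedding(nn.Module):
    """Sketch of a vocabulary-parallel embedding: each tensor-parallel
    rank stores a contiguous slice of the vocabulary, and the partial
    lookups are summed across ranks."""

    def __init__(self, num_embeddings: int, embedding_dim: int,
                 tp_rank: int, tp_world_size: int):
        super().__init__()
        assert num_embeddings % tp_world_size == 0
        self.part_size = num_embeddings // tp_world_size
        self.vocab_start = tp_rank * self.part_size   # first id owned by this rank
        self.vocab_end = self.vocab_start + self.part_size
        self.weight = nn.Parameter(torch.empty(self.part_size, embedding_dim))
        nn.init.normal_(self.weight)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        # Mask tokens that live on other ranks and shift the remaining
        # ids into this rank's local slice before the lookup.
        mask = (input_ids < self.vocab_start) | (input_ids >= self.vocab_end)
        local_ids = (input_ids - self.vocab_start).masked_fill(mask, 0)
        out = nn.functional.embedding(local_ids, self.weight)
        out = out.masked_fill(mask.unsqueeze(-1), 0.0)
        # Sum the partial embeddings across the tensor-parallel group so
        # each rank holds the complete embedding for every token.
        if dist.is_initialized():
            dist.all_reduce(out)
        return out
```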


@@ -224,7 +224,7 @@ class LogTimingByEpochHook(LogByEpochHook):
         super().__init__(logger=logger, interval=interval, priority=priority)
         self._timer = timer
         self._log_eval = log_eval
-        self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0()
+        self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0() and is_no_pp_or_last_stage()
         # extra handling to avoid the unstable readings of the first
         # few training steps to affect the history mean time
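The added `is_no_pp_or_last_stage()` term means that under pipeline parallelism only the last stage (the one that finishes each step) emits timing logs, so each record is printed once per run rather than once per pipeline stage. A hypothetical standalone equivalent of the predicate, with the repo's helper functions replaced by plain arguments:

```python
# Hypothetical equivalent of the logging-rank predicate above: with data,
# tensor and pipeline parallelism active, exactly one rank (dp rank 0,
# tp rank 0, last pipeline stage) should emit timing logs.
def should_log(dp_rank: int, tp_rank: int,
               pp_stage: int, num_pp_stages: int) -> bool:
    is_last_or_no_pp = (num_pp_stages <= 1) or (pp_stage == num_pp_stages - 1)
    return dp_rank == 0 and tp_rank == 0 and is_last_or_no_pp
```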
@@ -256,7 +256,7 @@ class LogTimingByEpochHook(LogByEpochHook):
         """
         if self._is_epoch_to_log(trainer) and self._is_rank_to_log:
             msg = self._get_message('Train')
-            self.logger.info(f'[Epoch {trainer.cur_epoch} / Train]: {msg}, #steps/epoch = {trainer.steps_per_epoch}')
+            self.logger.info(f'[Epoch {trainer.cur_epoch} / Train]: {msg} | #steps/epoch = {trainer.steps_per_epoch}')

     def after_test_epoch(self, trainer):
         """Writes log after finishing a testing epoch.


@@ -317,24 +317,29 @@ class ThroughputMetric(Metric):
     :param epoch_only: epoch only
     :type epoch_only: bool
     """
-    def __init__(self, epoch_only: bool):
+    def __init__(self, epoch_only: bool, ignored_steps: int = 0):
         super().__init__(epoch_only=epoch_only)
+        self.ignored_steps = ignored_steps
+        self.cur_steps = 0
         self.accumulated_num_samples = torch.zeros(1, device=get_current_device())
         self.accumulated_used_time = torch.zeros(1, device=get_current_device())
         self.last_step_num_samples = torch.zeros(1, device=get_current_device())
         self.last_step_used_time = torch.zeros(1, device=get_current_device())

     def reset(self) -> None:
+        # self.cur_steps = 0
         self.accumulated_num_samples.zero_()
         self.accumulated_used_time.zero_()
         self.last_step_num_samples.zero_()
         self.last_step_used_time.zero_()

     def update(self, num_samples, time) -> None:
+        self.cur_steps += 1
         self.last_step_num_samples.fill_(num_samples)
         self.last_step_used_time.fill_(time)
-        self.accumulated_num_samples += self.last_step_num_samples
-        self.accumulated_used_time += self.last_step_used_time
+        if self.cur_steps >= self.ignored_steps:
+            self.accumulated_num_samples += self.last_step_num_samples
+            self.accumulated_used_time += self.last_step_used_time

     def get_last_step_value(self):
         self.last_step_used_time = all_reduce(self.last_step_used_time, ParallelMode.DATA) / \
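The new `ignored_steps` parameter excludes early warm-up steps (CUDA context setup, kernel autotuning, allocator growth) from the accumulated totals while still recording per-step values. A runnable pure-Python mock of just that accumulation rule; `MockThroughput` is a hypothetical name, not part of the repo:

```python
# Minimal mock of the warm-up skipping logic above (no torch), showing
# which steps enter the running totals.
class MockThroughput:
    def __init__(self, ignored_steps: int = 0):
        self.ignored_steps = ignored_steps
        self.cur_steps = 0
        self.total_samples = 0.0
        self.total_time = 0.0

    def update(self, num_samples: float, time: float) -> None:
        self.cur_steps += 1
        # Mirrors `if self.cur_steps >= self.ignored_steps` in the diff:
        # early (typically slow, warm-up) steps never reach the totals.
        if self.cur_steps >= self.ignored_steps:
            self.total_samples += num_samples
            self.total_time += time


m = MockThroughput(ignored_steps=3)
for t in [5.0, 4.0, 1.0, 1.0]:          # the first two steps are warm-up
    m.update(num_samples=32, time=t)
print(m.total_samples / m.total_time)   # throughput over non-ignored steps
```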
@@ -360,13 +365,14 @@ class ThroughputHook(MetricHook):
     :param priority: priority of throughput hook, defaults to 10
     :type priority: int, optional
     """
-    def __init__(self, priority: int = 10):
+    def __init__(self, ignored_steps: int = 0, priority: int = 10):
         super().__init__(priority)
+        self.ignored_steps = ignored_steps

     def after_hook_is_attached(self, trainer):
         self._check_metric_states_initialization(trainer)
         if self._is_stage_to_compute:
-            self.metric = ThroughputMetric(epoch_only=True)
+            self.metric = ThroughputMetric(epoch_only=True, ignored_steps=self.ignored_steps)
             # register the metric
             trainer.states['metrics']['train']['Throughput'] = self.metric
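The hook simply forwards `ignored_steps` into the metric it registers. A hypothetical usage sketch: the hook class names come from this diff, but the import path, the `trainer.fit` call, and the surrounding variables (`trainer`, `timer`, `logger`, `train_loader`, `num_epochs`) are assumptions about the caller's setup, not taken from this commit:

```python
# Assumed import path and trainer API; adjust to the actual version in use.
from colossalai.trainer import hooks

hook_list = [
    # Skip timing of the first few steps, which include CUDA context
    # setup and allocator warm-up, so throughput reflects steady state.
    hooks.ThroughputHook(ignored_steps=10),
    hooks.LogTimingByEpochHook(timer=timer, logger=logger),
]
trainer.fit(train_dataloader=train_loader, epochs=num_epochs, hooks=hook_list)
```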