[example] reorganize for community examples (#3557)

2025-09-02 09:38:05 +00:00 · 2023-04-14 16:27:48 +08:00
parent 1a809eddaa
commit f1b3d60cae
31 changed files with 785 additions and 844 deletions
--- a/examples/community/roberta/pretraining/utils/WandbLog.py
+++ b/examples/community/roberta/pretraining/utils/WandbLog.py
@@ -0,0 +1,42 @@
+import os
+import time
+
+import wandb
+from torch.utils.tensorboard import SummaryWriter
+
+
+class WandbLog:
+    """Thin wrapper around the ``wandb`` run-initialisation and logging calls."""
+
+    @classmethod
+    def init_wandb(cls, project, notes=None, name=None, config=None):
+        """Start a wandb run.
+
+        Args:
+            project: project name, forwarded to ``wandb.init``.
+            notes: optional notes, forwarded to ``wandb.init``.
+            name: run name; when ``None`` a ``%Y-%m-%d %H:%M:%S`` local
+                timestamp taken at call time is used.
+            config: optional run config, forwarded to ``wandb.init``.
+        """
+        # The timestamp has to be computed here: a default-argument expression
+        # is evaluated only once, when the function is defined.
+        if name is None:
+            name = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        wandb.init(project=project, notes=notes, name=name, config=config)
+
+    @classmethod
+    def log(cls, result, model=None, gradient=None):
+        """Log ``result`` and pass any given objects to ``wandb.watch``.
+
+        Args:
+            result: payload forwarded to ``wandb.log``.
+            model: optional object passed to ``wandb.watch``.
+            gradient: optional object passed to ``wandb.watch``.
+        """
+        wandb.log(result)
+
+        # Compare with None explicitly instead of truth-testing: an object
+        # with a custom __bool__/__len__ (e.g. a multi-element tensor or an
+        # empty container module) would otherwise raise or be skipped.
+        if model is not None:
+            wandb.watch(model)
+
+        if gradient is not None:
+            wandb.watch(gradient)
+
+
+class TensorboardLog:
+    """Write scalar metrics to TensorBoard through a ``SummaryWriter``."""
+
+    def __init__(self, location, name=None, config=None):
+        """Create the log directory if needed and open the writer.
+
+        Args:
+            location: directory the event files are written to.
+            name: passed to ``SummaryWriter`` as ``comment``; when ``None`` a
+                ``%Y-%m-%d %H:%M:%S`` local timestamp taken at call time is used.
+            config: accepted for interface compatibility; not used here.
+        """
+        # Computed per call: a default-argument expression would be frozen
+        # at the moment the class body is executed.
+        if name is None:
+            name = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        # makedirs also creates missing parent directories and, with
+        # exist_ok, tolerates the directory already being there.
+        os.makedirs(location, exist_ok=True)
+        self.writer = SummaryWriter(location, comment=name)
+
+    def _log_scalars(self, result, step, suffix):
+        """Write each ``key: value`` of ``result`` under the tag ``{key}{suffix}``."""
+        for key, value in result.items():
+            self.writer.add_scalar(f'{key}{suffix}', value, step)
+
+    def log_train(self, result, step):
+        """Log training scalars under ``<key>/train``."""
+        self._log_scalars(result, step, '/train')
+
+    def log_eval(self, result, step):
+        """Log evaluation scalars under ``<key>/eval``."""
+        self._log_scalars(result, step, '/eval')
+
+    def log_zeroshot(self, result, step):
+        """Log zero-shot scalars under ``<key>_acc/eval``."""
+        self._log_scalars(result, step, '_acc/eval')
--- a/examples/community/roberta/pretraining/utils/exp_util.py
+++ b/examples/community/roberta/pretraining/utils/exp_util.py
@@ -0,0 +1,114 @@
+import functools
+import os
+import shutil
+
+import psutil
+import torch
+
+from colossalai.core import global_context as gpc
+
+
+def logging(s, log_path, print_=True, log_=True):
+    """Emit message ``s`` to stdout and/or append it to a log file.
+
+    Args:
+        s: message string; a newline is appended when writing to the file.
+        log_path: file to append to; only opened when ``log_`` is true.
+        print_: print the message to stdout.
+        log_: append the message to ``log_path``.
+    """
+    if print_:
+        print(s)
+    if not log_:
+        return
+    # 'a+' appends and creates the file when it does not exist yet.
+    with open(log_path, 'a+') as handle:
+        handle.write(s + '\n')
+
+
+def get_logger(log_path, **kwargs):
+    """Return ``logging`` pre-bound to ``log_path`` and any extra keyword defaults."""
+    bound_kwargs = {'log_path': log_path, **kwargs}
+    return functools.partial(logging, **bound_kwargs)
+
+
+def create_exp_dir(dir_path, scripts_to_save=None, debug=False):
+    """Create an experiment directory and return a logger bound to its log file.
+
+    Args:
+        dir_path: experiment directory; created (with parents) when missing.
+        scripts_to_save: optional iterable of file paths copied into
+            ``<dir_path>/scripts``.
+        debug: when true nothing is created and the returned logger never
+            writes to a file.
+
+    Returns:
+        A callable wrapping ``logging``; outside debug mode it appends to
+        ``<dir_path>/log.txt``.
+    """
+    if debug:
+        print('Debug Mode : no experiment dir created')
+        return functools.partial(logging, log_path=None, log_=False)
+
+    # exist_ok replaces the exists()-then-makedirs() pair, which could fail
+    # if another process created the directory between the two calls.
+    os.makedirs(dir_path, exist_ok=True)
+
+    print('Experiment dir : {}'.format(dir_path))
+    if scripts_to_save is not None:
+        script_dir = os.path.join(dir_path, 'scripts')
+        os.makedirs(script_dir, exist_ok=True)
+        for script in scripts_to_save:
+            dst_file = os.path.join(script_dir, os.path.basename(script))
+            shutil.copyfile(script, dst_file)
+
+    return get_logger(log_path=os.path.join(dir_path, 'log.txt'))
+
+
+def get_cpu_mem():
+    """Return the RSS reported by ``psutil.Process()``, divided by 1024**2."""
+    rss_bytes = psutil.Process().memory_info().rss
+    return rss_bytes / 1024**2
+
+
+def get_gpu_mem():
+    """Return ``torch.cuda.memory_allocated()`` divided by 1024**2."""
+    allocated_bytes = torch.cuda.memory_allocated()
+    return allocated_bytes / 1024**2
+
+
+def get_mem_info(prefix=''):
+    """Return a one-line GPU/CPU memory usage summary, prefixed by ``prefix``."""
+    gpu_mb = get_gpu_mem()
+    cpu_mb = get_cpu_mem()
+    return f'{prefix}GPU memory usage: {gpu_mb:.2f} MB, CPU memory usage: {cpu_mb:.2f} MB'
+
+
+def get_tflops(model_numel, batch_size, seq_len, step_time):
+    """Return ``8 * model_numel * batch_size * seq_len / 1e12`` per unit of ``step_time``."""
+    # The epsilon keeps the division defined when step_time is 0.
+    safe_step_time = step_time + 1e-12
+    total_flops = model_numel * batch_size * seq_len * 8
+    return total_flops / 1e12 / safe_step_time
+
+
+def get_parameters_in_billions(model, world_size=1):
+    """Return the total parameter count of ``model`` times ``world_size``, in billions.
+
+    Args:
+        model: iterable of modules, each exposing ``parameters()``.
+        world_size: multiplier applied to the summed element count.
+    """
+
+    def _numel(param):
+        # Parameters carrying a ``ds_id`` attribute report their size through
+        # ``ds_numel`` (presumably DeepSpeed-partitioned tensors - TODO confirm).
+        return param.ds_numel if hasattr(param, 'ds_id') else param.nelement()
+
+    total_numel = sum(_numel(param) for module in model for param in module.parameters())
+    return total_numel * world_size / (1e9)
+
+
+def throughput_calculator(numel, args, config, iteration_time, total_iterations, world_size=1):
+    """Compute samples/second and TFLOPs for a transformer training run.
+
+    Args:
+        numel: parameter-count figure; returned unchanged as the third result.
+        args: object providing ``train_micro_batch_size_per_gpu``,
+            ``max_seq_length`` and ``checkpoint_activations``.
+        config: object providing ``hidden_size``, ``num_hidden_layers`` and
+            ``vocab_size``.
+        iteration_time: total time spent over ``total_iterations`` iterations
+            (presumably seconds - confirm against the caller).
+        total_iterations: number of iterations covered by ``iteration_time``.
+        world_size: kept for interface compatibility; it does not influence
+            any returned value.
+
+    Returns:
+        ``(samples_per_second, tflops, approx_parameters_in_billions)``.
+    """
+    batch_size = args.train_micro_batch_size_per_gpu
+    seq_len = args.max_seq_length
+    elapsed_time_per_iter = iteration_time / total_iterations
+    # NOTE(review): derived from the per-GPU micro batch only - confirm that a
+    # world_size-scaled rate is not what callers expect.
+    samples_per_second = batch_size / elapsed_time_per_iter
+
+    # FLOPs calculator inputs
+    hidden_size = config.hidden_size
+    num_layers = config.num_hidden_layers
+    vocab_size = config.vocab_size
+
+    # General TFLOPs formula (borrowed from Equation 3 in Section 5.1 of
+    # https://arxiv.org/pdf/2104.04473.pdf).
+    # The factor of 4 is when used with activation check-pointing,
+    # otherwise it will be 3.
+    checkpoint_activations_factor = 4 if args.checkpoint_activations else 3
+    flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * seq_len * num_layers *
+                           (hidden_size**2)) * (1. + (seq_len / (6. * hidden_size)) +
+                                                (vocab_size / (16. * num_layers * hidden_size)))
+    tflops = flops_per_iteration / (elapsed_time_per_iter * (10**12))
+    return samples_per_second, tflops, numel
+
+
+def synchronize():
+    """Block on a ``torch.distributed`` barrier; no-op outside multi-process runs.
+
+    Returns immediately when the distributed package is unavailable, when no
+    process group has been initialized, or when the world size is 1.
+    """
+    dist = torch.distributed
+    # ``is_initialized`` was previously misspelled, which raised
+    # AttributeError as soon as the distributed package was available.
+    if not dist.is_available() or not dist.is_initialized():
+        return
+    if dist.get_world_size() == 1:
+        return
+    dist.barrier()
+
+
+def log_args(logger, args):
+    """Log every attribute of ``args`` followed by every ``gpc.config`` entry.
+
+    Each entry is rendered as ``<key left-aligned to 30 chars>: <value>`` and
+    the whole listing is framed by two banner lines.
+    """
+
+    def _format(pairs):
+        return '\n'.join(f'{key:<30}: {value}' for key, value in pairs)
+
+    banner = '--------args----------'
+    logger.info(banner)
+    logger.info(_format(vars(args).items()) + '\n' + _format(gpc.config.items()))
+    logger.info(banner + '\n')
--- a/examples/community/roberta/pretraining/utils/global_vars.py
+++ b/examples/community/roberta/pretraining/utils/global_vars.py
@@ -0,0 +1,130 @@
+import time
+
+import torch
+
+from .WandbLog import TensorboardLog
+
+# Module-level singletons. They start as None and are assigned by
+# _set_timers() / _set_tensorboard_writer(), which assert they are still unset.
+_GLOBAL_TIMERS = None
+_GLOBAL_TENSORBOARD_WRITER = None
+
+
+def set_global_variables(launch_time, tensorboard_path):
+    """Initialize the module-level timers and tensorboard writer.
+
+    Both arguments are forwarded to ``_set_tensorboard_writer``. Each setter
+    asserts that its global is still unset, so calling this a second time
+    fails with an AssertionError.
+    """
+    _set_timers()
+    _set_tensorboard_writer(launch_time, tensorboard_path)
+
+
+def _set_timers():
+    """Create the global ``Timers`` instance; asserts that none exists yet."""
+    global _GLOBAL_TIMERS
+    _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
+    _GLOBAL_TIMERS = Timers()
+
+
+def _set_tensorboard_writer(launch_time, tensorboard_path):
+    """Create the global tensorboard writer on rank 0.
+
+    Args:
+        launch_time: appended to ``tensorboard_path`` as a sub-directory and
+            also passed to ``TensorboardLog`` as its name.
+        tensorboard_path: base directory for tensorboard logs.
+
+    On ranks other than 0 the global stays ``None``. When no process group
+    is initialized the current process is treated as rank 0.
+    """
+    global _GLOBAL_TENSORBOARD_WRITER
+    _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER, 'tensorboard writer')
+    # get_rank() needs an initialized process group; fall back to "this is
+    # the only process", mirroring the is_initialized() check in Timers.log.
+    dist = torch.distributed
+    is_distributed = dist.is_available() and dist.is_initialized()
+    if not is_distributed or dist.get_rank() == 0:
+        _GLOBAL_TENSORBOARD_WRITER = TensorboardLog(tensorboard_path + f'/{launch_time}', launch_time)
+
+
+def get_timers():
+    """Return the global ``Timers``; asserts that it has been initialized."""
+    _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
+    return _GLOBAL_TIMERS
+
+
+def get_tensorboard_writer():
+    """Return the global tensorboard writer.
+
+    The value may be ``None`` (the writer is only created on rank 0), so no
+    initialization check is performed here; callers must handle ``None``.
+    """
+    return _GLOBAL_TENSORBOARD_WRITER
+
+
+def _ensure_var_is_initialized(var, name):
+    """Assert that ``var`` has been set, i.e. is not ``None``."""
+    assert var is not None, f'{name} is not initialized.'
+
+
+def _ensure_var_is_not_initialized(var, name):
+    """Assert that ``var`` has not been set yet, i.e. is still ``None``."""
+    assert var is None, f'{name} is already initialized.'
+
+
+class _Timer:
+    """Accumulating wall-clock timer that synchronizes CUDA around each measurement."""
+
+    def __init__(self, name):
+        self.name_ = name
+        # Seconds accumulated over all completed start/stop intervals.
+        self.elapsed_ = 0.0
+        self.started_ = False
+        self.start_time = time.time()
+
+    @staticmethod
+    def _sync():
+        """Wait for queued CUDA work so it is counted in the right interval."""
+        # Guarded so the timer also works on hosts without CUDA, where
+        # torch.cuda.synchronize() would raise.
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+    def start(self):
+        """Start the timer.
+
+        Starting a timer that is already running is allowed; it simply moves
+        the start point to now.
+        """
+        self._sync()
+        self.start_time = time.time()
+        self.started_ = True
+
+    def stop(self):
+        """Stop the timer and add the interval since ``start`` to the total."""
+        assert self.started_, 'timer is not started'
+        self._sync()
+        self.elapsed_ += (time.time() - self.start_time)
+        self.started_ = False
+
+    def reset(self):
+        """Clear the accumulated time and mark the timer as stopped."""
+        self.elapsed_ = 0.0
+        self.started_ = False
+
+    def elapsed(self, reset=True):
+        """Return the accumulated seconds, optionally resetting the total.
+
+        A running timer is stopped to include the in-flight interval and is
+        restarted before returning.
+        """
+        was_running = self.started_
+        # If the timing is in progress, end it first.
+        if was_running:
+            self.stop()
+        total = self.elapsed_
+        if reset:
+            self.reset()
+        # If timing was in progress, set it back.
+        if was_running:
+            self.start()
+        return total
+
+
+class Timers:
+    """Registry of named ``_Timer`` objects, created lazily on first access."""
+
+    def __init__(self):
+        self.timers = {}
+
+    def __call__(self, name):
+        """Return the timer registered as ``name``, creating it on first use."""
+        try:
+            return self.timers[name]
+        except KeyError:
+            timer = self.timers[name] = _Timer(name)
+            return timer
+
+    def write(self, names, writer, iteration, normalizer=1.0, reset=False):
+        """Write the named timers to a tensorboard writer as ``<name>-time`` scalars."""
+        # torch.utils.add_scalars would make each timer its own run, which
+        # pollutes the runs list, so each value is added as a plain scalar.
+        assert normalizer > 0.0
+        for name in names:
+            seconds = self.timers[name].elapsed(reset=reset)
+            writer.add_scalar(name + '-time', seconds / normalizer, iteration)
+
+    def log(self, names, normalizer=1.0, reset=True):
+        """Print the named timers, in milliseconds, on a single line.
+
+        When a process group is initialized only the last rank prints;
+        otherwise the current process prints.
+        """
+        assert normalizer > 0.0
+        pieces = ['time (ms)']
+        for name in names:
+            millis = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer
+            pieces.append(' | {}: {:.2f}'.format(name, millis))
+        string = ''.join(pieces)
+        dist = torch.distributed
+        if not dist.is_initialized() or dist.get_rank() == (dist.get_world_size() - 1):
+            print(string, flush=True)
--- a/examples/community/roberta/pretraining/utils/logger.py
+++ b/examples/community/roberta/pretraining/utils/logger.py
@@ -0,0 +1,30 @@
+import logging
+import os
+
+import torch.distributed as dist
+
+# Configure logging once at import time: INFO level, with timestamp, level
+# and logger name in front of each message.
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class Logger():
+    """Logger that mirrors messages to the ``logging`` module and to a file.
+
+    When ``cuda`` is true, ``info`` only emits on the process whose
+    ``dist.get_rank()`` is 0. ``debug`` is stored but not read by this class.
+    """
+
+    def __init__(self, log_path, cuda=False, debug=False):
+        self.log_path = log_path
+        self.cuda = cuda
+        self.debug = debug
+        self.logger = logging.getLogger(__name__)
+
+    def info(self, message, log_=True, print_=True, *args, **kwargs):
+        """Emit ``message`` at INFO level and/or append it to ``log_path``.
+
+        Args:
+            message: message string; a newline is appended in the file.
+            log_: append the message to ``self.log_path``.
+            print_: forward the message (with ``*args``/``**kwargs``) to the
+                underlying ``logging`` logger.
+        """
+        # With cuda enabled, every rank except 0 stays silent.
+        if self.cuda and dist.get_rank() != 0:
+            return
+
+        if print_:
+            self.logger.info(message, *args, **kwargs)
+
+        if log_:
+            with open(self.log_path, 'a+') as handle:
+                handle.write(message + '\n')
+
+    def error(self, message, *args, **kwargs):
+        """Forward ``message`` to the underlying logger at ERROR level."""
+        self.logger.error(message, *args, **kwargs)