mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-02 09:38:05 +00:00
[example] reorganize for community examples (#3557)
This commit is contained in:
42
examples/community/roberta/pretraining/utils/WandbLog.py
Normal file
42
examples/community/roberta/pretraining/utils/WandbLog.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import os
|
||||
import time
|
||||
|
||||
import wandb
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
|
||||
class WandbLog:
    """Thin convenience wrapper around the module-level ``wandb`` API."""

    @classmethod
    def init_wandb(cls, project, notes=None, name=None, config=None):
        """Start a wandb run.

        Args:
            project: Forwarded to ``wandb.init`` as ``project``.
            notes: Forwarded to ``wandb.init`` as ``notes``.
            name: Run name. When ``None`` a local ``%Y-%m-%d %H:%M:%S``
                timestamp is generated at call time.
            config: Forwarded to ``wandb.init`` as ``config``.
        """
        # The timestamp must be taken here, not in the signature: a default
        # argument is evaluated once at import, which would stamp every run
        # with the module import time.
        if name is None:
            name = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        wandb.init(project=project, notes=notes, name=name, config=config)

    @classmethod
    def log(cls, result, model=None, gradient=None):
        """Log ``result`` and optionally register objects with ``wandb.watch``.

        Args:
            result: Passed unchanged to ``wandb.log``.
            model: If given, passed to ``wandb.watch``.
            gradient: If given, also passed to ``wandb.watch``.
        """
        wandb.log(result)

        # Compare against None explicitly so that objects whose truth value
        # is False (e.g. an empty container module) are not silently skipped.
        if model is not None:
            wandb.watch(model)

        if gradient is not None:
            wandb.watch(gradient)
|
||||
|
||||
|
||||
class TensorboardLog:
    """Writes dictionaries of scalars to a tensorboard ``SummaryWriter``."""

    def __init__(self, location, name=None, config=None):
        """Create the log directory (if needed) and open a writer on it.

        Args:
            location: Directory handed to ``SummaryWriter``; missing parent
                directories are created as well.
            name: Passed to ``SummaryWriter`` as ``comment``. When ``None`` a
                local ``%Y-%m-%d %H:%M:%S`` timestamp is generated at call time.
            config: Accepted for interface compatibility; not used here.
        """
        # Resolve the timestamp per call; a default in the signature would be
        # frozen at import time.
        if name is None:
            name = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # makedirs handles nested paths and tolerates the directory already
        # existing (e.g. created concurrently by another process).
        os.makedirs(location, exist_ok=True)
        self.writer = SummaryWriter(location, comment=name)

    def _log_scalars(self, result, suffix, step):
        """Write every ``key -> value`` of ``result`` under the tag ``key + suffix``."""
        for key, value in result.items():
            self.writer.add_scalar(f'{key}{suffix}', value, step)

    def log_train(self, result, step):
        """Log each entry of ``result`` as ``<key>/train`` at ``step``."""
        self._log_scalars(result, '/train', step)

    def log_eval(self, result, step):
        """Log each entry of ``result`` as ``<key>/eval`` at ``step``."""
        self._log_scalars(result, '/eval', step)

    def log_zeroshot(self, result, step):
        """Log each entry of ``result`` as ``<key>_acc/eval`` at ``step``."""
        self._log_scalars(result, '_acc/eval', step)
|
114
examples/community/roberta/pretraining/utils/exp_util.py
Normal file
114
examples/community/roberta/pretraining/utils/exp_util.py
Normal file
@@ -0,0 +1,114 @@
|
||||
import functools
|
||||
import os
|
||||
import shutil
|
||||
|
||||
import psutil
|
||||
import torch
|
||||
|
||||
from colossalai.core import global_context as gpc
|
||||
|
||||
|
||||
def logging(s, log_path, print_=True, log_=True):
    """Print a message and/or append it as one line to a log file.

    Args:
        s: Message string.
        log_path: File to append to; only opened when ``log_`` is true.
        print_: When true, print ``s`` to stdout.
        log_: When true, append ``s`` plus a newline to ``log_path``.
    """
    if print_:
        print(s)
    if log_:
        # Explicit UTF-8 so non-ASCII messages are written identically on
        # every platform instead of depending on the locale's default codec.
        with open(log_path, 'a', encoding='utf-8') as f_log:
            f_log.write(s + '\n')
|
||||
|
||||
|
||||
def get_logger(log_path, **kwargs):
    """Return ``logging`` with ``log_path`` and any extra keyword arguments pre-bound."""
    bound_logger = functools.partial(logging, log_path=log_path, **kwargs)
    return bound_logger
|
||||
|
||||
|
||||
def create_exp_dir(dir_path, scripts_to_save=None, debug=False):
    """Create an experiment directory and return a logging callable for it.

    Args:
        dir_path: Directory to create (parents included).
        scripts_to_save: Optional iterable of file paths; each is copied into
            ``<dir_path>/scripts`` under its base name.
        debug: When true nothing is created on disk and the returned callable
            only prints.

    Returns:
        A partial of ``logging``: print-only in debug mode, otherwise bound to
        ``<dir_path>/log.txt``.
    """
    if debug:
        print('Debug Mode : no experiment dir created')
        return functools.partial(logging, log_path=None, log_=False)

    # exist_ok avoids the check-then-create race when several processes
    # start with the same experiment directory.
    os.makedirs(dir_path, exist_ok=True)

    print('Experiment dir : {}'.format(dir_path))
    if scripts_to_save is not None:
        script_path = os.path.join(dir_path, 'scripts')
        os.makedirs(script_path, exist_ok=True)
        for script in scripts_to_save:
            dst_file = os.path.join(script_path, os.path.basename(script))
            shutil.copyfile(script, dst_file)

    return get_logger(log_path=os.path.join(dir_path, 'log.txt'))
|
||||
|
||||
|
||||
def get_cpu_mem():
    """Return the ``rss`` memory figure of ``psutil.Process()`` divided by 1024**2 (MiB)."""
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / 1024**2
|
||||
|
||||
|
||||
def get_gpu_mem():
    """Return ``torch.cuda.memory_allocated()`` divided by 1024**2 (MiB)."""
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / 1024**2
|
||||
|
||||
|
||||
def get_mem_info(prefix=''):
    """Return a one-line summary of GPU and CPU memory usage in MB, preceded by ``prefix``."""
    gpu_mb = get_gpu_mem()
    cpu_mb = get_cpu_mem()
    return f'{prefix}GPU memory usage: {gpu_mb:.2f} MB, CPU memory usage: {cpu_mb:.2f} MB'
|
||||
|
||||
|
||||
def get_tflops(model_numel, batch_size, seq_len, step_time):
    """Estimate TFLOPS for one step.

    Computes ``model_numel * batch_size * seq_len * 8`` operations, scales to
    tera (1e12) and divides by ``step_time``; a 1e-12 epsilon is added to the
    time so a zero ``step_time`` does not divide by zero.
    """
    # NOTE(review): the factor 8 is taken as-is from the original formula.
    operations = model_numel * batch_size * seq_len * 8
    safe_step_time = step_time + 1e-12
    return operations / 1e12 / safe_step_time
|
||||
|
||||
|
||||
def get_parameters_in_billions(model, world_size=1):
    """Count parameters over an iterable of modules, in billions.

    Args:
        model: Iterable of objects exposing ``parameters()``.
        world_size: Multiplier applied to the summed element count.

    Returns:
        ``total_elements * world_size / 1e9``.
    """
    total_elements = 0
    for model_module in model:
        for param in model_module.parameters():
            # Parameters carrying a ``ds_id`` attribute report their size via
            # ``ds_numel``; everything else uses ``nelement()``.
            # NOTE(review): presumably DeepSpeed-partitioned params - confirm.
            if hasattr(param, 'ds_id'):
                total_elements += param.ds_numel
            else:
                total_elements += param.nelement()

    return total_elements * world_size / (1e9)
|
||||
|
||||
|
||||
def throughput_calculator(numel, args, config, iteration_time, total_iterations, world_size=1):
    """Compute samples/second and TFLOPS from timing information.

    Args:
        numel: Returned unchanged as the third element of the result.
        args: Object providing ``train_micro_batch_size_per_gpu``,
            ``max_seq_length`` and ``checkpoint_activations``.
        config: Object providing ``hidden_size``, ``num_hidden_layers`` and
            ``vocab_size``.
        iteration_time: Total time spent over ``total_iterations``.
        total_iterations: Number of iterations ``iteration_time`` covers.
        world_size: Kept for interface compatibility; it does not influence
            any returned value.

    Returns:
        Tuple ``(samples_per_second, tflops, numel)``.
    """
    batch_size = args.train_micro_batch_size_per_gpu
    seq_length = args.max_seq_length
    elapsed_time_per_iter = iteration_time / total_iterations
    samples_per_second = batch_size / elapsed_time_per_iter

    # flops calculator
    hidden_size = config.hidden_size
    num_layers = config.num_hidden_layers
    vocab_size = config.vocab_size

    # General TFLOPs formula (borrowed from Equation 3 in Section 5.1 of
    # https://arxiv.org/pdf/2104.04473.pdf).
    # The factor of 4 is when used with activation check-pointing,
    # otherwise it will be 3.
    checkpoint_activations_factor = 4 if args.checkpoint_activations else 3
    flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * seq_length * num_layers *
                           (hidden_size**2)) * (1. + (seq_length / (6. * hidden_size)) +
                                                (vocab_size / (16. * num_layers * hidden_size)))
    tflops = flops_per_iteration / (elapsed_time_per_iter * (10**12))
    return samples_per_second, tflops, numel
|
||||
|
||||
|
||||
def synchronize():
    """Block on a ``torch.distributed`` barrier when running multi-process.

    Returns immediately when the distributed package is unavailable, the
    process group has not been initialized, or the world size is 1.
    """
    if not torch.distributed.is_available():
        return
    # Fixed spelling: the original called the non-existent ``is_intialized``,
    # which raised AttributeError before any barrier could be reached.
    if not torch.distributed.is_initialized():
        return
    if torch.distributed.get_world_size() == 1:
        return
    torch.distributed.barrier()
|
||||
|
||||
|
||||
def log_args(logger, args):
    """Log every attribute of ``args`` and every entry of ``gpc.config``.

    Each pair is rendered as ``<key padded to 30 columns>: <value>``; the
    listing is wrapped between two banner lines, all sent via ``logger.info``.
    """
    logger.info('--------args----------')

    arg_lines = '\n'.join(f'{key:<30}: {value}' for key, value in vars(args).items())
    config_lines = '\n'.join(f'{key:<30}: {value}' for key, value in gpc.config.items())
    logger.info(arg_lines + '\n' + config_lines)

    logger.info('--------args----------\n')
|
130
examples/community/roberta/pretraining/utils/global_vars.py
Normal file
130
examples/community/roberta/pretraining/utils/global_vars.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from .WandbLog import TensorboardLog
|
||||
|
||||
# Module-wide Timers instance; stays None until _set_timers() assigns it.
_GLOBAL_TIMERS = None
|
||||
# Module-wide TensorboardLog; assigned by _set_tensorboard_writer() on rank 0 only, otherwise left as None.
_GLOBAL_TENSORBOARD_WRITER = None
|
||||
|
||||
|
||||
def set_global_variables(launch_time, tensorboard_path):
    """Initialize the module-level timers, then the module-level tensorboard writer."""
    _set_timers()
    _set_tensorboard_writer(launch_time=launch_time, tensorboard_path=tensorboard_path)
|
||||
|
||||
|
||||
def _set_timers():
    """Create the global ``Timers`` group; asserts that none exists yet."""
    global _GLOBAL_TIMERS
    _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
    fresh_timers = Timers()
    _GLOBAL_TIMERS = fresh_timers
|
||||
|
||||
|
||||
def _set_tensorboard_writer(launch_time, tensorboard_path):
    """Create the global tensorboard writer on distributed rank 0.

    Asserts that no writer exists yet. On every other rank the global is
    left as ``None``.
    """
    global _GLOBAL_TENSORBOARD_WRITER
    _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER, 'tensorboard writer')
    if torch.distributed.get_rank() != 0:
        return
    # Each launch writes into its own sub-directory named after launch_time.
    run_dir = tensorboard_path + f'/{launch_time}'
    _GLOBAL_TENSORBOARD_WRITER = TensorboardLog(run_dir, launch_time)
|
||||
|
||||
|
||||
def get_timers():
    """Return the global timers; asserts that they have been initialized."""
    timers = _GLOBAL_TIMERS
    _ensure_var_is_initialized(timers, 'timers')
    return timers
|
||||
|
||||
|
||||
def get_tensorboard_writer():
    """Return the global tensorboard writer.

    The value may legitimately be ``None``, so no initialization check is
    performed.
    """
    writer = _GLOBAL_TENSORBOARD_WRITER
    return writer
|
||||
|
||||
|
||||
def _ensure_var_is_initialized(var, name):
|
||||
"""Make sure the input variable is not None."""
|
||||
assert var is not None, '{} is not initialized.'.format(name)
|
||||
|
||||
|
||||
def _ensure_var_is_not_initialized(var, name):
|
||||
"""Make sure the input variable is not None."""
|
||||
assert var is None, '{} is already initialized.'.format(name)
|
||||
|
||||
|
||||
class _Timer:
|
||||
"""Timer."""
|
||||
|
||||
def __init__(self, name):
|
||||
self.name_ = name
|
||||
self.elapsed_ = 0.0
|
||||
self.started_ = False
|
||||
self.start_time = time.time()
|
||||
|
||||
def start(self):
|
||||
"""Start the timer."""
|
||||
# assert not self.started_, 'timer has already been started'
|
||||
torch.cuda.synchronize()
|
||||
self.start_time = time.time()
|
||||
self.started_ = True
|
||||
|
||||
def stop(self):
|
||||
"""Stop the timer."""
|
||||
assert self.started_, 'timer is not started'
|
||||
torch.cuda.synchronize()
|
||||
self.elapsed_ += (time.time() - self.start_time)
|
||||
self.started_ = False
|
||||
|
||||
def reset(self):
|
||||
"""Reset timer."""
|
||||
self.elapsed_ = 0.0
|
||||
self.started_ = False
|
||||
|
||||
def elapsed(self, reset=True):
|
||||
"""Calculate the elapsed time."""
|
||||
started_ = self.started_
|
||||
# If the timing in progress, end it first.
|
||||
if self.started_:
|
||||
self.stop()
|
||||
# Get the elapsed time.
|
||||
elapsed_ = self.elapsed_
|
||||
# Reset the elapsed time
|
||||
if reset:
|
||||
self.reset()
|
||||
# If timing was in progress, set it back.
|
||||
if started_:
|
||||
self.start()
|
||||
return elapsed_
|
||||
|
||||
|
||||
class Timers:
    """Named collection of timers, created lazily on first access."""

    def __init__(self):
        self.timers = {}

    def __call__(self, name):
        """Return the timer registered under ``name``, creating it if missing."""
        timer = self.timers.get(name)
        if timer is None:
            timer = _Timer(name)
            self.timers[name] = timer
        return timer

    def write(self, names, writer, iteration, normalizer=1.0, reset=False):
        """Write timers to a tensorboard writer.

        For each name the elapsed time divided by ``normalizer`` is written
        under the tag ``<name>-time`` at ``iteration``.
        """
        # add_scalars would make each timer its own run and pollute the runs
        # list, so every timer is written as an individual scalar instead.
        assert normalizer > 0.0
        for name in names:
            seconds = self.timers[name].elapsed(reset=reset)
            writer.add_scalar(name + '-time', seconds / normalizer, iteration)

    def log(self, names, normalizer=1.0, reset=True):
        """Print the named timers in milliseconds on a single line.

        When ``torch.distributed`` is initialized only the last rank prints;
        otherwise the line is always printed.
        """
        assert normalizer > 0.0
        pieces = ['time (ms)']
        for name in names:
            millis = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer
            pieces.append(' | {}: {:.2f}'.format(name, millis))
        string = ''.join(pieces)

        if torch.distributed.is_initialized():
            rank = torch.distributed.get_rank()
            last_rank = torch.distributed.get_world_size() - 1
            if rank != last_rank:
                return
        print(string, flush=True)
|
30
examples/community/roberta/pretraining/utils/logger.py
Normal file
30
examples/community/roberta/pretraining/utils/logger.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
import torch.distributed as dist
|
||||
|
||||
# Configure the root logger at import time: INFO level, with timestamp,
# level name and logger name in front of every message.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Logger():
    """Logger that mirrors ``info`` messages to a log file.

    With ``cuda=True`` only distributed rank 0 emits ``info`` messages;
    with ``cuda=False`` every caller does.
    """

    def __init__(self, log_path, cuda=False, debug=False):
        """Store the settings.

        Args:
            log_path: File that ``info`` appends messages to.
            cuda: When true, ``info`` consults ``dist.get_rank()`` and only
                rank 0 proceeds.
            debug: Stored on the instance; not read by this class.
        """
        self.logger = logging.getLogger(__name__)
        self.cuda = cuda
        self.log_path = log_path
        self.debug = debug

    def info(self, message, log_=True, print_=True, *args, **kwargs):
        """Emit ``message`` at INFO level and/or append it to the log file.

        Args:
            message: Message string.
            log_: When true, append ``message`` plus a newline to ``log_path``.
            print_: When true, forward to the underlying logger's ``info``.
            *args, **kwargs: Passed through to the underlying logger.
        """
        # Same condition as "(cuda and rank == 0) or not cuda": the rank is
        # only queried when cuda is set, and non-zero ranks stay silent.
        if self.cuda and dist.get_rank() != 0:
            return

        if print_:
            self.logger.info(message, *args, **kwargs)

        if log_:
            # Explicit UTF-8 so non-ASCII messages do not depend on the
            # platform's default encoding.
            with open(self.log_path, 'a', encoding='utf-8') as f_log:
                f_log.write(message + '\n')

    def error(self, message, *args, **kwargs):
        """Forward ``message`` to the underlying logger at ERROR level."""
        self.logger.error(message, *args, **kwargs)
|
Reference in New Issue
Block a user