Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-02 17:46:42 +00:00
[legacy] clean up legacy code (#4743)
* [legacy] remove outdated codes of pipeline (#4692)
* [legacy] remove cli of benchmark and update optim (#4690)
* [legacy] remove cli of benchmark and update optim
* [doc] fix cli doc test
* [legacy] fix engine clip grad norm
* [legacy] remove outdated colo tensor (#4694)
* [legacy] remove outdated colo tensor
* [test] fix test import
* [legacy] move outdated zero to legacy (#4696)
* [legacy] clean up utils (#4700)
* [legacy] clean up utils
* [example] update examples
* [legacy] clean up amp
* [legacy] fix amp module
* [legacy] clean up gpc (#4742)
* [legacy] clean up context
* [legacy] clean core, constants and global vars
* [legacy] refactor initialize
* [example] fix examples ci
* [example] fix examples ci
* [legacy] fix tests
* [example] fix gpt example
* [example] fix examples ci
* [devops] fix ci installation
* [example] fix examples ci
@@ -1,28 +0,0 @@
import click

from colossalai.context import Config

from .benchmark import run_benchmark
from .utils import *

__all__ = ['benchmark']


@click.command()
@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.")
@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.")
@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.")
@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.")
@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.")
@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.")
@click.option("-l", "--layers", type=int, default=2)
@click.option("-m",
              "--model",
              type=click.Choice(['mlp'], case_sensitive=False),
              default='mlp',
              help="Select the model to benchmark, currently only supports MLP")
def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int,
              layers: int, model: str):
    args_dict = locals()
    args = Config(args_dict)
    run_benchmark(args)
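The command above only packs its options into a Config and hands it to run_benchmark. As a rough programmatic sketch (not part of the diff; the import path via the package __init__ and the --gpus value are assumptions, the remaining values mirror the CLI defaults):

# Rough sketch, not part of the diff: call the benchmark entry point without the CLI.
from colossalai.context import Config
from colossalai.cli.benchmark import run_benchmark

args = Config(dict(
    gpus=4,               # -g / --gpus: total number of devices to use (no default, assumed here)
    batch_size=8,         # -b / --batch_size
    seq_len=512,          # -s / --seq_len
    dimension=1024,       # -d / --dimension
    warmup_steps=10,      # -w / --warmup_steps
    profile_steps=50,     # -p / --profile_steps
    layers=2,             # -l / --layers
    model='mlp',          # -m / --model: only 'mlp' is supported
))
run_benchmark(args)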
@@ -1,105 +0,0 @@
from functools import partial
from typing import Dict, List

import click
import torch.multiprocessing as mp

import colossalai
from colossalai.cli.benchmark.utils import find_all_configs, get_batch_data, profile_model
from colossalai.context import Config
from colossalai.context.random import reset_seeds
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.testing import free_port
from colossalai.utils import MultiTimer

from .models import MLP


def run_benchmark(args: Config) -> None:
    """
    Run benchmarking with torch.multiprocessing.
    """

    # sanity checks
    if args.gpus is None:
        click.echo("Error: --num_gpus is not given")
        exit()
    if args.gpus <= 1:
        click.echo("Warning: tensor parallel will be activated with at least 2 devices.")

    click.echo("=== Benchmarking Parameters ===")
    for k, v in args.items():
        click.echo(f'{k}: {v}')
    click.echo('')

    config_list = find_all_configs(args.gpus)

    avail_ports = [free_port() for _ in range(len(config_list))]
    run_func = partial(run_dist_profiling,
                       world_size=args.gpus,
                       port_list=avail_ports,
                       config_list=config_list,
                       hyperparams=args)
    mp.spawn(run_func, nprocs=args.gpus)


def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict],
                       hyperparams: Config) -> None:
    """
    A function executed for profiling; this function should be spawned by torch.multiprocessing.

    Args:
        rank (int): rank of the process
        world_size (int): the number of processes
        port_list (List[int]): a list of free ports for initializing distributed networks
        config_list (List[Dict]): a list of configurations
        hyperparams (Config): the hyperparameters given by the user
    """

    # disable logging for clean output
    disable_existing_loggers()
    logger = get_dist_logger()
    logger.set_level('WARNING')

    for config, port in zip(config_list, port_list):
        colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
        timer = MultiTimer()

        # 1D parallel is skipped if in_features or out_features cannot be divided exactly by the 1D parallel size.
        if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
            click.echo(
                "1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size."
            )
            continue

        if hyperparams.model == 'mlp':
            model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
        else:
            if gpc.get_global_rank() == 0:
                click.echo("Error: Invalid argument for --model")
            exit()

        data_func = partial(get_batch_data,
                            dim=hyperparams.dimension,
                            batch_size=hyperparams.batch_size,
                            seq_length=hyperparams.seq_len,
                            mode=config.parallel.tensor.mode)

        fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model,
                                                                      warmup_steps=hyperparams.warmup_steps,
                                                                      profile_steps=hyperparams.profile_steps,
                                                                      data_func=data_func,
                                                                      timer=timer)

        gpc.destroy()
        reset_seeds()

        if gpc.get_global_rank() == 0:
            config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()])
            click.echo(f"=== {config_str} ===")
            click.echo(f"Average forward time: {fwd_time}")
            click.echo(f"Average backward time: {bwd_time}")
            click.echo(f"Max allocated GPU memory: {max_allocated}")
            click.echo(f"Max cached GPU memory: {max_cached}\n")
@@ -1,18 +0,0 @@
import torch

import colossalai.legacy.nn as col_nn


class MLP(torch.nn.Module):

    def __init__(self, dim: int, layers: int):
        super().__init__()
        self.layers = torch.nn.ModuleList()

        for _ in range(layers):
            self.layers.append(col_nn.Linear(dim, dim))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
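Shape-wise, the MLP above is just a stack of Linear layers applied to the last dimension, so a (batch_size, seq_len, dim) input comes back with the same shape. A minimal sketch, substituting plain torch.nn.Linear for col_nn.Linear so it runs without any distributed setup:

# Minimal shape check, assuming torch.nn.Linear stands in for col_nn.Linear in the
# non-parallel case: a (batch_size, seq_len, dim) input maps to the same shape.
import torch

dim, layers = 1024, 2
mlp = torch.nn.Sequential(*[torch.nn.Linear(dim, dim) for _ in range(layers)])

x = torch.rand(8, 512, dim)    # (batch_size, seq_len, dim), as produced by get_batch_data
y = mlp(x)
assert y.shape == x.shape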
@@ -1,159 +0,0 @@
import math
import time
from typing import Callable, Dict, List, Tuple

import torch

from colossalai.context import Config, ParallelMode
from colossalai.utils import MultiTimer


def get_time_stamp() -> int:
    """
    Return the time stamp for profiling.

    Returns:
        time_stamp (int): the time given by time.time()
    """

    torch.cuda.synchronize()
    time_stamp = time.time()
    return time_stamp


def get_memory_states() -> Tuple[float]:
    """
    Return the memory statistics.

    Returns:
        max_allocated (float): the maximum allocated CUDA memory in GB
        max_cached (float): the maximum cached (reserved) CUDA memory in GB
    """

    max_allocated = torch.cuda.max_memory_allocated() / (1024**3)
    max_cached = torch.cuda.max_memory_reserved() / (1024**3)
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()
    return max_allocated, max_cached


def find_all_configs(device_cnt: int) -> List[Dict]:
    """
    Find all possible configurations for tensor parallelism.

    Args:
        device_cnt (int): the number of devices

    Returns:
        config_list (List[Dict]): a list of configurations
    """

    def _is_square(num):
        # 2D parallel requires at least 2 devices.
        if num <= 1:
            return False
        return math.floor(math.sqrt(num))**2 == num

    def _is_cube(num):
        # 3D parallel requires at least 2 devices.
        if num <= 1:
            return False
        return math.floor(num**(1. / 3.))**3 == num

    config_list = []

    # add non-parallel config
    config = dict(parallel=dict(tensor=dict(size=device_cnt, mode=None)))
    config_list.append(config)

    # add 1D config
    config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
    config_list.append(config)

    # add 2D config only if device_cnt is a square
    if _is_square(device_cnt):
        config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
        config_list.append(config)

    # check for 2.5D by iterating over depth
    for depth in range(1, device_cnt):
        if device_cnt % depth == 0 and _is_square(device_cnt // depth):
            config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2.5d', depth=depth)))
            config_list.append(config)

    # check for 3D if device_cnt is a cube
    if _is_cube(device_cnt):
        config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='3d')))
        config_list.append(config)

    config_list = [Config(cfg) for cfg in config_list]
    return config_list


def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable,
                  timer: MultiTimer) -> Tuple[float]:
    """
    Profile the forward and backward passes of a model.

    Args:
        model (torch.nn.Module): a PyTorch model
        warmup_steps (int): the number of steps for warmup
        profile_steps (int): the number of steps for profiling
        data_func (Callable): a function to generate random data
        timer (colossalai.utils.MultiTimer): a timer instance for time recording

    Returns:
        fwd_time (float): the average time taken by the forward pass in seconds
        bwd_time (float): the average time taken by the backward pass in seconds
        max_allocated (float): the maximum GPU memory allocated in GB
        max_cached (float): the maximum GPU memory cached in GB
    """

    def _run_step(data):
        timer.start('forward')
        out = model(data)
        timer.stop('forward', keep_in_history=True)
        timer.start('backward')
        out.mean().backward()
        timer.stop('backward', keep_in_history=True)

    data_list = [data_func() for _ in range(warmup_steps)]
    for data in data_list:
        _run_step(data)
    timer.reset('forward')
    timer.reset('backward')

    for _ in range(profile_steps):
        data = data_func()
        _run_step(data)

    max_allocated, max_cached = get_memory_states()
    fwd_time = timer.get_timer('forward').get_history_mean()
    bwd_time = timer.get_timer('backward').get_history_mean()
    return fwd_time, bwd_time, max_allocated, max_cached


def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor:
    """
    Return random data of shape (batch_size, seq_length, dim) for profiling.

    Args:
        dim (int): hidden size
        batch_size (int): the number of data samples
        seq_length (int): the number of tokens
        mode (ParallelMode): Colossal-AI ParallelMode enum

    Returns:
        data (torch.Tensor): random data
    """

    if mode in ['2d', '2.5d']:
        batch_size = batch_size // 2
        dim = dim // 2
    elif mode == '3d':
        batch_size = batch_size // 4
        dim = dim // 2

    data = torch.rand(batch_size, seq_length, dim).cuda()
    return data
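For a concrete picture of what find_all_configs enumerates, here is a small usage sketch (assuming the legacy colossalai.cli.benchmark package is still importable); the expected layouts for 8 devices follow from the square/cube checks above:

# Usage sketch, not part of the diff: list the tensor-parallel layouts tried for 8 devices.
from colossalai.cli.benchmark.utils import find_all_configs

for cfg in find_all_configs(8):
    print(dict(cfg.parallel.tensor))

# Expected output (roughly): 8 is not a perfect square, so plain 2D is skipped;
# 8 = 2 * (2 x 2) gives one 2.5D layout with depth=2; 8 = 2**3 is a cube, so 3D is kept.
#   {'size': 8, 'mode': None}
#   {'size': 8, 'mode': '1d'}
#   {'size': 8, 'mode': '2.5d', 'depth': 2}
#   {'size': 8, 'mode': '3d'}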
@@ -1,6 +1,5 @@
import click

from .benchmark import benchmark
from .check import check
from .launcher import run

@@ -19,7 +18,6 @@ def cli():

cli.add_command(run)
cli.add_command(check)
cli.add_command(benchmark)

if __name__ == '__main__':
    cli()