[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692)

* [legacy] remove cli of benchmark and update optim (#4690)

* [legacy] remove cli of benchmark and update optim

* [doc] fix cli doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
Hongxin Liu
2023-09-18 16:31:06 +08:00
committed by GitHub
parent 32e7f99416
commit b5f9e37c70
342 changed files with 2919 additions and 4182 deletions

View File

@@ -1,28 +0,0 @@
import click

from colossalai.context import Config

from .benchmark import run_benchmark
from .utils import *

__all__ = ['benchmark']


@click.command()
@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.")
@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.")
@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.")
@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.")
@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.")
@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.")
@click.option("-l", "--layers", type=int, default=2)
@click.option("-m",
              "--model",
              type=click.Choice(['mlp'], case_sensitive=False),
              default='mlp',
              help="Select the model to benchmark, currently only supports MLP")
def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int,
              layers: int, model: str):
    args_dict = locals()
    args = Config(args_dict)
    run_benchmark(args)
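
Editor's note: the command above gathers its options with locals() and wraps them in a Config so they can later be read as attributes. A minimal sketch of that idiom with a hypothetical dict-backed stand-in for colossalai.context.Config (only attribute access is mimicked; this is not the removed code):

    # Hypothetical stand-in illustrating the locals() -> Config idiom above.
    class StubConfig(dict):
        __getattr__ = dict.__getitem__

    def benchmark_args(gpus=4, batch_size=8, seq_len=512, dimension=1024):
        args = StubConfig(locals())    # all keyword arguments become config fields
        return args

    args = benchmark_args()
    print(args.gpus, args.seq_len)     # 4 512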

View File

@@ -1,105 +0,0 @@
from functools import partial
from typing import Dict, List

import click
import torch.multiprocessing as mp

import colossalai
from colossalai.cli.benchmark.utils import find_all_configs, get_batch_data, profile_model
from colossalai.context import Config
from colossalai.context.random import reset_seeds
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.testing import free_port
from colossalai.utils import MultiTimer

from .models import MLP


def run_benchmark(args: Config) -> None:
    """
    Run benchmarking with torch.multiprocessing.
    """

    # sanity checks
    if args.gpus is None:
        click.echo("Error: --num_gpus is not given")
        exit()
    if args.gpus <= 1:
        click.echo("Warning: tensor parallel will be activated with at least 2 devices.")

    click.echo("=== Benchmarking Parameters ===")
    for k, v in args.items():
        click.echo(f'{k}: {v}')
    click.echo('')

    config_list = find_all_configs(args.gpus)

    avail_ports = [free_port() for _ in range(len(config_list))]
    run_func = partial(run_dist_profiling,
                       world_size=args.gpus,
                       port_list=avail_ports,
                       config_list=config_list,
                       hyperparams=args)
    mp.spawn(run_func, nprocs=args.gpus)


def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict],
                       hyperparams: Config) -> None:
    """
    A function executed for profiling, this function should be spawn by torch.multiprocessing.

    Args:
        rank (int): rank of the process
        world_size (int): the number of processes
        port_list (List[int]): a list of free ports for initializing distributed networks
        config_list (List[Dict]): a list of configuration
        hyperparams (Config): the hyperparameters given by the user
    """

    # disable logging for clean output
    disable_existing_loggers()
    logger = get_dist_logger()
    logger.set_level('WARNING')

    for config, port in zip(config_list, port_list):
        colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
        timer = MultiTimer()

        # 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size.
        if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
            click.echo(
                "1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size."
            )
            continue

        if hyperparams.model == 'mlp':
            model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
        else:
            if gpc.get_global_rank() == 0:
                click.echo("Error: Invalid argument for --model")
            exit()

        data_func = partial(get_batch_data,
                            dim=hyperparams.dimension,
                            batch_size=hyperparams.batch_size,
                            seq_length=hyperparams.seq_len,
                            mode=config.parallel.tensor.mode)
        fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model,
                                                                      warmup_steps=hyperparams.warmup_steps,
                                                                      profile_steps=hyperparams.profile_steps,
                                                                      data_func=data_func,
                                                                      timer=timer)

        gpc.destroy()
        reset_seeds()

        if gpc.get_global_rank() == 0:
            config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()])
            click.echo(f"=== {config_str} ===")
            click.echo(f"Average forward time: {fwd_time}")
            click.echo(f"Average backward time: {bwd_time}")
            click.echo(f"Max allocated GPU memory: {max_allocated}")
            click.echo(f"Max cached GPU memory: {max_cached}\n")

View File

@@ -1,18 +0,0 @@
import torch

import colossalai.legacy.nn as col_nn


class MLP(torch.nn.Module):

    def __init__(self, dim: int, layers: int):
        super().__init__()
        self.layers = torch.nn.ModuleList()

        for _ in range(layers):
            self.layers.append(col_nn.Linear(dim, dim))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
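
Editor's note: a quick single-device shape check of this MLP, with torch.nn.Linear substituted for col_nn.Linear since the legacy parallel layer needs a launched Colossal-AI context (a sketch, not part of the removed code):

    # Single-device sketch: same topology as the MLP above, torch layers only.
    import torch

    dim, layers = 1024, 2
    mlp = torch.nn.Sequential(*(torch.nn.Linear(dim, dim) for _ in range(layers)))
    x = torch.rand(8, 512, dim)    # (batch_size, seq_len, dim), as in get_batch_data
    print(mlp(x).shape)            # torch.Size([8, 512, 1024])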

View File

@@ -1,159 +0,0 @@
import math
import time
from typing import Callable, Dict, List, Tuple

import torch

from colossalai.context import Config, ParallelMode
from colossalai.utils import MultiTimer


def get_time_stamp() -> int:
    """
    Return the time stamp for profiling.

    Returns:
        time_stamp (int): the time given by time.time()
    """

    torch.cuda.synchronize()
    time_stamp = time.time()
    return time_stamp


def get_memory_states() -> Tuple[float]:
    """
    Return the memory statistics.

    Returns:
        max_allocated (float): the allocated CUDA memory
        max_cached (float): the cached CUDA memory
    """

    max_allocated = torch.cuda.max_memory_allocated() / (1024**3)
    max_cached = torch.cuda.max_memory_reserved() / (1024**3)
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()
    return max_allocated, max_cached


def find_all_configs(device_cnt: int) -> List[Dict]:
    """
    Find all possible configurations for tensor parallelism

    Args:
        device_cnt (int): the number of devices

    Returns:
        config_list (List[Dict]): a list of configurations
    """

    def _is_square(num):
        # 2D parallel should be implemented with at least 2 devices.
        if num <= 1:
            return False
        return math.floor(math.sqrt(num))**2 == num

    def _is_cube(num):
        # 3D parallel should be implemented with at least 2 devices.
        if num <= 1:
            return False
        return math.floor(num**(1. / 3.))**3 == num

    config_list = []

    # add non-parallel config
    config = dict(parallel=dict(tensor=dict(size=device_cnt, mode=None)))
    config_list.append(config)

    # add 1D config
    config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
    config_list.append(config)

    # add 2D config only if device_cnt is a square
    if _is_square(device_cnt):
        config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
        config_list.append(config)

    # check for 2.5D
    # iterate over depth
    for depth in range(1, device_cnt):
        if device_cnt % depth == 0 and _is_square(device_cnt // depth):
            config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2.5d', depth=depth)))
            config_list.append(config)

    # check for 3D if device_cnt is a cube
    if _is_cube(device_cnt):
        config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='3d')))
        config_list.append(config)

    config_list = [Config(cfg) for cfg in config_list]
    return config_list


def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable,
                  timer: MultiTimer) -> Tuple[float]:
    """
    Profile the forward and backward of a model

    Args:
        model (torch.nn.Module): a PyTorch model
        warmup_steps (int): the number of steps for warmup
        profile_steps (int): the number of steps for profiling
        data_func (Callable): a function to generate random data
        timer (colossalai.utils.Multitimer): a timer instance for time recording

    Returns:
        fwd_time (float): the average forward time taken by forward pass in second
        bwd_time (float): the average backward time taken by forward pass in second
        max_allocated (float): the maximum GPU memory allocated in GB
        max_cached (float): the maximum GPU memory cached in GB
    """

    def _run_step(data):
        timer.start('forward')
        out = model(data)
        timer.stop('forward', keep_in_history=True)
        timer.start('backward')
        out.mean().backward()
        timer.stop('backward', keep_in_history=True)

    data_list = [data_func() for _ in range(warmup_steps)]

    for data in data_list:
        _run_step(data)

    timer.reset('forward')
    timer.reset('backward')

    for _ in range(profile_steps):
        data = data_func()
        _run_step(data)

    max_allocated, max_cached = get_memory_states()
    fwd_time = timer.get_timer('forward').get_history_mean()
    bwd_time = timer.get_timer('backward').get_history_mean()

    return fwd_time, bwd_time, max_allocated, max_cached


def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor:
    """
    Return a random data of shape (batch_size, seq_length, dim) for profiling.

    Args:
        dim (int): hidden size
        batch_size (int): the number of data samples
        seq_length (int): the number of tokens
        mode (ParallelMode): Colossal-AI ParallelMode enum

    Returns:
        data (torch.Tensor): random data
    """

    if mode in ['2d', '2.5d']:
        batch_size = batch_size // 2
        dim = dim // 2
    elif mode == '3d':
        batch_size = batch_size // 4
        dim = dim // 2

    data = torch.rand(batch_size, seq_length, dim).cuda()
    return data
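
Editor's note: as a worked example of the enumeration above, for 4 devices find_all_configs yields the non-parallel, 1d, 2d, and 2.5d (depth=1) configurations, since 4 is a perfect square but not a cube. The standalone sketch below re-derives the mode list without importing Colossal-AI; the depth=... labels are illustrative, not the real Config objects:

    # Re-derivation of the tensor-parallel modes enumerated by find_all_configs.
    import math

    def tensor_parallel_modes(device_cnt: int) -> list:
        is_square = device_cnt > 1 and math.isqrt(device_cnt) ** 2 == device_cnt
        modes = [None, '1d']                    # always generated
        if is_square:
            modes.append('2d')
        for depth in range(1, device_cnt):      # 2.5d: depth x (square grid)
            quotient = device_cnt // depth
            if device_cnt % depth == 0 and quotient > 1 and math.isqrt(quotient) ** 2 == quotient:
                modes.append(f'2.5d(depth={depth})')
        # cube check (the original uses math.floor(num ** (1/3)); round is used
        # here to sidestep floating-point rounding for small device counts)
        if device_cnt > 1 and round(device_cnt ** (1 / 3)) ** 3 == device_cnt:
            modes.append('3d')
        return modes

    print(tensor_parallel_modes(4))     # [None, '1d', '2d', '2.5d(depth=1)']
    print(tensor_parallel_modes(16))    # [None, '1d', '2d', '2.5d(depth=1)', '2.5d(depth=4)']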

View File

@@ -1,6 +1,5 @@
import click

from .benchmark import benchmark
from .check import check
from .launcher import run
@@ -19,7 +18,6 @@ def cli():
cli.add_command(run)
cli.add_command(check)
cli.add_command(benchmark)

if __name__ == '__main__':
    cli()
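
Editor's note: after this change the benchmark import and registration are gone, and the CLI group registers only run and check. A rough sketch of the surviving registration; the lines between the two hunks are not shown in this diff, so the @click.group() body here is an assumption:

    # Assumed post-change shape of the CLI module; only the parts visible in the
    # two hunks above are certain.
    import click

    from .check import check
    from .launcher import run

    @click.group()
    def cli():
        pass

    cli.add_command(run)
    cli.add_command(check)

    if __name__ == '__main__':
        cli()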