diff --git a/colossalai/_analyzer/__init__.py b/colossalai/_analyzer/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/tutorial/new_api/torch_ddp/.gitignore b/examples/tutorial/new_api/cifar_resnet/.gitignore
similarity index 100%
rename from examples/tutorial/new_api/torch_ddp/.gitignore
rename to examples/tutorial/new_api/cifar_resnet/.gitignore
diff --git a/examples/tutorial/new_api/cifar_resnet/README.md b/examples/tutorial/new_api/cifar_resnet/README.md
new file mode 100644
index 000000000..4ed86aa7a
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_resnet/README.md
@@ -0,0 +1,56 @@
+# Train ResNet on CIFAR-10 from scratch
+
+## 🚀 Quick Start
+
+This example provides a training script and an evaluation script. The training script shows how to train ResNet on the CIFAR-10 dataset from scratch.
+
+- Training Arguments
+  - `-p`, `--plugin`: Plugin to use. Choices: `torch_ddp`, `torch_ddp_fp16`, `low_level_zero`. Defaults to `torch_ddp`.
+  - `-r`, `--resume`: Resume training from the checkpoint of the specified epoch. Defaults to `-1`, which means not resuming.
+  - `-c`, `--checkpoint`: The folder to save checkpoints. Defaults to `./checkpoint`.
+  - `-i`, `--interval`: Epoch interval to save checkpoints. Defaults to `5`. If set to `0`, no checkpoint will be saved.
+  - `--target_acc`: Target accuracy. Raises an exception if not reached. Defaults to `None`.
+
+- Eval Arguments
+  - `-e`, `--epoch`: Select the epoch to evaluate.
+  - `-c`, `--checkpoint`: The folder where checkpoints are found.
+
+### Install requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+### Train
+
+```bash
+# train with torch DDP with fp32
+colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp32
+
+# train with torch DDP with mixed precision training
+colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp16 -p torch_ddp_fp16
+
+# train with low level zero
+colossalai run --nproc_per_node 2 train.py -c ./ckpt-low_level_zero -p low_level_zero
+```
+
+### Eval
+
+```bash
+# evaluate fp32 training
+python eval.py -c ./ckpt-fp32 -e 80
+
+# evaluate fp16 mixed precision training
+python eval.py -c ./ckpt-fp16 -e 80
+
+# evaluate low level zero training
+python eval.py -c ./ckpt-low_level_zero -e 80
+```
+
+The expected accuracy is:
+
+| Model     | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | Booster Low Level Zero |
+| --------- | ------------------------ | --------------------- | --------------------- | ---------------------- |
+| ResNet-18 | 85.85%                   | 84.91%                | 85.46%                | 84.50%                 |
+
+**Note: the baseline is adapted from the [script](https://pytorch-tutorial.readthedocs.io/en/latest/tutorial/chapter03_intermediate/3_2_2_cnn_resnet_cifar10/) to use `torchvision.models.resnet18`.**
diff --git a/examples/tutorial/new_api/torch_ddp/eval.py b/examples/tutorial/new_api/cifar_resnet/eval.py
similarity index 100%
rename from examples/tutorial/new_api/torch_ddp/eval.py
rename to examples/tutorial/new_api/cifar_resnet/eval.py
diff --git a/examples/tutorial/new_api/cifar_resnet/requirements.txt b/examples/tutorial/new_api/cifar_resnet/requirements.txt
new file mode 100644
index 000000000..85522f412
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_resnet/requirements.txt
@@ -0,0 +1,4 @@
+colossalai
+torch
+torchvision
+tqdm
diff --git a/examples/tutorial/new_api/cifar_resnet/test_ci.sh b/examples/tutorial/new_api/cifar_resnet/test_ci.sh
new file mode 100755
index 000000000..3954b84ff
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_resnet/test_ci.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+set -xe
+
+export DATA=/data/scratch/cifar-10
+
+pip install -r requirements.txt
+
+for plugin in "torch_ddp" "torch_ddp_fp16" "low_level_zero"; do
+    colossalai run --nproc_per_node 4 train.py --interval 0 --target_acc 0.84 --plugin $plugin
+done
diff --git a/examples/tutorial/new_api/cifar_resnet/train.py b/examples/tutorial/new_api/cifar_resnet/train.py
new file mode 100644
index 000000000..e64e95fc2
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_resnet/train.py
@@ -0,0 +1,210 @@
+import argparse
+import os
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torchvision
+import torchvision.transforms as transforms
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import MultiStepLR
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
+from colossalai.cluster import DistCoordinator
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+# ==============================
+# Prepare Hyperparameters
+# ==============================
+NUM_EPOCHS = 80
+LEARNING_RATE = 1e-3
+
+
+def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
+    # transform
+    transform_train = transforms.Compose(
+        [transforms.Pad(4),
+         transforms.RandomHorizontalFlip(),
+         transforms.RandomCrop(32),
+         transforms.ToTensor()])
+    transform_test = transforms.ToTensor()
+
+    # CIFAR-10 dataset
+    data_path = os.environ.get('DATA', './data')
+    with coordinator.priority_execution():
+        train_dataset = torchvision.datasets.CIFAR10(root=data_path,
+                                                     train=True,
+                                                     transform=transform_train,
+                                                     download=True)
+        test_dataset = torchvision.datasets.CIFAR10(root=data_path,
+                                                    train=False,
+                                                    transform=transform_test,
+                                                    download=True)
+
+    # Data loader
+    train_dataloader = plugin.prepare_train_dataloader(train_dataset,
+                                                       batch_size=batch_size,
+                                                       shuffle=True,
+                                                       drop_last=True)
+    test_dataloader = plugin.prepare_train_dataloader(test_dataset,
+                                                      batch_size=batch_size,
+                                                      shuffle=False,
+                                                      drop_last=False)
+    return train_dataloader, test_dataloader
+
+
+@torch.no_grad()
+def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
+    model.eval()
+    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    for images, labels in test_dataloader:
+        images = images.cuda()
+        labels = labels.cuda()
+        outputs = model(images)
+        _, predicted = torch.max(outputs.data, 1)
+        total += labels.size(0)
+        correct += (predicted == labels).sum().item()
+    dist.all_reduce(correct)
+    dist.all_reduce(total)
+    accuracy = correct.item() / total.item()
+    if coordinator.is_master():
+        print(f'Accuracy of the model on the test images: {accuracy * 100:.2f} %')
+    return accuracy
+
+
+def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader,
+                booster: Booster, coordinator: DistCoordinator):
+    model.train()
+    with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar:
+        for images, labels in pbar:
+            images = images.cuda()
+            labels = labels.cuda()
+            # Forward pass
+            outputs = model(images)
+            loss = criterion(outputs, labels)
+
+            # Backward and optimize
+            booster.backward(loss, optimizer)
+            optimizer.step()
+            optimizer.zero_grad()
+
+            # Print log info
+            pbar.set_postfix({'loss': loss.item()})
+
+
+def main():
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    # FIXME(ver217): gemini does not support resnet for now
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'low_level_zero'],
+                        help="plugin to use")
+    parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint")
+    parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
+    parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint")
+    parser.add_argument('--target_acc',
+                        type=float,
+                        default=None,
+                        help="target accuracy. Raise exception if not reached")
+    args = parser.parse_args()
+
+    # ==============================
+    # Prepare Checkpoint Directory
+    # ==============================
+    if args.interval > 0:
+        Path(args.checkpoint).mkdir(parents=True, exist_ok=True)
+
+    # ==============================
+    # Launch Distributed Environment
+    # ==============================
+    colossalai.launch_from_torch(config={})
+    coordinator = DistCoordinator()
+
+    # update the learning rate with linear scaling
+    # old_gpu_num / old_lr = new_gpu_num / new_lr
+    global LEARNING_RATE
+    LEARNING_RATE *= coordinator.world_size
+
+    # ==============================
+    # Instantiate Plugin and Booster
+    # ==============================
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
+
+    # ==============================
+    # Prepare Dataloader
+    # ==============================
+    train_dataloader, test_dataloader = build_dataloader(100, coordinator, plugin)
+
+    # ====================================
+    # Prepare model, optimizer, criterion
+    # ====================================
+    # resnet18
+    model = torchvision.models.resnet18(num_classes=10)
+
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss()
+    optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE)
+
+    # lr scheduler
+    lr_scheduler = MultiStepLR(optimizer, milestones=[20, 40, 60, 80], gamma=1 / 3)
+
+    # ==============================
+    # Boost with ColossalAI
+    # ==============================
+    model, optimizer, criterion, _, lr_scheduler = booster.boost(model,
+                                                                 optimizer,
+                                                                 criterion=criterion,
+                                                                 lr_scheduler=lr_scheduler)
+
+    # ==============================
+    # Resume from checkpoint
+    # ==============================
+    if args.resume >= 0:
+        booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth')
+        booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth')
+        booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth')
+
+    # ==============================
+    # Train model
+    # ==============================
+    start_epoch = args.resume if args.resume >= 0 else 0
+    for epoch in range(start_epoch, NUM_EPOCHS):
+        train_epoch(epoch, model, optimizer, criterion, train_dataloader, booster, coordinator)
+        lr_scheduler.step()
+
+        # save checkpoint
+        if args.interval > 0 and (epoch + 1) % args.interval == 0:
+            booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth')
+            booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth')
+            booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth')
+
+    accuracy = evaluate(model, test_dataloader, coordinator)
+    if args.target_acc is not None:
+        assert accuracy >= args.target_acc, f'Accuracy {accuracy} is lower than target accuracy {args.target_acc}'
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/tutorial/new_api/cifar_vit/README.md b/examples/tutorial/new_api/cifar_vit/README.md
new file mode 100644
index 000000000..fa76447c5
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_vit/README.md
@@ -0,0 +1,37 @@
+# Train ViT on CIFAR-10 from scratch
+
+## 🚀 Quick Start
+
+This example provides a training script that shows how to train ViT on the CIFAR-10 dataset from scratch.
+
+- Training Arguments
+  - `-p`, `--plugin`: Plugin to use. Choices: `torch_ddp`, `torch_ddp_fp16`, `low_level_zero`. Defaults to `torch_ddp`.
+  - `-r`, `--resume`: Resume training from the checkpoint of the specified epoch. Defaults to `-1`, which means not resuming.
+  - `-c`, `--checkpoint`: The folder to save checkpoints. Defaults to `./checkpoint`.
+  - `-i`, `--interval`: Epoch interval to save checkpoints. Defaults to `5`. If set to `0`, no checkpoint will be saved.
+  - `--target_acc`: Target accuracy. Raises an exception if not reached. Defaults to `None`.
+
+### Install requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+### Train
+
+```bash
+# train with torch DDP with fp32
+colossalai run --nproc_per_node 4 train.py -c ./ckpt-fp32
+
+# train with torch DDP with mixed precision training
+colossalai run --nproc_per_node 4 train.py -c ./ckpt-fp16 -p torch_ddp_fp16
+
+# train with low level zero
+colossalai run --nproc_per_node 4 train.py -c ./ckpt-low_level_zero -p low_level_zero
+```
+
+The expected accuracy is:
+
+| Model | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | Booster Low Level Zero |
+| ----- | ------------------------ | --------------------- | --------------------- | ---------------------- |
+| ViT   | 83.00%                   | 84.03%                | 84.00%                | 84.43%                 |
diff --git a/examples/tutorial/new_api/cifar_vit/requirements.txt b/examples/tutorial/new_api/cifar_vit/requirements.txt
new file mode 100644
index 000000000..6d53ce7b5
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_vit/requirements.txt
@@ -0,0 +1,5 @@
+colossalai
+timm
+torch
+torchvision
+tqdm
diff --git a/examples/tutorial/new_api/cifar_vit/test_ci.sh b/examples/tutorial/new_api/cifar_vit/test_ci.sh
new file mode 100755
index 000000000..43239d400
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_vit/test_ci.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+set -xe
+
+export DATA=/data/scratch/cifar-10
+
+pip install -r requirements.txt
+
+for plugin in "torch_ddp" "torch_ddp_fp16" "low_level_zero"; do
+    colossalai run --nproc_per_node 4 train.py --interval 0 --target_acc 0.83 --plugin $plugin
+done
diff --git a/examples/tutorial/new_api/cifar_vit/train.py b/examples/tutorial/new_api/cifar_vit/train.py
new file mode 100644
index 000000000..fee53df07
--- /dev/null
+++ b/examples/tutorial/new_api/cifar_vit/train.py
@@ -0,0 +1,225 @@
+import argparse
+import os
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torchvision
+import torchvision.transforms as transforms
+from timm.models.vision_transformer import _cfg, _create_vision_transformer
+from torch.optim import Optimizer
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
+from colossalai.cluster import DistCoordinator
+from colossalai.nn.lr_scheduler import LinearWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+# ==============================
+# Prepare Hyperparameters
+# ==============================
+NUM_EPOCHS = 60
+WARMUP_EPOCHS = 5
+LEARNING_RATE = 1e-3
+
+
+def vit_cifar(**kwargs):
+    pretrained_cfg = _cfg(num_classes=10, input_size=(3, 32, 32), crop_pct=1.0)
+    model_kwargs = dict(patch_size=4, embed_dim=512, depth=6, num_heads=8, drop_rate=0.1, mlp_ratio=1.0, **kwargs)
+    model = _create_vision_transformer('vit_cifar', pretrained_cfg=pretrained_cfg, **model_kwargs)
+    return model
+
+
+def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
+    # transform
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(32, padding=4),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize((0.49139968, 0.48215827, 0.44653124), (0.24703233, 0.24348505, 0.26158768)),
+    ])
+    transform_test = transforms.Compose([
+        transforms.Resize(32),
+        transforms.ToTensor(),
+        transforms.Normalize((0.49139968, 0.48215827, 0.44653124), (0.24703233, 0.24348505, 0.26158768)),
+    ])
+
+    # CIFAR-10 dataset
+    data_path = os.environ.get('DATA', './data')
+    with coordinator.priority_execution():
+        train_dataset = torchvision.datasets.CIFAR10(root=data_path,
+                                                     train=True,
+                                                     transform=transform_train,
+                                                     download=True)
+        test_dataset = torchvision.datasets.CIFAR10(root=data_path,
+                                                    train=False,
+                                                    transform=transform_test,
+                                                    download=True)
+
+    # Data loader
+    train_dataloader = plugin.prepare_train_dataloader(train_dataset,
+                                                       batch_size=batch_size,
+                                                       shuffle=True,
+                                                       drop_last=True)
+    test_dataloader = plugin.prepare_train_dataloader(test_dataset,
+                                                      batch_size=batch_size,
+                                                      shuffle=False,
+                                                      drop_last=False)
+    return train_dataloader, test_dataloader
+
+
+@torch.no_grad()
+def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
+    model.eval()
+    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    for images, labels in test_dataloader:
+        images = images.cuda()
+        labels = labels.cuda()
+        outputs = model(images)
+        _, predicted = torch.max(outputs.data, 1)
+        total += labels.size(0)
+        correct += (predicted == labels).sum().item()
+    dist.all_reduce(correct)
+    dist.all_reduce(total)
+    accuracy = correct.item() / total.item()
+    if coordinator.is_master():
+        print(f'Accuracy of the model on the test images: {accuracy * 100:.2f} %')
+    return accuracy
+
+
+def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader,
+                booster: Booster, coordinator: DistCoordinator):
+    model.train()
+    with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar:
+        for images, labels in pbar:
+            images = images.cuda()
+            labels = labels.cuda()
+            # Forward pass
+            outputs = model(images)
+            loss = criterion(outputs, labels)
+
+            # Backward and optimize
+            booster.backward(loss, optimizer)
+            optimizer.step()
+            optimizer.zero_grad()
+
+            # Print log info
+            pbar.set_postfix({'loss': loss.item()})
+
+
+def main():
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    # FIXME(ver217): gemini does not support this model for now
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'low_level_zero'],
+                        help="plugin to use")
+    parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint")
+    parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
+    parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint")
+    parser.add_argument('--target_acc',
+                        type=float,
+                        default=None,
+                        help="target accuracy. Raise exception if not reached")
+    args = parser.parse_args()
+
+    # ==============================
+    # Prepare Checkpoint Directory
+    # ==============================
+    if args.interval > 0:
+        Path(args.checkpoint).mkdir(parents=True, exist_ok=True)
+
+    # ==============================
+    # Launch Distributed Environment
+    # ==============================
+    colossalai.launch_from_torch(config={})
+    coordinator = DistCoordinator()
+
+    # update the learning rate with linear scaling
+    # old_gpu_num / old_lr = new_gpu_num / new_lr
+    global LEARNING_RATE
+    LEARNING_RATE *= coordinator.world_size
+
+    # ==============================
+    # Instantiate Plugin and Booster
+    # ==============================
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
+
+    # ==============================
+    # Prepare Dataloader
+    # ==============================
+    train_dataloader, test_dataloader = build_dataloader(512, coordinator, plugin)
+
+    # ====================================
+    # Prepare model, optimizer, criterion
+    # ====================================
+    # ViT for CIFAR-10, defined above
+    model = vit_cifar()
+
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss()
+    optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE)
+
+    # lr scheduler
+    lr_scheduler = LinearWarmupLR(optimizer, NUM_EPOCHS, WARMUP_EPOCHS)
+
+    # ==============================
+    # Boost with ColossalAI
+    # ==============================
+    model, optimizer, criterion, train_dataloader, lr_scheduler = booster.boost(model,
+                                                                                optimizer,
+                                                                                criterion=criterion,
+                                                                                dataloader=train_dataloader,
+                                                                                lr_scheduler=lr_scheduler)
+
+    # ==============================
+    # Resume from checkpoint
+    # ==============================
+    if args.resume >= 0:
+        booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth')
+        booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth')
+        booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth')
+
+    # ==============================
+    # Train model
+    # ==============================
+    start_epoch = args.resume if args.resume >= 0 else 0
+    for epoch in range(start_epoch, NUM_EPOCHS):
+        train_epoch(epoch, model, optimizer, criterion, train_dataloader,
booster, coordinator) + lr_scheduler.step() + + # save checkpoint + if args.interval > 0 and (epoch + 1) % args.interval == 0: + booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth') + booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth') + booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth') + + accuracy = evaluate(model, test_dataloader, coordinator) + if args.target_acc is not None: + assert accuracy >= args.target_acc, f'Accuracy {accuracy} is lower than target accuracy {args.target_acc}' + + +if __name__ == '__main__': + main() diff --git a/examples/tutorial/new_api/glue_bert/README.md b/examples/tutorial/new_api/glue_bert/README.md index d65c7705b..0030eead9 100644 --- a/examples/tutorial/new_api/glue_bert/README.md +++ b/examples/tutorial/new_api/glue_bert/README.md @@ -10,6 +10,12 @@ This example provides a training script, which provides an example of finetuning - `--target_f1`: Target f1 score. Raise exception if not reached. Defaults to `None`. +### Install requirements + +```bash +pip install -r requirements.txt +``` + ### Train ```bash diff --git a/examples/tutorial/new_api/glue_bert/requirements.txt b/examples/tutorial/new_api/glue_bert/requirements.txt new file mode 100644 index 000000000..950c2d378 --- /dev/null +++ b/examples/tutorial/new_api/glue_bert/requirements.txt @@ -0,0 +1,7 @@ +colossalai +datasets +torch +tqdm +transformers +scipy +scikit-learn diff --git a/examples/tutorial/new_api/glue_bert/test_ci.sh b/examples/tutorial/new_api/glue_bert/test_ci.sh index f8c2dfbe9..c2c097f8d 100755 --- a/examples/tutorial/new_api/glue_bert/test_ci.sh +++ b/examples/tutorial/new_api/glue_bert/test_ci.sh @@ -1,6 +1,8 @@ #!/bin/bash set -xe +pip install -r requirements.txt + for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin $plugin done diff --git a/examples/tutorial/new_api/test_ci.sh b/examples/tutorial/new_api/test_ci.sh index 8b4475e9f..a08844dbe 100644 --- a/examples/tutorial/new_api/test_ci.sh +++ b/examples/tutorial/new_api/test_ci.sh @@ -1,2 +1,6 @@ -#!/usr/bin/env -echo "The CI integration will be completed when the API is stable" +#!/bin/bash +set -xe + +# FIXME(ver217): only run bert finetune to save time + +cd glue_bert && bash ./test_ci.sh && cd .. diff --git a/examples/tutorial/new_api/torch_ddp/README.md b/examples/tutorial/new_api/torch_ddp/README.md deleted file mode 100644 index e120bacb0..000000000 --- a/examples/tutorial/new_api/torch_ddp/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Distributed Data Parallel - -## 🚀 Quick Start - -This example provides a training script and an evaluation script. The training script provides an example of training ResNet on CIFAR10 dataset from scratch. 
- -- Training Arguments - - `-r`, `--resume`: resume from checkpoint file path - - `-c`, `--checkpoint`: the folder to save checkpoints - - `-i`, `--interval`: epoch interval to save checkpoints - - `-f`, `--fp16`: use fp16 - -- Eval Arguments - - `-e`, `--epoch`: select the epoch to evaluate - - `-c`, `--checkpoint`: the folder where checkpoints are found - - -### Train - -```bash -# train with torch DDP with fp32 -colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp32 - -# train with torch DDP with mixed precision training -colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp16 --fp16 -``` - -### Eval - -```bash -# evaluate fp32 training -python eval.py -c ./ckpt-fp32 -e 80 - -# evaluate fp16 mixed precision training -python eval.py -c ./ckpt-fp16 -e 80 -``` - -Expected accuracy performance will be: - -| Model | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | -| --------- | ------------------------ | --------------------- | --------------------- | -| ResNet-18 | 85.85% | 85.03% | 85.12% | - -**Note: the baseline is adapted from the [script](https://pytorch-tutorial.readthedocs.io/en/latest/tutorial/chapter03_intermediate/3_2_2_cnn_resnet_cifar10/) to use `torchvision.models.resnet18`** diff --git a/examples/tutorial/new_api/torch_ddp/train.py b/examples/tutorial/new_api/torch_ddp/train.py deleted file mode 100644 index 4741c3151..000000000 --- a/examples/tutorial/new_api/torch_ddp/train.py +++ /dev/null @@ -1,128 +0,0 @@ -import argparse -from pathlib import Path - -import torch -import torch.nn as nn -import torchvision -import torchvision.transforms as transforms -from torch.optim.lr_scheduler import MultiStepLR - -import colossalai -from colossalai.booster import Booster -from colossalai.booster.plugin import TorchDDPPlugin -from colossalai.cluster import DistCoordinator - -# ============================== -# Parse Arguments -# ============================== -parser = argparse.ArgumentParser() -parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint") -parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory") -parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint") -parser.add_argument('-f', '--fp16', action='store_true', help="use fp16") -args = parser.parse_args() - -# ============================== -# Prepare Checkpoint Directory -# ============================== -Path(args.checkpoint).mkdir(parents=True, exist_ok=True) - -# ============================== -# Prepare Hyperparameters -# ============================== -NUM_EPOCHS = 80 -LEARNING_RATE = 1e-3 -START_EPOCH = args.resume if args.resume >= 0 else 0 - -# ============================== -# Launch Distributed Environment -# ============================== -colossalai.launch_from_torch(config={}) -coordinator = DistCoordinator() - -# update the learning rate with linear scaling -# old_gpu_num / old_lr = new_gpu_num / new_lr -LEARNING_RATE *= coordinator.world_size - -# ============================== -# Prepare Booster -# ============================== -plugin = TorchDDPPlugin() -if args.fp16: - booster = Booster(mixed_precision='fp16', plugin=plugin) -else: - booster = Booster(plugin=plugin) - -# ============================== -# Prepare Train Dataset -# ============================== -transform = transforms.Compose( - [transforms.Pad(4), - transforms.RandomHorizontalFlip(), - transforms.RandomCrop(32), - transforms.ToTensor()]) - -# CIFAR-10 dataset -with 
coordinator.priority_execution(): - train_dataset = torchvision.datasets.CIFAR10(root='./data/', train=True, transform=transform, download=True) - -# ==================================== -# Prepare model, optimizer, criterion -# ==================================== -# resent50 -model = torchvision.models.resnet18(num_classes=10).cuda() - -# Loss and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) - -# lr scheduler -lr_scheduler = MultiStepLR(optimizer, milestones=[20, 40, 60, 80], gamma=1 / 3) - -# prepare dataloader with torch ddp plugin -train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=100, shuffle=True) - -# ============================== -# Resume from checkpoint -# ============================== -if args.resume >= 0: - booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth') - booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth') - booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth') - -# ============================== -# Boost with ColossalAI -# ============================== -model, optimizer, criterion, train_dataloader, lr_scheduler = booster.boost(model, optimizer, criterion, - train_dataloader, lr_scheduler) - -# ============================== -# Train model -# ============================== -total_step = len(train_dataloader) - -for epoch in range(START_EPOCH, NUM_EPOCHS): - for i, (images, labels) in enumerate(train_dataloader): - images = images.cuda() - labels = labels.cuda() - - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - booster.backward(loss, optimizer) - optimizer.step() - - if (i + 1) % 100 == 0: - print("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}".format(epoch + 1, NUM_EPOCHS, i + 1, total_step, - loss.item())) - - lr_scheduler.step() - - # save checkpoint every 5 epoch - if (epoch + 1) % args.interval == 0: - booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth') - booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth') - booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth')
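
---

For reference, every `train.py` added in this diff follows the same Booster workflow: build a plugin, wrap model/optimizer/criterion/lr-scheduler with `booster.boost(...)`, call `booster.backward(loss, optimizer)` instead of `loss.backward()`, and route checkpoint I/O through `booster.save_*`/`booster.load_*`. Below is a minimal sketch of that pattern, condensed from the scripts above; the random `TensorDataset` and the `checkpoint/model_demo.pth` path are placeholders, and the snippet assumes it is launched with `colossalai run --nproc_per_node <N>` (or `torchrun`) so that `launch_from_torch` finds a distributed environment.

```python
from pathlib import Path

import torch
import torch.nn as nn
import torchvision
from torch.utils.data import TensorDataset

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.nn.optimizer import HybridAdam

colossalai.launch_from_torch(config={})

# plugin + booster; mixed_precision='fp16' corresponds to the torch_ddp_fp16 case above
plugin = TorchDDPPlugin()
booster = Booster(plugin=plugin, mixed_precision='fp16')

# stand-in data: 256 random 32x32 images with 10 classes (replaces CIFAR-10 for brevity)
dataset = TensorDataset(torch.randn(256, 3, 32, 32), torch.randint(0, 10, (256,)))
train_dataloader = plugin.prepare_train_dataloader(dataset, batch_size=32, shuffle=True, drop_last=True)

model = torchvision.models.resnet18(num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = HybridAdam(model.parameters(), lr=1e-3)

# boost() wraps the objects so that backward/step and checkpoint I/O go through the plugin
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion=criterion)

model.train()
for images, labels in train_dataloader:
    loss = criterion(model(images.cuda()), labels.cuda())
    booster.backward(loss, optimizer)    # replaces loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# checkpoints are saved/loaded through the booster as well
Path('checkpoint').mkdir(parents=True, exist_ok=True)
booster.save_model(model, 'checkpoint/model_demo.pth')
```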