update examples and sphinx docs for the new api (#63)

This commit is contained in:
Frank Lee
2021-12-13 22:07:01 +08:00
committed by GitHub
parent 7d3711058f
commit 35813ed3c4
124 changed files with 1251 additions and 1462 deletions

View File

@@ -1,370 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "uhrbvVEh2iJd"
},
"source": [
"# Train an image classifier\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vP7LvCpG23a2",
"outputId": "b37f7203-8a02-4736-c527-603f2bb34d7d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: ColossalAI in /usr/local/lib/python3.7/dist-packages (0.1)\n",
"Requirement already satisfied: deepspeed in /usr/local/lib/python3.7/dist-packages (0.5.4)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from deepspeed) (21.0)\n",
"Requirement already satisfied: triton in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.1.1)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from deepspeed) (4.62.3)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.19.5)\n",
"Requirement already satisfied: tensorboardX==1.8 in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.8)\n",
"Requirement already satisfied: ninja in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.10.2.2)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.9.0+cu111)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.7/dist-packages (from deepspeed) (5.4.8)\n",
"Requirement already satisfied: protobuf>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (3.17.3)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (1.15.0)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->deepspeed) (2.4.7)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->deepspeed) (3.7.4.3)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from triton->deepspeed) (3.3.0)\n"
]
}
],
"source": [
"!pip install ColossalAI deepspeed"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UVKEurtS4SFS",
"outputId": "99fb6050-5da7-4f27-b4eb-9b3ccf830efb"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Please install apex to use FP16 Optimizer\n",
"Apex should be installed to use the FP16 optimizer\n",
"apex is required for mixed precision training\n"
]
}
],
"source": [
"import colossalai\n",
"from colossalai.engine import Engine, NonPipelineSchedule\n",
"from colossalai.trainer import Trainer\n",
"from colossalai.context import Config\n",
"import torch"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PpFfhNBD7NSn"
},
"source": [
"First, we should initialize distributed environment. Though we just use single GPU in this example, we still need initialize distributed environment for compatibility. We just consider the simplest case here, so we just set the number of parallel processes to 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8yF7Lc-K7NAS",
"outputId": "01312349-a8b0-4de4-9103-7d1b48e6cc36"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,596 INFO: Added key: store_based_barrier_key:1 to store for rank: 0\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,598 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,602 INFO: Added key: store_based_barrier_key:2 to store for rank: 0\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,605 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,608 INFO: Added key: store_based_barrier_key:3 to store for rank: 0\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,610 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"process rank 0 is bound to device 0\n",
"initialized seed on rank 0, numpy: 1024, python random: 1024, ParallelMode.DATA: 1024, ParallelMode.TENSOR: 1124,the default parallel seed is ParallelMode.DATA.\n"
]
}
],
"source": [
"parallel_cfg = Config(dict(parallel=dict(\n",
" data=dict(size=1),\n",
" pipeline=dict(size=1),\n",
" tensor=dict(size=1, mode=None),\n",
")))\n",
"colossalai.init_dist(config=parallel_cfg,\n",
" local_rank=0,\n",
" world_size=1,\n",
" host='127.0.0.1',\n",
" port=8888,\n",
" backend='nccl')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ppjmMxc_81TK"
},
"source": [
"Load and normalize the CIFAR10 training and test datasets using `colossalai.nn.data`. Note that we have wrapped `torchvision.transforms`, so that we can simply use the config dict to use them."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZyGhyD47-dUY",
"outputId": "98bbf2d1-a1c4-4bb4-b6df-600777b1e8f5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Files already downloaded and verified\n",
"Files already downloaded and verified\n"
]
}
],
"source": [
"transform_cfg = [\n",
" dict(type='ToTensor'),\n",
" dict(type='Normalize',\n",
" mean=[0.4914, 0.4822, 0.4465],\n",
" std=[0.2023, 0.1994, 0.2010]),\n",
"]\n",
"\n",
"batch_size = 128\n",
"\n",
"trainset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=True)\n",
"trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)\n",
"\n",
"testset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=False)\n",
"testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NvPbfLLR9NzC"
},
"source": [
"We just define a simple Convolutional Neural Network here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cQ_y7lBG09LS"
},
"outputs": [],
"source": [
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"\n",
"class Net(nn.Module):\n",
" def __init__(self):\n",
" super().__init__()\n",
" self.conv1 = nn.Conv2d(3, 6, 5)\n",
" self.pool = nn.MaxPool2d(2, 2)\n",
" self.conv2 = nn.Conv2d(6, 16, 5)\n",
" self.fc1 = nn.Linear(16 * 5 * 5, 120)\n",
" self.fc2 = nn.Linear(120, 84)\n",
" self.fc3 = nn.Linear(84, 10)\n",
"\n",
" def forward(self, x):\n",
" x = self.pool(F.relu(self.conv1(x)))\n",
" x = self.pool(F.relu(self.conv2(x)))\n",
" x = torch.flatten(x, 1) # flatten all dimensions except batch\n",
" x = F.relu(self.fc1(x))\n",
" x = F.relu(self.fc2(x))\n",
" x = self.fc3(x)\n",
" return x\n",
"\n",
"\n",
"model = Net().cuda()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tgsszAmM9dYZ"
},
"source": [
"Define a Loss function and optimizer. And then we use them to initialize `Engine` and `Trainer`. We provide various training / evaluating hooks. In this case, we just use the simplest hooks which can compute and print loss and accuracy."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YtaDoCax1BCf",
"outputId": "b33b1641-03d8-4597-c8c2-1a4c1d61e9b0"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"colossalai - rank_0 - 2021-10-15 03:27:56,018 WARNING: No gradient handler is set up, please make sure you do not need to all-reduce the gradients after a training step.\n",
"colossalai - rank_0 - 2021-10-15 03:27:56,024 INFO: build LogMetricByEpochHook for train, priority = 1\n",
"colossalai - rank_0 - 2021-10-15 03:27:56,026 INFO: build LossHook for train, priority = 10\n",
"colossalai - rank_0 - 2021-10-15 03:27:56,029 INFO: build AccuracyHook for train, priority = 10\n"
]
}
],
"source": [
"import torch.optim as optim\n",
"\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)\n",
"schedule = NoPipelineSchedule()\n",
"engine = Engine(\n",
" model=model,\n",
" criterion=criterion,\n",
" optimizer=optimizer,\n",
" lr_scheduler=None,\n",
" schedule=schedule\n",
" )\n",
"trainer = Trainer(engine=engine,\n",
" hooks_cfg=[dict(type='LossHook'), dict(type='LogMetricByEpochHook'), dict(type='AccuracyHook')],\n",
" verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_JR2TuvH99Ik"
},
"source": [
"Then we set training configs. We train our model for 10 epochs and it will be evaluated every 1 epoch. Set `display_progress` to `True` to display the training / evaluating progress bar."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "w-J3IP-J1sfx",
"outputId": "bdb76939-04f1-4124-ce5e-3af44c0d902c"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Epoch 0 train]: 0%| | 0/391 [00:00<?, ?it/s]/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:718: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at /pytorch/c10/core/TensorImpl.h:1156.)\n",
" return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)\n",
"[Epoch 0 train]: 100%|██████████| 391/391 [00:14<00:00, 26.82it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:11,088 INFO: Training - Epoch 1 - LogMetricByEpochHook: Loss = 2.29158\n",
"[Epoch 0 val]: 100%|██████████| 79/79 [00:02<00:00, 28.66it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:14,040 INFO: Testing - Epoch 1 - LogMetricByEpochHook: Loss = 2.26517, Accuracy = 0.14820\n",
"[Epoch 1 train]: 100%|██████████| 391/391 [00:14<00:00, 26.31it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:29,059 INFO: Training - Epoch 2 - LogMetricByEpochHook: Loss = 2.15763\n",
"[Epoch 1 val]: 100%|██████████| 79/79 [00:02<00:00, 28.50it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:32,007 INFO: Testing - Epoch 2 - LogMetricByEpochHook: Loss = 2.00450, Accuracy = 0.27850\n",
"[Epoch 2 train]: 100%|██████████| 391/391 [00:14<00:00, 26.08it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:47,167 INFO: Training - Epoch 3 - LogMetricByEpochHook: Loss = 1.85409\n",
"[Epoch 2 val]: 100%|██████████| 79/79 [00:02<00:00, 27.89it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:50,168 INFO: Testing - Epoch 3 - LogMetricByEpochHook: Loss = 1.73788, Accuracy = 0.35990\n",
"[Epoch 3 train]: 100%|██████████| 391/391 [00:14<00:00, 26.09it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:05,330 INFO: Training - Epoch 4 - LogMetricByEpochHook: Loss = 1.69363\n",
"[Epoch 3 val]: 100%|██████████| 79/79 [00:02<00:00, 28.43it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:08,290 INFO: Testing - Epoch 4 - LogMetricByEpochHook: Loss = 1.65005, Accuracy = 0.39350\n",
"[Epoch 4 train]: 100%|██████████| 391/391 [00:15<00:00, 25.97it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:23,530 INFO: Training - Epoch 5 - LogMetricByEpochHook: Loss = 1.61387\n",
"[Epoch 4 val]: 100%|██████████| 79/79 [00:02<00:00, 27.75it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:26,515 INFO: Testing - Epoch 5 - LogMetricByEpochHook: Loss = 1.57507, Accuracy = 0.42430\n",
"[Epoch 5 train]: 100%|██████████| 391/391 [00:15<00:00, 25.92it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:41,764 INFO: Training - Epoch 6 - LogMetricByEpochHook: Loss = 1.55712\n",
"[Epoch 5 val]: 100%|██████████| 79/79 [00:02<00:00, 27.51it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:44,778 INFO: Testing - Epoch 6 - LogMetricByEpochHook: Loss = 1.53242, Accuracy = 0.43700\n",
"[Epoch 6 train]: 100%|██████████| 391/391 [00:14<00:00, 26.13it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:59,927 INFO: Training - Epoch 7 - LogMetricByEpochHook: Loss = 1.51618\n",
"[Epoch 6 val]: 100%|██████████| 79/79 [00:02<00:00, 28.31it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:02,884 INFO: Testing - Epoch 7 - LogMetricByEpochHook: Loss = 1.49720, Accuracy = 0.45430\n",
"[Epoch 7 train]: 100%|██████████| 391/391 [00:14<00:00, 26.23it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:17,968 INFO: Training - Epoch 8 - LogMetricByEpochHook: Loss = 1.47857\n",
"[Epoch 7 val]: 100%|██████████| 79/79 [00:02<00:00, 27.97it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:20,967 INFO: Testing - Epoch 8 - LogMetricByEpochHook: Loss = 1.45808, Accuracy = 0.46320\n",
"[Epoch 8 train]: 100%|██████████| 391/391 [00:14<00:00, 26.11it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:36,129 INFO: Training - Epoch 9 - LogMetricByEpochHook: Loss = 1.44656\n",
"[Epoch 8 val]: 100%|██████████| 79/79 [00:02<00:00, 28.18it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:39,096 INFO: Testing - Epoch 9 - LogMetricByEpochHook: Loss = 1.44903, Accuracy = 0.46580\n",
"[Epoch 9 train]: 100%|██████████| 391/391 [00:15<00:00, 25.97it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:54,342 INFO: Training - Epoch 10 - LogMetricByEpochHook: Loss = 1.41120\n",
"[Epoch 9 val]: 100%|██████████| 79/79 [00:02<00:00, 28.05it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:57,332 INFO: Testing - Epoch 10 - LogMetricByEpochHook: Loss = 1.41242, Accuracy = 0.48500\n"
]
}
],
"source": [
"num_epochs = 10\n",
"test_interval = 1\n",
"trainer.fit(\n",
" train_dataloader=trainloader,\n",
" test_dataloader=testloader,\n",
" max_epochs=num_epochs,\n",
" display_progress=True,\n",
" test_interval=test_interval\n",
" )"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"name": "colossal_cifar_demo.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -0,0 +1,50 @@
# Train ResNet34 on CIFAR10
## Prepare Dataset
In this script, we use the CIFAR10 dataset provided by the `torchvision` library. The code snippet is shown below:
```python
train_dataset = CIFAR10(
    root=Path(os.environ['DATA']),
    download=True,
    transform=transforms.Compose(
        [
            transforms.RandomCrop(size=32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
                                 0.2023, 0.1994, 0.2010]),
        ]
    )
)
```
First, you need to specify where you want to store the CIFAR10 dataset by setting the environment variable `DATA`.
```bash
export DATA=/path/to/data
# example
# this will store the data in the current directory
export DATA=$PWD/data
```
The `torchvision` module will download the data automatically into the specified directory.
## Run training
We provide two examples of training ResNet34 on the CIFAR10 dataset: one uses the engine directly and the other uses the trainer. You can invoke the training scripts with the commands below. The batch size and learning rate are set for a single GPU, so `nproc_per_node` is 1, meaning only one process is launched. If you change `nproc_per_node`, you will have to adjust the learning rate accordingly, because the global batch size changes (see the sketch after the commands below).
```bash
# with engine
python -m torch.distributed.launch --nproc_per_node 1 run_resnet_cifar10_with_engine.py
# with trainer
python -m torch.distributed.launch --nproc_per_node 1 run_resnet_cifar10_with_trainer.py
```
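
A common heuristic for that adjustment, not prescribed by these scripts, is to scale the learning rate linearly with the number of processes (and hence with the global batch size). The sketch below is illustrative only; `scale_lr` and the base values are assumptions, not part of the example code.
```python
# Illustrative sketch of the linear LR scaling heuristic (an assumption,
# not taken from the example scripts): the per-GPU batch size stays fixed,
# so the global batch size grows with the number of processes.

def scale_lr(base_lr: float, base_procs: int, nproc_per_node: int) -> float:
    """Scale the learning rate in proportion to the global batch size."""
    return base_lr * nproc_per_node / base_procs


# The provided scripts use lr=0.1 with a single process (nproc_per_node=1).
print(scale_lr(0.1, base_procs=1, nproc_per_node=4))  # -> 0.4 for 4 GPUs
```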

View File

@@ -0,0 +1,10 @@
from colossalai.amp import AMP_TYPE

BATCH_SIZE = 128
NUM_EPOCHS = 200

CONFIG = dict(
    fp16=dict(
        mode=AMP_TYPE.TORCH
    )
)

View File

@@ -0,0 +1,118 @@
from pathlib import Path
from colossalai.logging import get_dist_logger
import colossalai
import torch
import os
from colossalai.core import global_context as gpc
from colossalai.utils import get_dataloader
from torchvision import transforms
from colossalai.nn.lr_scheduler import CosineAnnealingLR
from torchvision.datasets import CIFAR10
from torchvision.models import resnet34
from tqdm import tqdm


def main():
    colossalai.launch_from_torch(config='./config.py',
                                 host='localhost',
                                 port=29500)

    logger = get_dist_logger()

    # build resnet
    model = resnet34(num_classes=10)

    # build dataloaders
    train_dataset = CIFAR10(
        root=Path(os.environ['DATA']),
        download=True,
        transform=transforms.Compose(
            [
                transforms.RandomCrop(size=32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
                                     0.2023, 0.1994, 0.2010]),
            ]
        )
    )

    test_dataset = CIFAR10(
        root=Path(os.environ['DATA']),
        train=False,
        transform=transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
                                     0.2023, 0.1994, 0.2010]),
            ]
        )
    )

    train_dataloader = get_dataloader(dataset=train_dataset,
                                      shuffle=True,
                                      batch_size=gpc.config.BATCH_SIZE,
                                      num_workers=1,
                                      pin_memory=True,
                                      )

    test_dataloader = get_dataloader(dataset=test_dataset,
                                     add_sampler=False,
                                     batch_size=gpc.config.BATCH_SIZE,
                                     num_workers=1,
                                     pin_memory=True,
                                     )

    # build criterion
    criterion = torch.nn.CrossEntropyLoss()

    # optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

    # lr_scheduler
    lr_scheduler = CosineAnnealingLR(optimizer, total_steps=gpc.config.NUM_EPOCHS)

    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model,
                                                                         optimizer,
                                                                         criterion,
                                                                         train_dataloader,
                                                                         test_dataloader,
                                                                         )

    for epoch in range(gpc.config.NUM_EPOCHS):
        engine.train()
        if gpc.get_global_rank() == 0:
            train_dl = tqdm(train_dataloader)
        else:
            train_dl = train_dataloader

        for img, label in train_dl:
            img = img.cuda()
            label = label.cuda()

            engine.zero_grad()
            output = engine(img)
            train_loss = engine.criterion(output, label)
            engine.backward(train_loss)
            engine.step()
        lr_scheduler.step()

        engine.eval()
        correct = 0
        total = 0
        for img, label in test_dataloader:
            img = img.cuda()
            label = label.cuda()

            with torch.no_grad():
                output = engine(img)
                test_loss = engine.criterion(output, label)
            pred = torch.argmax(output, dim=-1)
            correct += torch.sum(pred == label)
            total += img.size(0)

        logger.info(
            f"Epoch {epoch} - train loss: {train_loss:.5}, test loss: {test_loss:.5}, acc: {correct / total:.5}, lr: {lr_scheduler.get_last_lr()[0]:.5g}", ranks=[0])


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,118 @@
from pathlib import Path
from colossalai.logging import get_dist_logger
import colossalai
import torch
import os
from colossalai.core import global_context as gpc
from colossalai.utils import get_dataloader, MultiTimer
from torchvision import transforms
from colossalai.trainer import hooks, Trainer
from torchvision.datasets import CIFAR10
from torchvision.models import resnet34
from colossalai.nn import CosineAnnealingLR
from tqdm import tqdm


def main():
    colossalai.launch_from_torch(config='./config.py',
                                 host='localhost',
                                 port=29500)

    logger = get_dist_logger()

    # build resnet
    model = resnet34(num_classes=10)

    # build dataloaders
    train_dataset = CIFAR10(
        root=Path(os.environ['DATA']),
        download=True,
        transform=transforms.Compose(
            [
                transforms.RandomCrop(size=32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
                                     0.2023, 0.1994, 0.2010]),
            ]
        )
    )

    test_dataset = CIFAR10(
        root=Path(os.environ['DATA']),
        train=False,
        transform=transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
                                     0.2023, 0.1994, 0.2010]),
            ]
        )
    )

    train_dataloader = get_dataloader(dataset=train_dataset,
                                      shuffle=True,
                                      batch_size=gpc.config.BATCH_SIZE,
                                      num_workers=1,
                                      pin_memory=True,
                                      )

    test_dataloader = get_dataloader(dataset=test_dataset,
                                     add_sampler=False,
                                     batch_size=gpc.config.BATCH_SIZE,
                                     num_workers=1,
                                     pin_memory=True,
                                     )

    # build criterion
    criterion = torch.nn.CrossEntropyLoss()

    # optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

    # lr_scheduler
    lr_scheduler = CosineAnnealingLR(optimizer, total_steps=gpc.config.NUM_EPOCHS)

    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model,
                                                                         optimizer,
                                                                         criterion,
                                                                         train_dataloader,
                                                                         test_dataloader,
                                                                         )

    # build a timer to measure time
    timer = MultiTimer()

    # create a trainer object
    trainer = Trainer(
        engine=engine,
        timer=timer,
        logger=logger
    )

    # define the hooks to attach to the trainer
    hook_list = [
        hooks.LossHook(),
        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True),
        hooks.AccuracyHook(),
        hooks.LogMetricByEpochHook(logger),
        hooks.LogMemoryByEpochHook(logger),
        hooks.LogTimingByEpochHook(timer, logger),

        # you can uncomment these lines if you wish to use them
        # hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),
        # hooks.SaveCheckpointHook(checkpoint_dir='./ckpt')
    ]

    # start training
    trainer.fit(
        train_dataloader=train_dataloader,
        epochs=gpc.config.NUM_EPOCHS,
        test_dataloader=test_dataloader,
        test_interval=1,
        hooks=hook_list,
        display_progress=True
    )


if __name__ == '__main__':
    main()

View File

@@ -1,33 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import colossalai
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer


def run_trainer():
    engine, train_dataloader, test_dataloader = colossalai.initialize()
    logger = get_dist_logger()

    engine.schedule.data_sync = False
    logger.info("engine is built", ranks=[0])

    trainer = Trainer(engine=engine,
                      verbose=True)
    logger.info("trainer is built", ranks=[0])

    logger.info("start training", ranks=[0])
    trainer.fit(
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        epochs=gpc.config.num_epochs,
        hooks_cfg=gpc.config.hooks,
        display_progress=True,
        test_interval=2
    )


if __name__ == '__main__':
    run_trainer()

View File

@@ -1,40 +0,0 @@
# Overview
Here is an example of training ViT-B/16 on ImageNet-1K with a batch size of 32K.
We use 8x NVIDIA A100 GPUs in this example.
# How to run
Using [Slurm](https://slurm.schedmd.com/documentation.html):
```shell
srun python train_dali.py --local_rank=$SLURM_PROCID --world_size=$SLURM_NPROCS --host=$HOST --port=29500 --config=vit-b16.py
```
# Results
![Loss Curve](./loss.jpeg)
![Accuracy](./acc.jpeg)
# Details
`vit-b16.py`
It is a [config file](https://colossalai.org/config.html), which is used by ColossalAI to define all kinds of training arguments, such as the model, dataset, and training method (optimizer, lr_scheduler, epoch, etc.). You can access config content by `gpc.config`.
In this example, we train the ViT-Base/16 model for 300 epochs on ImageNet-1K. The batch size is set to 32K through data parallelism (4K on each GPU, obtained from 16x gradient accumulation with a batch size of 256). Since this batch size is much larger than common practice and makes convergence difficult, we use the large-batch optimizer [LAMB](https://arxiv.org/abs/1904.00962), which lets us scale the batch size to 32K with only a small accuracy loss. The learning rate and weight decay of the optimizer are set to 1.8e-2 and 0.1, respectively. We use a linear warmup learning rate scheduler with 150 warmup epochs.
We introduce FP16 mixed precision to accelerate training and use gradient clipping to help convergence.
For simplicity and speed, we did not apply `RandAug` and used only [Mixup](https://arxiv.org/abs/1710.09412) for data augmentation.
If you have enough computing resources, you can conveniently scale this example to large-scale data parallelism without gradient accumulation, and finish the training process even within one hour.
`imagenet_dali_dataloader.py`
To accelerate the training process, we use [DALI](https://github.com/NVIDIA/DALI) as the data loader. Note that it requires the dataset to be in TFRecord format, which avoids reading a large number of raw image files and overloading the file system.
`train_dali.py`
We build the DALI data loader and the training process with Colossal-AI here.
`mixup.py`
Since we used Mixup, we define mixup loss in this file.
`hooks.py`
We also define useful hooks that log information to help debugging.

View File

@@ -1,15 +0,0 @@
from colossalai.registry import HOOKS
from colossalai.trainer import BaseHook
from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode


@HOOKS.register_module
class TotalBatchsizeHook(BaseHook):
    def __init__(self, trainer, priority: int = 2) -> None:
        super().__init__(trainer, priority)

    def before_train(self):
        total_batch_size = gpc.config.BATCH_SIZE * \
            gpc.config.engine.gradient_accumulation * gpc.get_world_size(ParallelMode.DATA)
        self.logger.info(f'Total batch size = {total_batch_size}', ranks=[0])

View File

@@ -1,70 +0,0 @@
import glob
import os

import colossalai
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer
from colossalai.utils import set_global_multitimer_status
from dataloader.imagenet_dali_dataloader import DaliDataloader


def build_dali_train():
    root = gpc.config.dali.root
    train_pat = os.path.join(root, 'train/*')
    train_idx_pat = os.path.join(root, 'idx_files/train/*')
    return DaliDataloader(
        sorted(glob.glob(train_pat)),
        sorted(glob.glob(train_idx_pat)),
        batch_size=gpc.config.BATCH_SIZE,
        shard_id=gpc.get_local_rank(ParallelMode.DATA),
        num_shards=gpc.get_world_size(ParallelMode.DATA),
        training=True,
        gpu_aug=gpc.config.dali.gpu_aug,
        cuda=True,
        mixup_alpha=gpc.config.dali.mixup_alpha
    )


def build_dali_test():
    root = gpc.config.dali.root
    val_pat = os.path.join(root, 'validation/*')
    val_idx_pat = os.path.join(root, 'idx_files/validation/*')
    return DaliDataloader(
        sorted(glob.glob(val_pat)),
        sorted(glob.glob(val_idx_pat)),
        batch_size=gpc.config.BATCH_SIZE,
        shard_id=gpc.get_local_rank(ParallelMode.DATA),
        num_shards=gpc.get_world_size(ParallelMode.DATA),
        training=False,
        # gpu_aug=gpc.config.dali.gpu_aug,
        gpu_aug=False,
        cuda=True,
        mixup_alpha=gpc.config.dali.mixup_alpha
    )


def main():
    engine, train_dataloader, test_dataloader = colossalai.initialize(
        train_dataloader=build_dali_train,
        test_dataloader=build_dali_test
    )
    logger = get_dist_logger()
    set_global_multitimer_status(True)
    timer = colossalai.utils.get_global_multitimer()
    trainer = Trainer(engine=engine,
                      verbose=True,
                      timer=timer)
    trainer.fit(
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        epochs=gpc.config.NUM_EPOCHS,
        hooks_cfg=gpc.config.hooks,
        display_progress=True,
        test_interval=1
    )


if __name__ == '__main__':
    main()

View File

@@ -1,78 +0,0 @@
from colossalai.engine import AMP_TYPE
from torch.nn import CrossEntropyLoss
from mixup import MixupLoss
from hooks import TotalBatchsizeHook
from colossalai.registry import MODELS
from timm.models import vit_base_patch16_224

MODELS.register_module(vit_base_patch16_224)

LOG_NAME = 'vit-b16-1k-32k-mixup-light2'

# ViT Base
BATCH_SIZE = 256
DROP_RATE = 0.1
NUM_EPOCHS = 300

parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=1, mode=None),
)

optimizer = dict(
    type='Lamb',
    lr=1.8e-2,
    weight_decay=0.1,
)

loss = dict(
    type='MixupLoss',
    loss_fn_cls=CrossEntropyLoss
)

model = dict(
    type='vit_base_patch16_224',
    drop_rate=DROP_RATE,
)

hooks = [
    dict(type='LogMetricByEpochHook'),
    dict(type='AccuracyHook'),
    dict(type='LossHook'),
    dict(type='TotalBatchsizeHook'),
    dict(type='TensorboardHook', log_dir=f'./tb_logs/{LOG_NAME}'),
    dict(type='SaveCheckpointHook', interval=1,
         checkpoint_dir=f'./ckpt/{LOG_NAME}'),
    # dict(type='LoadCheckpointHook', epoch=10,
    #      checkpoint_dir=f'./ckpt/{LOG_NAME}'),
    dict(
        type='LRSchedulerHook',
        by_epoch=True,
        lr_scheduler_cfg=dict(
            type='LinearWarmupLR',
            warmup_steps=150
        )
    ),
]

fp16 = dict(
    mode=AMP_TYPE.TORCH,
)

logging = dict(
    root_path=f"./logs/{LOG_NAME}"
)

dali = dict(
    root='./dataset/ILSVRC2012_1k',
    gpu_aug=True,
    mixup_alpha=0.2
)

engine = dict(
    schedule=None,
    gradient_handlers=None,
    gradient_accumulation=16,
    gradient_clipping=1.0,
)

View File

@@ -0,0 +1,90 @@
# Overview
A common way to speed up AI model training is to implement large-batch training with the help of data parallelism, but this requires expensive supercomputer clusters. In this example, we used a small server with only 4 GPUs to reproduce the large-scale pre-training of Vision Transformer (ViT) on ImageNet-1K in 14 hours.
# How to run
On a single server, you can use torch.distributed directly to start pre-training on multiple GPUs in parallel. Colossal-AI provides several launch methods to initialize the distributed backend. You can use `colossalai.launch` together with `colossalai.get_default_parser` to pass the parameters via the command line. If you use launchers such as SLURM, OpenMPI, or the PyTorch launch utility, you can use `colossalai.launch_from_<torch/slurm/openmpi>` to read the rank and world size directly from the environment variables for convenience. In this example, we use `launch_from_slurm` for demonstration purposes; the corresponding launch call is sketched after the command below. You can find more information about SLURM [here](https://slurm.schedmd.com/documentation.html).
```shell
HOST=<node name> srun bash ./scripts/train_slurm.sh
```
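
Inside the training script, the launch call looks roughly like the sketch below (it mirrors `train.py` in this example; `args` comes from `colossalai.get_default_parser`).
```python
import colossalai

# minimal sketch: parse --config/--host/--port/--backend from the command
# line, then read rank and world size from the SLURM environment variables
parser = colossalai.get_default_parser()
args = parser.parse_args()
colossalai.launch_from_slurm(config=args.config,
                             host=args.host,
                             port=args.port,
                             backend=args.backend)
```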
---
If you are using `colossalai.launch`, do this:
In your training script:
```python
# initialize distributed setting
parser = colossalai.get_default_parser()
args = parser.parse_args()
colossalai.launch(config=args.config,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend
)
```
In your terminal:
```shell
<some_launcher> python train.py --config ./config.py --rank <rank> --world_size <world_size> --host <node name> --port 29500
```
---
If you are using `colossalai.launch_from_torch`, do this:
In your training script:
```python
# initialize distributed setting
parser = colossalai.get_default_parser()
args = parser.parse_args()
colossalai.launch_from_torch(config=args.config,
host=args.host,
port=args.port,
backend=args.backend
)
```
In your terminal
```shell
python -m torch.distributed.launch --nproc_per_node <world_size> train.py --config ./config.py --host <node name> --port 29500
```
# Experiments
To make it easier to reproduce large-scale data-parallel experiments, we pre-trained ViT-Base/32 in only 14.58 hours on a small server with 4 NVIDIA A100 GPUs, using the ImageNet-1K dataset with a batch size of 32K for 300 epochs while maintaining accuracy. The more demanding pre-training of ViT-Base/16 and ViT-Large/32 takes only 78.58 hours and 37.83 hours, respectively. Since the server used in this example is not a standard NVIDIA DGX A100 supercomputing unit, better acceleration could likely be obtained on more specialized hardware.
![Loss Curve](./results/loss.jpeg)
![Accuracy](./results/acc.jpeg)
As shown in the figures above, the ViT model converges well after 300 epochs of training. It is worth noting that, unlike typical small-batch training, model performance dips temporarily in the middle of the large-batch training run. This reflects the difficulty of convergence in large-batch training: with fewer iterations, a larger learning rate is needed to ensure final convergence. Since we did not tune the hyperparameters carefully, other settings might converge even better.
# Details
`config.py`
This is a [configuration file](https://colossalai.org/config.html) that defines the hyperparameters and training scheme (fp16, gradient accumulation, etc.). The config content can be accessed through `gpc.config` in the program.
In this example, we trained ViT-Base/16 for 300 epochs on the ImageNet-1K dataset. The batch size is expanded to 32K through data parallelism. Since only 4 A100 GPUs on one small server are used and GPU memory is limited, a batch size of 32K cannot be used directly. Therefore, the batch size on each GPU is only 256, which is expanded to an equivalent 8K per GPU through 32 steps of gradient accumulation. Data parallelism across the 4 GPUs then yields an equivalent global batch size of 32K, as the short sketch below illustrates.
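For clarity, the effective batch size arithmetic quoted above can be written out explicitly. The snippet below is only an illustration using those numbers (per-GPU batch size 256, 32 accumulation steps, 4 data-parallel ranks); it is not part of the training code.
```python
# effective (global) batch size =
#   per-GPU batch size x gradient accumulation steps x data-parallel ranks
per_gpu_batch_size = 256
accumulation_steps = 32
data_parallel_size = 4

effective_batch_size = per_gpu_batch_size * accumulation_steps * data_parallel_size
print(effective_batch_size)  # 32768, i.e. the "32K" batch size used here
```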
Since a batch size of 32K far exceeds the usable range of common optimizers and is difficult to train with, we use the large-batch optimizer [LAMB](https://arxiv.org/abs/1904.00962) provided by Colossal-AI to achieve better convergence. The learning rate and weight decay of [LAMB](https://arxiv.org/abs/1904.00962) are set to 1.8e-2 and 0.1, respectively. The learning rate scheduler uses a linear warmup strategy of 150 epochs. We also use FP16 mixed precision to speed up the training process and gradient clipping to help convergence. For simplicity and speed, we use only [Mixup](https://arxiv.org/abs/1710.09412) instead of `RandAug` for data augmentation.
By tuning the degree of parallelism, this example can be deployed quickly on a single server with several GPUs or on a large cluster with many nodes and GPUs. Given enough computing resources to extend data parallelism directly to hundreds or even thousands of GPUs, a training process that would take several days on a single A100 GPU can be shortened to less than half an hour.
`imagenet_dali_dataloader.py`
To accelerate the training process, we use [DALI](https://github.com/NVIDIA/DALI) to read data and require the dataset to be in TFRecord format, which avoids directly reading a large number of raw image files and being limited by the efficiency of the file system.
`train.py`
We call DALI in this file to read data and start the training process using Colossal-AI.
`mixup.py`
Since Mixup is used as data augmentation, we define the loss function of Mixup here.
`myhooks.py`
We define hook functions that record running information to help debugging.
# How to build TFRecords dataset
Since we use [DALI](https://github.com/NVIDIA/DALI) to read data, we use a TFRecord dataset instead of the raw ImageNet dataset. If you don't have a TFRecord dataset, follow [imagenet-tools](https://github.com/ver217/imagenet-tools) to build one.

View File

@@ -0,0 +1,21 @@
from colossalai.amp import AMP_TYPE

# ViT Base
BATCH_SIZE = 256
DROP_RATE = 0.1
NUM_EPOCHS = 300

fp16 = dict(
    mode=AMP_TYPE.TORCH,
)

gradient_accumulation = 16
gradient_clipping = 1.0

dali = dict(
    # root='./dataset/ILSVRC2012_1k',
    root='/project/scratch/p200012/dataset/ILSVRC2012_1k',
    gpu_aug=True,
    mixup_alpha=0.2
)

View File

@@ -0,0 +1,15 @@
from colossalai.trainer.hooks import BaseHook
from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode
from colossalai.logging import get_dist_logger


class TotalBatchsizeHook(BaseHook):
    def __init__(self, priority: int = 2) -> None:
        super().__init__(priority)
        self.logger = get_dist_logger()

    def before_train(self, trainer):
        total_batch_size = gpc.config.BATCH_SIZE * \
            gpc.config.gradient_accumulation * gpc.get_world_size(ParallelMode.DATA)
        self.logger.info(f'Total batch size = {total_batch_size}', ranks=[0])

View File

Binary image file changed (Before/After: 19 KiB).

View File

Binary image file changed (Before/After: 22 KiB).

View File

@@ -0,0 +1,3 @@
#!/usr/bin/env bash
python train.py --host $HOST --config ./config.py --port 29500

View File

@@ -0,0 +1,116 @@
import glob
from math import log
import os

import colossalai
import torch
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.nn.lr_scheduler import LinearWarmupLR
from dataloader.imagenet_dali_dataloader import DaliDataloader
from mixup import MixupLoss
from timm.models import vit_base_patch16_224
from myhooks import TotalBatchsizeHook


def build_dali_train():
    root = gpc.config.dali.root
    train_pat = os.path.join(root, 'train/*')
    train_idx_pat = os.path.join(root, 'idx_files/train/*')
    return DaliDataloader(
        sorted(glob.glob(train_pat)),
        sorted(glob.glob(train_idx_pat)),
        batch_size=gpc.config.BATCH_SIZE,
        shard_id=gpc.get_local_rank(ParallelMode.DATA),
        num_shards=gpc.get_world_size(ParallelMode.DATA),
        training=True,
        gpu_aug=gpc.config.dali.gpu_aug,
        cuda=True,
        mixup_alpha=gpc.config.dali.mixup_alpha
    )


def build_dali_test():
    root = gpc.config.dali.root
    val_pat = os.path.join(root, 'validation/*')
    val_idx_pat = os.path.join(root, 'idx_files/validation/*')
    return DaliDataloader(
        sorted(glob.glob(val_pat)),
        sorted(glob.glob(val_idx_pat)),
        batch_size=gpc.config.BATCH_SIZE,
        shard_id=gpc.get_local_rank(ParallelMode.DATA),
        num_shards=gpc.get_world_size(ParallelMode.DATA),
        training=False,
        # gpu_aug=gpc.config.dali.gpu_aug,
        gpu_aug=False,
        cuda=True,
        mixup_alpha=gpc.config.dali.mixup_alpha
    )


def main():
    # initialize distributed setting
    parser = colossalai.get_default_parser()
    args = parser.parse_args()
    colossalai.launch_from_slurm(config=args.config,
                                 host=args.host,
                                 port=args.port,
                                 backend=args.backend
                                 )

    # get logger
    logger = get_dist_logger()
    logger.info("initialized distributed environment", ranks=[0])

    # build model
    model = vit_base_patch16_224(drop_rate=0.1)

    # build dataloader
    train_dataloader = build_dali_train()
    test_dataloader = build_dali_test()

    # build optimizer
    optimizer = colossalai.nn.Lamb(model.parameters(), lr=1.8e-2, weight_decay=0.1)

    # build loss
    criterion = MixupLoss(loss_fn_cls=torch.nn.CrossEntropyLoss)

    # lr_scheduler
    lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS)

    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(
        model, optimizer, criterion, train_dataloader, test_dataloader
    )
    logger.info("initialized colossalai components", ranks=[0])

    # build trainer
    trainer = Trainer(engine=engine, logger=logger)

    # build hooks
    hook_list = [
        hooks.LossHook(),
        hooks.AccuracyHook(),
        hooks.LogMetricByEpochHook(logger),
        hooks.LRSchedulerHook(lr_scheduler, by_epoch=True),
        TotalBatchsizeHook(),

        # comment if you do not need to use the hooks below
        hooks.SaveCheckpointHook(interval=1, checkpoint_dir='./ckpt'),
        hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),
    ]

    # start training
    trainer.fit(
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        epochs=gpc.config.NUM_EPOCHS,
        hooks=hook_list,
        display_progress=True,
        test_interval=1
    )


if __name__ == '__main__':
    main()