diff --git a/examples/tutorial/auto_parallel/auto_ckpt_demo.ipynb b/examples/tutorial/auto_parallel/auto_ckpt_demo.ipynb new file mode 100644 index 000000000..cacf5d5f3 --- /dev/null +++ b/examples/tutorial/auto_parallel/auto_ckpt_demo.ipynb @@ -0,0 +1,878 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/lcsjy/.conda/envs/autoparallel/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "data": { + "text/html": [ + "
[11/10/22 18:04:14] INFO colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n", + " store_based_barrier_key:1 to store for rank: 0 \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m[11/10/22 18:04:14]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n", + "\u001b[2;36m \u001b[0m store_based_barrier_key:\u001b[1;36m1\u001b[0m to store for rank: \u001b[1;36m0\u001b[0m \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
INFO colossalai - torch.distributed.distributed_c10d - INFO: Rank 0: Completed store-based \n", + " barrier for key:store_based_barrier_key:1 with 1 nodes. \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Rank \u001b[1;36m0\u001b[0m: Completed store-based \n", + "\u001b[2;36m \u001b[0m barrier for key:store_based_barrier_key:\u001b[1;36m1\u001b[0m with \u001b[1;36m1\u001b[0m nodes. \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
INFO colossalai - colossalai - INFO: \n", + " /home/lcsjy/ColossalAI/colossalai/context/parallel_context.py:521 set_device \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: \n", + "\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/context/\u001b[0m\u001b[95mparallel_context.py\u001b[0m:\u001b[1;36m521\u001b[0m set_device \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
INFO colossalai - colossalai - INFO: process rank 0 is bound to device 0 \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: process rank \u001b[1;36m0\u001b[0m is bound to device \u001b[1;36m0\u001b[0m \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
INFO colossalai - colossalai - INFO: \n", + " /home/lcsjy/ColossalAI/colossalai/context/parallel_context.py:557 set_seed \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: \n", + "\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/context/\u001b[0m\u001b[95mparallel_context.py\u001b[0m:\u001b[1;36m557\u001b[0m set_seed \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
INFO colossalai - colossalai - INFO: initialized seed on rank 0, numpy: 1024, python \n", + " random: 1024, ParallelMode.DATA: 1024, ParallelMode.TENSOR: 1024,the default parallel \n", + " seed is ParallelMode.DATA. \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: initialized seed on rank \u001b[1;36m0\u001b[0m, numpy: \u001b[1;36m1024\u001b[0m, python \n", + "\u001b[2;36m \u001b[0m random: \u001b[1;36m1024\u001b[0m, ParallelMode.DATA: \u001b[1;36m1024\u001b[0m, ParallelMode.TENSOR: \u001b[1;36m1024\u001b[0m,the default parallel \n", + "\u001b[2;36m \u001b[0m seed is ParallelMode.DATA. \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
INFO colossalai - colossalai - INFO: /home/lcsjy/ColossalAI/colossalai/initialize.py:117 \n", + " launch \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: \u001b[35m/home/lcsjy/ColossalAI/colossalai/\u001b[0m\u001b[95minitialize.py\u001b[0m:\u001b[1;36m117\u001b[0m \n", + "\u001b[2;36m \u001b[0m launch \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
INFO colossalai - colossalai - INFO: Distributed environment is initialized, data parallel \n", + " size: 1, pipeline parallel size: 1, tensor parallel size: 1 \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: Distributed environment is initialized, data parallel \n", + "\u001b[2;36m \u001b[0m size: \u001b[1;36m1\u001b[0m, pipeline parallel size: \u001b[1;36m1\u001b[0m, tensor parallel size: \u001b[1;36m1\u001b[0m \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import time\n", + "import torchvision.models as tm\n", + "import torch\n", + "import colossalai\n", + "from colossalai.fx import symbolic_trace, metainfo_trace\n", + "from colossalai.auto_parallel.checkpoint import CheckpointSolverRotor\n", + "from functools import partial\n", + "from colossalai.utils import free_port\n", + "\n", + "from bench_utils import bench, bench_rotor\n", + "import matplotlib.pyplot as plt\n", + "\n", + "colossalai.launch(config={}, rank=0, world_size=1, host='localhost', port=free_port(), backend='nccl')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ResNet152 with batch size = 512 fails" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(78990.4404296875, inf)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def data_gen(batch_size, shape, device='cuda'):\n", + " data = torch.empty(batch_size, *shape, device=device)\n", + " label = torch.empty(batch_size, dtype=torch.long, device=device).random_(1000)\n", + " return {'x': data}, label\n", + "\n", + "model = tm.resnet152()\n", + "gm = symbolic_trace(model)\n", + "gm = metainfo_trace(gm, torch.empty(512, 3, 224, 224, device='meta'))\n", + "bench(gm, torch.nn.CrossEntropyLoss(), partial(data_gen, batch_size=512, shape=(3, 224, 224)), num_steps=5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ResNet152 with batch size = 2048 succeeds " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(74495.8486328125, 5634.262561798096)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def data_gen(batch_size, shape, device='cuda'):\n", + " data = torch.empty(batch_size, *shape, device=device)\n", + " label = torch.empty(batch_size, dtype=torch.long, device=device).random_(1000)\n", + " return {'x': data}, label\n", + "\n", + "model = tm.resnet152()\n", + "gm = symbolic_trace(model)\n", + "gm = metainfo_trace(gm, torch.empty(2048, 3, 224, 224, device='meta'))\n", + "solver = CheckpointSolverRotor(gm.graph, free_memory=torch.cuda.mem_get_info(device=0)[0] * 0.95)\n", + "gm.graph = solver.solve()\n", + "bench(gm, torch.nn.CrossEntropyLoss(), partial(data_gen, batch_size=2048, shape=(3, 224, 224)), num_steps=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Benchmarking on ResNet18" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[11/10/22 18:04:20] WARNING colossalai - colossalai - WARNING: \n", + " /home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py:82 \n", + " solve \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m[11/10/22 18:04:20]\u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n", + "\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n", + "\u001b[2;36m \u001b[0m solve \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
WARNING colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n", + " chain from index 0 to 14 with memory 500 \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n", + "\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[11/10/22 18:04:23] WARNING colossalai - colossalai - WARNING: \n", + " /home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py:82 \n", + " solve \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m[11/10/22 18:04:23]\u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n", + "\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n", + "\u001b[2;36m \u001b[0m solve \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
WARNING colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n", + " chain from index 0 to 14 with memory 500 \n", + "\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n", + "\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def data_gen(batch_size, shape, device='cuda'):\n", + " data = torch.empty(batch_size, *shape, device=device)\n", + " label = torch.empty(batch_size, dtype=torch.long, device=device).random_(1000)\n", + " return (data, ), label\n", + "\n", + "model = tm.resnet18()\n", + "gm = symbolic_trace(model)\n", + "gm = metainfo_trace(gm, torch.empty(128, 3, 224, 224, device='meta'))\n", + "peak_hist, step_hist = bench_rotor(gm, torch.nn.CrossEntropyLoss(), partial(data_gen, batch_size=128, shape=(3, 224, 224)), num_steps=5, sample_points=20, free_memory=2700 * 1024**2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[