{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/lcsjy/.conda/envs/autoparallel/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "data": { "text/html": [ "
[11/10/22 18:04:14] INFO colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n", " store_based_barrier_key:1 to store for rank: 0 \n", "\n" ], "text/plain": [ "\u001b[2;36m[11/10/22 18:04:14]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Added key: \n", "\u001b[2;36m \u001b[0m store_based_barrier_key:\u001b[1;36m1\u001b[0m to store for rank: \u001b[1;36m0\u001b[0m \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFO colossalai - torch.distributed.distributed_c10d - INFO: Rank 0: Completed store-based \n", " barrier for key:store_based_barrier_key:1 with 1 nodes. \n", "\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - torch.distributed.distributed_c10d - INFO: Rank \u001b[1;36m0\u001b[0m: Completed store-based \n", "\u001b[2;36m \u001b[0m barrier for key:store_based_barrier_key:\u001b[1;36m1\u001b[0m with \u001b[1;36m1\u001b[0m nodes. \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFO colossalai - colossalai - INFO: \n", " /home/lcsjy/ColossalAI/colossalai/context/parallel_context.py:521 set_device \n", "\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: \n", "\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/context/\u001b[0m\u001b[95mparallel_context.py\u001b[0m:\u001b[1;36m521\u001b[0m set_device \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFO colossalai - colossalai - INFO: process rank 0 is bound to device 0 \n", "\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: process rank \u001b[1;36m0\u001b[0m is bound to device \u001b[1;36m0\u001b[0m \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFO colossalai - colossalai - INFO: \n", " /home/lcsjy/ColossalAI/colossalai/context/parallel_context.py:557 set_seed \n", "\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: \n", "\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/context/\u001b[0m\u001b[95mparallel_context.py\u001b[0m:\u001b[1;36m557\u001b[0m set_seed \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFO colossalai - colossalai - INFO: initialized seed on rank 0, numpy: 1024, python \n", " random: 1024, ParallelMode.DATA: 1024, ParallelMode.TENSOR: 1024,the default parallel \n", " seed is ParallelMode.DATA. \n", "\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: initialized seed on rank \u001b[1;36m0\u001b[0m, numpy: \u001b[1;36m1024\u001b[0m, python \n", "\u001b[2;36m \u001b[0m random: \u001b[1;36m1024\u001b[0m, ParallelMode.DATA: \u001b[1;36m1024\u001b[0m, ParallelMode.TENSOR: \u001b[1;36m1024\u001b[0m,the default parallel \n", "\u001b[2;36m \u001b[0m seed is ParallelMode.DATA. \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFO colossalai - colossalai - INFO: /home/lcsjy/ColossalAI/colossalai/initialize.py:117 \n", " launch \n", "\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: \u001b[35m/home/lcsjy/ColossalAI/colossalai/\u001b[0m\u001b[95minitialize.py\u001b[0m:\u001b[1;36m117\u001b[0m \n", "\u001b[2;36m \u001b[0m launch \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFO colossalai - colossalai - INFO: Distributed environment is initialized, data parallel \n", " size: 1, pipeline parallel size: 1, tensor parallel size: 1 \n", "\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m colossalai - colossalai - INFO: Distributed environment is initialized, data parallel \n", "\u001b[2;36m \u001b[0m size: \u001b[1;36m1\u001b[0m, pipeline parallel size: \u001b[1;36m1\u001b[0m, tensor parallel size: \u001b[1;36m1\u001b[0m \n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import time\n", "import torchvision.models as tm\n", "import torch\n", "import colossalai\n", "from colossalai.fx import symbolic_trace, metainfo_trace\n", "from colossalai.auto_parallel.checkpoint import CheckpointSolverRotor\n", "from functools import partial\n", "from colossalai.utils import free_port\n", "\n", "from bench_utils import bench, bench_rotor\n", "import matplotlib.pyplot as plt\n", "\n", "colossalai.launch(config={}, rank=0, world_size=1, host='localhost', port=free_port(), backend='nccl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ResNet152 with batch size = 512 fails without activation checkpointing" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(78990.4404296875, inf)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def data_gen(batch_size, shape, device='cuda'):\n", " data = torch.empty(batch_size, *shape, device=device)\n", " label = torch.empty(batch_size, dtype=torch.long, device=device).random_(1000)\n", " return {'x': data}, label\n", "\n", "model = tm.resnet152()\n", "gm = symbolic_trace(model)\n", "gm = metainfo_trace(gm, torch.empty(512, 3, 224, 224, device='meta'))\n", "bench(gm, torch.nn.CrossEntropyLoss(), partial(data_gen, batch_size=512, shape=(3, 224, 224)), num_steps=5)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ResNet152 with batch size = 2048 succeeds with the rotor checkpoint solver" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(74495.8486328125, 5634.262561798096)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def data_gen(batch_size, shape, device='cuda'):\n", " data = torch.empty(batch_size, *shape, device=device)\n", " label = torch.empty(batch_size, dtype=torch.long, device=device).random_(1000)\n", " return {'x': data}, label\n", "\n", "model = tm.resnet152()\n", "gm = symbolic_trace(model)\n", "gm = metainfo_trace(gm, torch.empty(2048, 3, 224, 224, device='meta'))\n", "solver = CheckpointSolverRotor(gm.graph, free_memory=torch.cuda.mem_get_info(device=0)[0] * 0.95)\n", "gm.graph = solver.solve()\n", "bench(gm, torch.nn.CrossEntropyLoss(), partial(data_gen, batch_size=2048, shape=(3, 224, 224)), num_steps=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Benchmarking on ResNet18\n", "\n", "`bench_rotor` sweeps the rotor checkpoint solver over a range of memory budgets (20 sample points up to 2700 MiB here). Budgets at the low end of the sweep are too small for the solver to schedule the chain, which produces the warnings below." ] },
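{ "cell_type": "markdown", "metadata": {}, "source": [ "Note: `bench` and `bench_rotor` are imported from the local `bench_utils` module, which is not included in this notebook. The next cell is only a rough, hedged sketch of the kind of measurement loop `bench` presumably runs -- a few forward/backward steps, reporting peak CUDA memory (MB) and average step time (ms). It is not the actual `bench_utils` implementation, and the helper name `bench_sketch` is invented for illustration." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hedged sketch only: NOT the real bench_utils.bench.\n", "# Assumption: bench-style callables return (peak CUDA memory in MB, average step time in ms).\n", "def bench_sketch(gm, criterion, data_gen, num_steps=5):\n", "    gm = gm.cuda()\n", "    optimizer = torch.optim.SGD(gm.parameters(), lr=1e-3)\n", "    torch.cuda.reset_peak_memory_stats()\n", "    torch.cuda.synchronize()\n", "    start = time.time()\n", "    for _ in range(num_steps):\n", "        args, label = data_gen()\n", "        # data_gen may return either a kwargs dict or a positional tuple\n", "        output = gm(**args) if isinstance(args, dict) else gm(*args)\n", "        loss = criterion(output, label)\n", "        optimizer.zero_grad()\n", "        loss.backward()\n", "        optimizer.step()\n", "    torch.cuda.synchronize()\n", "    peak_mb = torch.cuda.max_memory_allocated() / 1024**2\n", "    step_ms = (time.time() - start) / num_steps * 1000\n", "    return peak_mb, step_ms" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "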
[11/10/22 18:04:20] WARNING colossalai - colossalai - WARNING: \n", " /home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py:82 \n", " solve \n", "\n" ], "text/plain": [ "\u001b[2;36m[11/10/22 18:04:20]\u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n", "\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n", "\u001b[2;36m \u001b[0m solve \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
WARNING colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n", " chain from index 0 to 14 with memory 500 \n", "\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n", "\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
[11/10/22 18:04:23] WARNING colossalai - colossalai - WARNING: \n", " /home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py:82 \n", " solve \n", "\n" ], "text/plain": [ "\u001b[2;36m[11/10/22 18:04:23]\u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: \n", "\u001b[2;36m \u001b[0m \u001b[35m/home/lcsjy/ColossalAI/colossalai/auto_parallel/checkpoint/\u001b[0m\u001b[95mckpt_solver_rotor.py\u001b[0m:\u001b[1;36m82\u001b[0m \n", "\u001b[2;36m \u001b[0m solve \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
WARNING colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n", " chain from index 0 to 14 with memory 500 \n", "\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[31mWARNING \u001b[0m colossalai - colossalai - WARNING: Checkpoint solver failed: Can not process this \n", "\u001b[2;36m \u001b[0m chain from index \u001b[1;36m0\u001b[0m to \u001b[1;36m14\u001b[0m with memory \u001b[1;36m500\u001b[0m \n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def data_gen(batch_size, shape, device='cuda'):\n", " data = torch.empty(batch_size, *shape, device=device)\n", " label = torch.empty(batch_size, dtype=torch.long, device=device).random_(1000)\n", " return (data, ), label\n", "\n", "model = tm.resnet18()\n", "gm = symbolic_trace(model)\n", "gm = metainfo_trace(gm, torch.empty(128, 3, 224, 224, device='meta'))\n", "peak_hist, step_hist = bench_rotor(gm, torch.nn.CrossEntropyLoss(), partial(data_gen, batch_size=128, shape=(3, 224, 224)), num_steps=5, sample_points=20, free_memory=2700 * 1024**2)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[