From f525d1f528dc25518c931f9e1f294787cf1b59b6 Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 13 Jan 2023 22:37:31 +0800 Subject: [PATCH] [example] update gpt gemini example ci test (#2477) --- .../language/gpt/gemini/train_gpt_demo.py | 5 ++-- examples/language/gpt/test_ci.sh | 25 +++++++++---------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py index 7bec980f9..f77be12d2 100644 --- a/examples/language/gpt/gemini/train_gpt_demo.py +++ b/examples/language/gpt/gemini/train_gpt_demo.py @@ -65,6 +65,7 @@ def parse_args(): default="gpt2_medium", help="model model scale", ) + parser.add_argument("--steps", type=int, default=10, help="num of training steps") args = parser.parse_args() return args @@ -236,7 +237,7 @@ def main(): SEQ_LEN = 1024 VOCAB_SIZE = 50257 - NUM_STEPS = 10 + NUM_STEPS = args.steps WARMUP_STEPS = 1 assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps" assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median " @@ -290,14 +291,12 @@ def main(): from torch.distributed.optim import ZeroRedundancyOptimizer optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01) elif args.distplan.startswith("zero"): - pg = ProcessGroup() model = model.half() partition_flag = (args.distplan == "zero2") optimizer = torch.optim.Adam(model.parameters(), lr=0.01) optimizer = LowLevelZeroOptimizer( optimizer, - pg=pg, reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, partition_grad=partition_flag, diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index ad0cfa325..d04ece182 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -1,16 +1,15 @@ pip install -r requirements.txt -# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"] -export DISTPAN="colossalai" +# test colossalai +for TP in 1 2; do + for PLACEMENT in "cpu" "cuda" "auto" "const"; do + for SHARD in "True" "False"; do + colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1 + done + done +done -# The following options only valid when DISTPAN="colossalai" -export TPDEGREE=2 -export GPUNUM=4 -export PLACEMENT='cpu' -export USE_SHARD_INIT=False -export BATCH_SIZE=8 -export MODEL_TYPE="gpt2_medium" - - -mkdir -p logs -torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log +# test zero1&2 +for DIST in "zero1" "zero2"; do + colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1 +done