mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-24 03:03:37 +00:00
[gemini] update the gpt example (#2527)
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
set -x
|
||||
# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
|
||||
export DISTPLAN=${DISTPLAN:-"colossalai"}
|
||||
# distplan in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]
|
||||
export DISTPLAN=${DISTPLAN:-"CAI_Gemini"}
|
||||
|
||||
# The following options are only valid when DISTPLAN="CAI_Gemini"
|
||||
export GPUNUM=${GPUNUM:-1}
|
||||
@@ -12,6 +12,12 @@ export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
|
||||
export TRAIN_STEP=${TRAIN_STEP:-10}
|
||||
# export PYTHONPATH=$PWD:$PYTHONPATH
|
||||
|
||||
# Translate the boolean-style USE_SHARD_INIT setting into a command-line
# switch: "True" becomes the bare flag --shardinit, anything else becomes an
# empty string, so the later unquoted ${USE_SHARD_INIT} on the torchrun
# command line contributes no argument at all.
# The expansion is quoted and given an empty default so that an unset, empty,
# or whitespace-containing value cannot break the [ ... ] test.
if [ "${USE_SHARD_INIT:-}" = "True" ]; then
  USE_SHARD_INIT="--shardinit"
else
  USE_SHARD_INIT=""
fi
|
||||
|
||||
mkdir -p gemini_logs
|
||||
|
||||
torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
|
||||
@@ -19,7 +25,7 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
|
||||
--model_type=${MODEL_TYPE} \
|
||||
--batch_size=${BATCH_SIZE} \
|
||||
--placement=${PLACEMENT} \
|
||||
--shardinit=${USE_SHARD_INIT} \
|
||||
${USE_SHARD_INIT} \
|
||||
--distplan=${DISTPLAN} \
|
||||
--train_step=${TRAIN_STEP} \
|
||||
2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
|
||||
|
Reference in New Issue
Block a user