diff --git a/examples/language/gpt/benchmark_gemini.sh b/examples/language/gpt/benchmark_gemini.sh
index 86de819e9..8cbca98cf 100644
--- a/examples/language/gpt/benchmark_gemini.sh
+++ b/examples/language/gpt/benchmark_gemini.sh
@@ -1,22 +1,20 @@
-for MODEL_TYPE in "gpt2_medium"
-do
-for BATCH_SIZE in 16
-do
-for GPUNUM in 1 2 4 8
-do
-for TPDEGREE in 1 2 4 8
-do
-if [ ${TPDEGREE} -gt ${GPUNUM} ]
-then
-    continue
-fi
-echo "****************** Begin ***************************"
-echo "* benchmrking MODEL_TYPE ${MODEL_TYPE} BS ${BATCH_SIZE} BS ${BS} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE}"
-MODEL_TYPE=${MODEL_TYPE} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} bash ./run_gemini.sh
-echo "****************** Finished ***************************"
-echo ""
-echo ""
-done
-done
-done
+# Sweep model type x batch size x GPU count x TP degree x Gemini placement policy.
+for MODEL_TYPE in "gpt2_medium"; do
+    for BATCH_SIZE in 16; do
+        for GPUNUM in 1 2 4 8; do
+            for TPDEGREE in 1 2 4 8; do
+                # TP degree cannot exceed the number of available GPUs.
+                if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
+                    continue
+                fi
+                for PLACEMENT in "cpu" "auto"; do
+                    echo "****************** Begin ***************************"
+                    echo "* benchmarking MODEL_TYPE ${MODEL_TYPE} BS ${BATCH_SIZE} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE} PLACEMENT ${PLACEMENT}"
+                    MODEL_TYPE=${MODEL_TYPE} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+                        bash ./run_gemini.sh
+                    echo "****************** Finished ***************************"
+                    echo ""
+                    echo ""
+                done
+            done
+        done
+    done
 done
diff --git a/examples/language/gpt/run_gemini.sh b/examples/language/gpt/run_gemini.sh
index 368790e33..c2b6de567 100644
--- a/examples/language/gpt/run_gemini.sh
+++ b/examples/language/gpt/run_gemini.sh
@@ -10,4 +10,11 @@ export BATCH_SIZE=${BATCH_SIZE:-16}
 export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
 
 mkdir -p gemini_logs
-torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
+torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py \
+--tp_degree=${TPDEGREE} \
+--model_type=${MODEL_TYPE} \
+--batch_size=${BATCH_SIZE} \
+--placement=${PLACEMENT} \
+--shardinit=${USE_SHARD_INIT} \
+--distplan=${DISTPAN} \
+2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
diff --git a/examples/language/gpt/train_gpt_demo.py b/examples/language/gpt/train_gpt_demo.py
index 8704be9e0..b18ff5111 100644
--- a/examples/language/gpt/train_gpt_demo.py
+++ b/examples/language/gpt/train_gpt_demo.py
@@ -217,8 +217,7 @@ def build_gemini(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str
 
 def main():
     # version check
-    # this example is supposed to work for versions less than 0.2.0 but greater than 0.1.9
-    assert version.parse(CAI_VERSION) < version.parse("0.2.0")
+    # this example is supposed to work for versions greater than 0.1.9
     assert version.parse(CAI_VERSION) >= version.parse("0.1.9")
 
     set_cpu_maximum_parallelism()