Mirror of https://github.com/hpcaitech/ColossalAI.git
[CI] add test_ci.sh for palm, opt and gpt (#2475)
@@ -9,7 +9,7 @@ export PLACEMENT=${PLACEMENT:-"cpu"}
export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
export BATCH_SIZE=${BATCH_SIZE:-16}
export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}

export TRAIN_STEP=${TRAIN_STEP:-10}
# export PYTHONPATH=$PWD:$PYTHONPATH

mkdir -p gemini_logs
@@ -21,4 +21,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
--placement=${PLACEMENT} \
--shardinit=${USE_SHARD_INIT} \
--distplan=${DISTPLAN} \
--train_step=${TRAIN_STEP} \
2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
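Both hunks follow the script's existing ${VAR:-default} pattern: every setting keeps its default unless the caller exports a value first, and TRAIN_STEP is now forwarded to the training script as --train_step. A minimal sketch of how a caller overrides these defaults (the script is presumably run_gemini.sh, which the new test_ci.sh below invokes; variable names come from the diff and the short step count is only an illustrative choice):

    # Run with the script's defaults: gpt2_medium, batch size 16, 10 training steps.
    bash ./run_gemini.sh

    # Override selected settings for a quick smoke test; anything left unset keeps its default.
    MODEL_TYPE=gpt2_medium BATCH_SIZE=2 TRAIN_STEP=4 GPUNUM=1 bash ./run_gemini.sh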
examples/language/gpt/gemini/test_ci.sh (new file, 35 lines)
@@ -0,0 +1,35 @@
set -x
$(cd `dirname $0`;pwd)
export TRAIN_STEP=4

for MODEL_TYPE in "gpt2_medium"; do
  for DISTPLAN in "colossalai"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1 2; do
          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
            continue
          fi
          for PLACEMENT in "cpu" "auto"; do
            MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
              bash ./run_gemini.sh
          done
        done
      done
    done
  done

  for DISTPLAN in "zero1" "zero2"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1; do
          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
            continue
          fi
          MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE}\
            bash ./run_gemini.sh
        done
      done
    done
  done
done
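Each iteration is just an environment-prefixed call to run_gemini.sh, so the assignments apply to that single invocation only, and combinations where the tensor-parallel degree exceeds the GPU count are skipped. For example, one iteration of the first block expands to roughly the following (values taken from the loop ranges above):

    # One "colossalai" configuration from the sweep: 4 GPUs, TP degree 2, auto placement.
    MODEL_TYPE=gpt2_medium DISTPLAN=colossalai BATCH_SIZE=2 GPUNUM=4 TPDEGREE=2 PLACEMENT=auto \
        bash ./run_gemini.sh
    # TRAIN_STEP=4 is exported once at the top, so every run in the sweep uses 4 steps.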
(The remaining hunks modify the Python training script, presumably train_gpt_demo.py, which run_gemini.sh launches.)

@@ -65,7 +65,13 @@ def parse_args():
        default="gpt2_medium",
        help="model model scale",
    )
    parser.add_argument("--steps", type=int, default=10, help="num of training steps")
    parser.add_argument(
        "--train_step",
        type=int,
        default=10,
        help="training iterations for test",
    )

    args = parser.parse_args()
    return args
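Without the +/- markers, the hunk reads most naturally as the old one-line --steps argument being replaced by the multi-line --train_step definition, matching the flag run_gemini.sh now passes. A hedged sketch of a direct invocation after this change (flag names and values are taken from the torchrun command and defaults shown above):

    # Direct single-GPU run using the renamed flag; --steps is presumably no longer accepted.
    torchrun --standalone --nproc_per_node=1 ./train_gpt_demo.py \
        --distplan=colossalai --placement=cpu --shardinit=False --train_step=4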
@@ -237,7 +243,8 @@ def main():
    SEQ_LEN = 1024
    VOCAB_SIZE = 50257

    NUM_STEPS = args.steps
    NUM_STEPS = args.train_step

    WARMUP_STEPS = 1
    assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
    assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "
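The second assert appears to be new in this commit and ties back to the CI settings: the demo reports the median step time, and a median over an odd number of samples is a single middle measurement rather than an average of two. With TRAIN_STEP=4 from test_ci.sh and WARMUP_STEPS=1 there are 3 timed steps, which satisfies the check; the same arithmetic in shell:

    # Oddness check the new assert enforces, using the CI values (4 total steps, 1 warmup step).
    TRAIN_STEP=4; WARMUP_STEPS=1
    echo $(( (TRAIN_STEP - WARMUP_STEPS) % 2 ))   # prints 1: three timed steps have an exact middle element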