[example] update gpt gemini example ci test (#2477)

commit f525d1f528
parent fef5c949c3
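This patch wires the Gemini GPT example into CI: the training script gains a --steps argument so the number of training steps can be set from the command line, the zero1/zero2 branch no longer builds a ProcessGroup or passes pg= to LowLevelZeroOptimizer, and the example's run script is rewritten to sweep a small test matrix with colossalai run instead of exporting variables for a single torchrun launch.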
@@ -65,6 +65,7 @@ def parse_args():
         default="gpt2_medium",
         help="model model scale",
     )
+    parser.add_argument("--steps", type=int, default=10, help="num of training steps")
     args = parser.parse_args()
     return args
 
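The flag defaults to 10, matching the previously hard-coded NUM_STEPS (next hunk), so manual runs behave as before; the CI script at the end of this diff passes --steps 4 to keep the smoke test short.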
@@ -236,7 +237,7 @@ def main():
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
 
-    NUM_STEPS = 10
+    NUM_STEPS = args.steps
     WARMUP_STEPS = 1
     assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
     assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "
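The second assertion exists because the demo presumably reports a median per-step time over the non-warmup steps; with an odd number of measured steps the median is an actual measurement rather than an average of two values. A minimal illustrative sketch (the timings and variable values below are made up, and the demo's real timing code is not part of this diff):

# Why the assert wants an odd number of non-warmup steps: the median of an
# odd-length list is one of the measured step times, not an interpolation.
import statistics

NUM_STEPS = 4                            # what CI passes via --steps 4
WARMUP_STEPS = 1
step_times = [0.52, 0.31, 0.30, 0.33]    # hypothetical per-step timings (seconds)

assert WARMUP_STEPS < NUM_STEPS
assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1

valid = step_times[WARMUP_STEPS:]        # drop the warmup measurement
print(statistics.median(valid))          # 0.31, an actual measured step time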
@@ -290,14 +291,12 @@ def main():
         from torch.distributed.optim import ZeroRedundancyOptimizer
         optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)
     elif args.distplan.startswith("zero"):
-        pg = ProcessGroup()
         model = model.half()
         partition_flag = (args.distplan == "zero2")
         optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
 
         optimizer = LowLevelZeroOptimizer(
             optimizer,
-            pg=pg,
             reduce_bucket_size=12 * 1024 * 1024,
             overlap_communication=True,
             partition_grad=partition_flag,
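For reference, a minimal sketch of how the zero1/zero2 branch reads after this patch, with the explicit ProcessGroup gone so the optimizer falls back to the default group. All parameter names are taken from the diff above; the LowLevelZeroOptimizer import path is an assumption and differs across ColossalAI releases:

import torch
from colossalai.zero import LowLevelZeroOptimizer  # assumed path; varies by ColossalAI version

def build_zero_optimizer(model, distplan):
    # Cast the model to fp16. "zero2" additionally partitions gradients across
    # ranks, while "zero1" only shards optimizer states.
    model = model.half()
    partition_flag = (distplan == "zero2")
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # Gradients are reduced in 12 MB buckets, overlapping communication with
    # the backward pass; no explicit process group is passed any more.
    optimizer = LowLevelZeroOptimizer(
        optimizer,
        reduce_bucket_size=12 * 1024 * 1024,
        overlap_communication=True,
        partition_grad=partition_flag,
    )
    return model, optimizer

The remaining hunk rewrites the example's shell run script: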
@@ -1,16 +1,15 @@
 pip install -r requirements.txt
 
-# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
-export DISTPAN="colossalai"
+# test colossalai
+for TP in 1 2; do
+for PLACEMENT in "cpu" "cuda" "auto" "const"; do
+for SHARD in "True" "False"; do
+colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
+done
+done
+done
 
-# The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
-export GPUNUM=4
-export PLACEMENT='cpu'
-export USE_SHARD_INIT=False
-export BATCH_SIZE=8
-export MODEL_TYPE="gpt2_medium"
-
-
-mkdir -p logs
-torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
+# test zero1&2
+for DIST in "zero1" "zero2"; do
+colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
+done
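With this rewrite the run script doubles as the CI test: it sweeps 2 tensor-parallel degrees x 4 placement policies x 2 shard-init settings, i.e. 16 Gemini configurations, plus the zero1 and zero2 plans, each launched on 4 processes for only 4 steps, and any failing invocation aborts the script through the "|| exit 1" guard.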