skip pp eval

This commit is contained in:
Tong Li 2024-08-14 10:07:45 +00:00
parent 409f4b5ab3
commit e87cd8bcfb
2 changed files with 52 additions and 24 deletions

View File

@ -91,7 +91,7 @@ SKIPPED_TESTS=(
llama-gemini_auto-20 # gemini_auto plugin doesn't support lora
llama-gemini-20 # gemini doesn't support lora
)
skip_eval=false
GRAD_CKPTS=('--grad_checkpoint')
for lora_rank in ${LORA_RANK[@]}; do
for model in ${MODELS[@]}; do
@ -129,15 +129,18 @@ for lora_rank in ${LORA_RANK[@]}; do
plugin='3d'
fi
if [[ $plugin == "tp_pp" ]]; then
echo "Here"
tp='2'
bs='8'
pp='2'
plugin='3d'
skip_eval=true
fi
if [[ $plugin == "pp" ]]; then
bs='8'
pp='2'
plugin='3d'
skip_eval=true
fi
if [[ $plugin == "sp_split_gather" ]]; then
enable_sequence_parallelism='--enable_sequence_parallelism'
@ -175,28 +178,53 @@ for lora_rank in ${LORA_RANK[@]}; do
for split in $(seq -f "%05g" 0 0); do
dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split")
done
colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
--pretrain $pretrain \ if [[ $skip_eval ]]; then
--tokenizer_dir $tokenizer_dir \ colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
--dataset ${dataset[@]} \ --pretrain $pretrain \
--eval_dataset ${dataset[@]} \ --tokenizer_dir $tokenizer_dir \
--save_path $MODEL_SAVE_PATH \ --dataset ${dataset[@]} \
--config_file $MODELS_DIR/config.jsonl \ --save_path $MODEL_SAVE_PATH \
$lora_config \ --config_file $MODELS_DIR/config.jsonl \
--plugin $plugin \ $lora_config \
--batch_size $bs \ --plugin $plugin \
--max_epochs 1 \ --batch_size $bs \
--accumulation_steps $grad_accu \ --max_epochs 1 \
--tp $tp \ --accumulation_steps $grad_accu \
--pp $pp \ --tp $tp \
--zero_stage $zero_stage \ --pp $pp \
--sp $sp \ --zero_stage $zero_stage \
--sp_mode $sp_mode \ --sp $sp \
$enable_sequence_parallelism \ --sp_mode $sp_mode \
--lr 2e-5 \ $enable_sequence_parallelism \
$grad_ckpt \ --lr 2e-5 \
--max_len 400 \ $grad_ckpt \
--use_flash_attn --max_len 400 \
--use_flash_attn
else
colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
--pretrain $pretrain \
--tokenizer_dir $tokenizer_dir \
--dataset ${dataset[@]} \
--eval_dataset ${dataset[@]} \
--save_path $MODEL_SAVE_PATH \
--config_file $MODELS_DIR/config.jsonl \
$lora_config \
--plugin $plugin \
--batch_size $bs \
--max_epochs 1 \
--accumulation_steps $grad_accu \
--tp $tp \
--pp $pp \
--zero_stage $zero_stage \
--sp $sp \
--sp_mode $sp_mode \
$enable_sequence_parallelism \
--lr 2e-5 \
$grad_ckpt \
--max_len 400 \
--use_flash_attn
fi
passed=$?
if [ $passed -eq 0 ]; then
rm -rf ${MODEL_SAVE_PATH:?}/*

View File

@ -1332,7 +1332,7 @@ class HybridParallelPlugin(PipelinePluginBase):
or not torch.is_grad_enabled()
):
return outputs
print("Show torch status:", torch.is_grad_enabled())
# Synchronize the grads of shared parameters of the model.
model.sync_shared_params()
# Synchronize sequence parallelism gradients of the model.