From e87cd8bcfb33f6a8274f4beb0ec002e555329989 Mon Sep 17 00:00:00 2001
From: Tong Li
Date: Wed, 14 Aug 2024 10:07:45 +0000
Subject: [PATCH] skip pp eval

---
 applications/ColossalChat/tests/test_train.sh | 74 +++++++++++++------
 .../booster/plugin/hybrid_parallel_plugin.py  |  2 +-
 2 files changed, 52 insertions(+), 24 deletions(-)

diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index 3b06495cb..2935a6369 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -91,7 +91,7 @@ SKIPPED_TESTS=(
     llama-gemini_auto-20 # gemini_auto plugin doesn't support lora
     llama-gemini-20 # gemini doesn't support lora
 )
-
+skip_eval=false
 GRAD_CKPTS=('--grad_checkpoint')
 for lora_rank in ${LORA_RANK[@]}; do
     for model in ${MODELS[@]}; do
@@ -129,15 +129,18 @@ for lora_rank in ${LORA_RANK[@]}; do
                 plugin='3d'
             fi
             if [[ $plugin == "tp_pp" ]]; then
+                echo "Here"
                 tp='2'
                 bs='8'
                 pp='2'
                 plugin='3d'
+                skip_eval=true
             fi
             if [[ $plugin == "pp" ]]; then
                 bs='8'
                 pp='2'
                 plugin='3d'
+                skip_eval=true
             fi
             if [[ $plugin == "sp_split_gather" ]]; then
                 enable_sequence_parallelism='--enable_sequence_parallelism'
@@ -175,28 +178,53 @@ for lora_rank in ${LORA_RANK[@]}; do
             for split in $(seq -f "%05g" 0 0); do
                 dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split")
             done
-            colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
-                --pretrain $pretrain \
-                --tokenizer_dir $tokenizer_dir \
-                --dataset ${dataset[@]} \
-                --eval_dataset ${dataset[@]} \
-                --save_path $MODEL_SAVE_PATH \
-                --config_file $MODELS_DIR/config.jsonl \
-                $lora_config \
-                --plugin $plugin \
-                --batch_size $bs \
-                --max_epochs 1 \
-                --accumulation_steps $grad_accu \
-                --tp $tp \
-                --pp $pp \
-                --zero_stage $zero_stage \
-                --sp $sp \
-                --sp_mode $sp_mode \
-                $enable_sequence_parallelism \
-                --lr 2e-5 \
-                $grad_ckpt \
-                --max_len 400 \
-                --use_flash_attn
+
+            if [[ $skip_eval == true ]]; then
+                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
+                    --pretrain $pretrain \
+                    --tokenizer_dir $tokenizer_dir \
+                    --dataset ${dataset[@]} \
+                    --save_path $MODEL_SAVE_PATH \
+                    --config_file $MODELS_DIR/config.jsonl \
+                    $lora_config \
+                    --plugin $plugin \
+                    --batch_size $bs \
+                    --max_epochs 1 \
+                    --accumulation_steps $grad_accu \
+                    --tp $tp \
+                    --pp $pp \
+                    --zero_stage $zero_stage \
+                    --sp $sp \
+                    --sp_mode $sp_mode \
+                    $enable_sequence_parallelism \
+                    --lr 2e-5 \
+                    $grad_ckpt \
+                    --max_len 400 \
+                    --use_flash_attn
+            else
+                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
+                    --pretrain $pretrain \
+                    --tokenizer_dir $tokenizer_dir \
+                    --dataset ${dataset[@]} \
+                    --eval_dataset ${dataset[@]} \
+                    --save_path $MODEL_SAVE_PATH \
+                    --config_file $MODELS_DIR/config.jsonl \
+                    $lora_config \
+                    --plugin $plugin \
+                    --batch_size $bs \
+                    --max_epochs 1 \
+                    --accumulation_steps $grad_accu \
+                    --tp $tp \
+                    --pp $pp \
+                    --zero_stage $zero_stage \
+                    --sp $sp \
+                    --sp_mode $sp_mode \
+                    $enable_sequence_parallelism \
+                    --lr 2e-5 \
+                    $grad_ckpt \
+                    --max_len 400 \
+                    --use_flash_attn
+            fi
             passed=$?
             if [ $passed -eq 0 ]; then
                 rm -rf ${MODEL_SAVE_PATH:?}/*
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index e359957f5..e5acdb051 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1332,7 +1332,7 @@ class HybridParallelPlugin(PipelinePluginBase):
             or not torch.is_grad_enabled()
         ):
             return outputs
-        print("Show torch status:", torch.is_grad_enabled())
+
         # Synchronize the grads of shared parameters of the model.
         model.sync_shared_params()
         # Synchronize sequence parallelism gradients of the model.
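
The two colossalai run invocations added above differ only in the --eval_dataset flag. As a rough alternative sketch (not what this patch does; it assumes the loop variables skip_eval, dataset, plugin, tp, pp, etc. are in scope exactly as in the script), the optional flag could be collected in a bash array so the launch command is written only once:

    # Sketch only: an empty array expands to nothing, so the eval flag is
    # simply omitted when skip_eval is true.
    eval_args=()
    if [[ $skip_eval != true ]]; then
        eval_args=(--eval_dataset "${dataset[@]}")
    fi
    colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
        --pretrain $pretrain \
        --tokenizer_dir $tokenizer_dir \
        --dataset "${dataset[@]}" \
        "${eval_args[@]}" \
        --save_path $MODEL_SAVE_PATH \
        --config_file $MODELS_DIR/config.jsonl \
        $lora_config \
        --plugin $plugin \
        --batch_size $bs \
        --max_epochs 1 \
        --accumulation_steps $grad_accu \
        --tp $tp \
        --pp $pp \
        --zero_stage $zero_stage \
        --sp $sp \
        --sp_mode $sp_mode \
        $enable_sequence_parallelism \
        --lr 2e-5 \
        $grad_ckpt \
        --max_len 400 \
        --use_flash_attn
    passed=$?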