From cc1eec2f533dffc1069cbb42fbac3d01670f9371 Mon Sep 17 00:00:00 2001
From: binmakeswell
Date: Mon, 17 Apr 2023 15:02:55 +0800
Subject: [PATCH] [chat] update reward model sh (#3578)

---
 applications/Chat/examples/train_rm.sh | 30 ++++++++++++++++++++------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/applications/Chat/examples/train_rm.sh b/applications/Chat/examples/train_rm.sh
index 4f9f55b6b..80abe62d2 100755
--- a/applications/Chat/examples/train_rm.sh
+++ b/applications/Chat/examples/train_rm.sh
@@ -1,8 +1,24 @@
-set_n_least_used_CUDA_VISIBLE_DEVICES 1
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+ local n=${1:-"9999"}
+ echo "GPU Memory Usage:"
+ local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
+ | tail -n +2 \
+ | nl -v 0 \
+ | tee /dev/tty \
+ | sort -g -k 2 \
+ | awk '{print $1}' \
+ | head -n $n)
+ export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+ echo "Now CUDA_VISIBLE_DEVICES is set to:"
+ echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
 
-python train_reward_model.py --pretrain 'microsoft/deberta-v3-large' \
- --model 'deberta' \
- --strategy naive \
- --loss_fn 'log_exp'\
- --save_path 'rmstatic.pt' \
- --test True
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
+
+torchrun --standalone --nproc_per_node=2 train_reward_model.py \
+ --pretrain \
+ --model 'bloom' \
+ --strategy colossalai_zero2 \
+ --loss_fn 'log_sig'\
+ --save_path \
+ --dataset 'Anthropic/hh-rlhf'\