mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-01 09:07:51 +00:00
[ci] fixed booster test (#5251)
* [ci] fixed booster test * [ci] fixed booster test * [ci] fixed booster test
This commit is contained in:
4
.github/workflows/build_on_pr.yml
vendored
4
.github/workflows/build_on_pr.yml
vendored
@@ -90,7 +90,7 @@ jobs:
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
|
||||
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
|
||||
options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
|
||||
timeout-minutes: 60
|
||||
defaults:
|
||||
run:
|
||||
@@ -165,7 +165,6 @@ jobs:
|
||||
--ignore tests/test_checkpoint_io \
|
||||
tests/
|
||||
env:
|
||||
NCCL_SHM_DISABLE: 1
|
||||
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
|
||||
LLAMA_PATH: /data/scratch/llama-tiny
|
||||
|
||||
@@ -205,4 +204,3 @@ jobs:
|
||||
with:
|
||||
name: report
|
||||
path: report/
|
||||
|
||||
|
9
.github/workflows/build_on_schedule.yml
vendored
9
.github/workflows/build_on_schedule.yml
vendored
@@ -13,15 +13,16 @@ jobs:
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
|
||||
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
|
||||
options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Check GPU Availability # ensure all GPUs have enough memory
|
||||
id: check-avai
|
||||
run: |
|
||||
avai=true
|
||||
for i in $(seq 0 3);
|
||||
do
|
||||
ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
|
||||
endIndex=$(($ngpu-1))
|
||||
for i in $(seq 0 $endIndex);
|
||||
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
|
||||
[ "$gpu_used" -gt "2000" ] && avai=false
|
||||
done
|
||||
@@ -74,7 +75,7 @@ jobs:
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
|
||||
msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
|
||||
msg="Scheduled Build and Test failed, please visit $url for details"
|
||||
echo $msg
|
||||
python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
|
||||
env:
|
||||
|
Reference in New Issue
Block a user