# ColossalAI/.github/workflows/build_on_schedule.yml

name: Build on Schedule

on:
  schedule:
    # run at 00:00 UTC every day (the day-of-week field is `*`, so this is
    # daily, not Sunday-only)
    - cron: "0 0 * * *"
  # also allow the workflow to be started manually
  workflow_dispatch:

jobs:
  # Single job: install TensorNVMe and Colossal-AI inside a CUDA container on a
  # self-hosted GPU runner, run the test suite, and notify Lark on failure.
  build:
    name: Build and Test Colossal-AI
    # Only run in the upstream repository; in any other repository (e.g. a
    # fork) this expression is false and the job is skipped.
    if: github.repository == 'hpcaitech/ColossalAI'
    # Requires a runner carrying both the `self-hosted` and `gpu` labels.
    runs-on: [self-hosted, gpu]
    container:
      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
      # `--gpus all` exposes the host GPUs to the container;
      # `/data/scratch/` is mounted at the same path inside the container
      # (the Unit Testing step's LLAMA_PATH / MOE_TENSOR_PATH point into it).
      options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
    # Abort the job if it is still running after 90 minutes.
    timeout-minutes: 90
    steps:
      # Gate for the rest of the job: publishes the step output `avai`, which
      # is "true" unless some GPU reports more than 2000 of used memory
      # (nvidia-smi prints memory.used in MiB; `nounits` strips the suffix).
      # The install and test steps are skipped when `avai` is not "true".
      - name: Check GPU Availability # ensure all GPUs have enough memory
        id: check-avai
        run: |
          avai=true
          # A single query prints one memory.used value per GPU, one per line.
          # If nvidia-smi prints nothing, the loop body never runs and avai
          # stays true.
          for gpu_used in $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits);
          do
            # Explicit `if` (rather than `[ ... ] && ...`) so the loop never
            # finishes with a non-zero status under the runner's `-e` shell.
            if [ "$gpu_used" -gt "2000" ]; then
              avai=false
            fi
          done

          echo "GPU is available: $avai"
          echo "avai=$avai" >> "$GITHUB_OUTPUT"

      # Clone hpcaitech/TensorNVMe into ./TensorNVMe using the SSH key stored
      # in the SSH_KEY_FOR_CI secret. Skipped when the GPU check did not
      # report `avai == 'true'`.
      # NOTE(review): actions/checkout@v2 is an old major version; consider
      # upgrading once compatibility with the self-hosted runner is confirmed.
      - uses: actions/checkout@v2
        if: steps.check-avai.outputs.avai == 'true'
        with:
          repository: hpcaitech/TensorNVMe
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
          path: TensorNVMe

      # Build and install TensorNVMe from the ./TensorNVMe checkout.
      - name: Install tensornvme
        if: steps.check-avai.outputs.avai == 'true'
        run: |
          cd TensorNVMe
          # -y answers conda's confirmation prompt automatically; nobody can
          # answer it interactively in CI.
          conda install -y cmake
          pip install -r requirements.txt
          # DISABLE_URING=1 is read by TensorNVMe's build; presumably it
          # builds without the io_uring backend - confirm in TensorNVMe docs.
          DISABLE_URING=1 pip install -v .

      # Check out this workflow's own repository (no `repository` or `path` is
      # given, so the action's defaults apply), authenticating with the SSH
      # key from the SSH_KEY_FOR_CI secret.
      - uses: actions/checkout@v2
        if: steps.check-avai.outputs.avai == 'true'
        with:
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}

      # Install Colossal-AI in editable mode with BUILD_EXT=1, reusing the
      # build tree kept in /github/home/cuda_ext_cache/, then install the
      # test requirements.
      - name: Install Colossal-AI
        if: steps.check-avai.outputs.avai == 'true'
        run: |
          # Make sure the cache directory exists. Without it, the
          # `cp -r .../build /github/home/cuda_ext_cache/` below would create
          # cuda_ext_cache itself as a copy of build/ (instead of
          # cuda_ext_cache/build), and the next restore would spill build's
          # contents into the repository root.
          mkdir -p /github/home/cuda_ext_cache/
          # Restore whatever an earlier run cached, if anything.
          if [ -n "$(ls -A /github/home/cuda_ext_cache/)" ]; then
            cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
          fi
          BUILD_EXT=1 pip install -v -e .
          # Save the build tree so a later run can reuse it (assumes
          # /github/home persists between runs on this runner - confirm).
          cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
          pip install --no-cache-dir -r requirements/requirements-test.txt

      # Run pytest over tests/, deselecting tests marked `largedist` and
      # printing per-test durations. The repository root is put on PYTHONPATH.
      - name: Unit Testing
        if: steps.check-avai.outputs.avai == 'true'
        env:
          HF_ENDPOINT: https://hf-mirror.com
          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
          # Both paths live under /data/scratch, which the job's container
          # options mount from the host.
          LLAMA_PATH: /data/scratch/llama-tiny
          MOE_TENSOR_PATH: /data/scratch/moe_tensors
        run: |
          PYTHONPATH=$PWD pytest -m "not largedist" --durations=0 tests/

      # Runs only when an earlier step failed: builds a link to this workflow
      # run and posts it to the Lark webhook.
      # NOTE(review): the script path is relative to the Colossal-AI checkout;
      # if the job fails before that checkout the script will not be present
      # and this step fails too.
      - name: Notify Lark
        id: message-preparation
        if: ${{ failure() }}
        run: |
          url="$SERVER_URL/$REPO/actions/runs/$RUN_ID"
          msg="Scheduled Build and Test failed, please visit $url for details"
          echo "$msg"
          # Quote the webhook URL so the shell passes it as a single, literal
          # argument (no word splitting or glob expansion on `?` / `*`).
          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u "$WEBHOOK_URL"
        env:
          SERVER_URL: ${{ github.server_url }}
          REPO: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}