mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-04-28 03:43:01 +00:00
* [lazy] patch from pretrained * [lazy] fix from pretrained and add tests * [devops] update ci
104 lines
3.3 KiB
YAML
104 lines
3.3 KiB
YAML
name: Compatibility Test on Schedule

on:
  # Weekly run at 03:00 on Sunday, Singapore time (UTC+8). Cron schedules
  # are evaluated in UTC, so the expression below is Saturday 19:00 UTC.
  schedule:
    - cron: '0 19 * * 6'
  # Also allow the workflow to be started manually.
  workflow_dispatch:

jobs:
  # Builds the job matrix: one container image per tag listed in the
  # repository's `.compatibility` file.
  matrix_preparation:
    name: Prepare Container List
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - uses: actions/checkout@v3
      - id: set-matrix
        run: |
          DOCKER_IMAGE=()

          # One image tag per line. `-r` keeps backslashes literal, and the
          # `|| [ -n "$tag" ]` clause keeps a last line that has no trailing
          # newline. Blank lines are skipped so no empty tag is emitted.
          while IFS= read -r tag || [ -n "$tag" ]; do
            [ -n "$tag" ] || continue
            DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"")
          done <.compatibility

          # Join the quoted image names with commas into a JSON array.
          container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
          container="[${container}]"
          echo "$container"
          # Publish the matrix through $GITHUB_OUTPUT; the `::set-output`
          # workflow command is deprecated.
          echo "matrix={\"container\":${container}}" >> "$GITHUB_OUTPUT"

  # Installs Colossal-AI and runs the unit tests once per container image
  # produced by matrix_preparation.
  build:
    name: Test for PyTorch Compatibility
    needs: matrix_preparation
    if: github.repository == 'hpcaitech/ColossalAI'
    runs-on: [self-hosted, 8-gpu]
    strategy:
      # Let the remaining images finish even if one of them fails.
      fail-fast: false
      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
    container:
      image: ${{ matrix.container }}
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --gpus all --rm
        -v /data/scratch/cifar-10:/data/scratch/cifar-10
        -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
    timeout-minutes: 120
    steps:
      - name: Install dependencies
        run: |
          pip install -U pip setuptools wheel --user

      # Same checkout version as the matrix_preparation job.
      - uses: actions/checkout@v3
        with:
          repository: hpcaitech/TensorNVMe
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
          path: TensorNVMe

      - name: Install tensornvme
        run: |
          cd TensorNVMe
          apt update && apt install -y cmake
          pip install -r requirements.txt
          pip install -v .

      - uses: actions/checkout@v3
        with:
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}

      - name: Download cub for CUDA 10.2
        run: |
          CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')

          # check if it is CUDA 10.2
          # download cub
          if [ "$CUDA_VERSION" = "10.2" ]; then
            wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
            unzip 1.8.0.zip
            cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
          fi

      - name: Install Colossal-AI
        run: |
          CUDA_EXT=1 pip install -v .
          pip install -r requirements/requirements-test.txt

      - name: Unit Testing
        run: |
          PYTHONPATH=$PWD pytest tests
        env:
          DATA: /data/scratch/cifar-10
          NCCL_SHM_DISABLE: 1
          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
          LLAMA_PATH: /data/scratch/llama-tiny

      # Runs only when an earlier step of this job has failed.
      - name: Notify Lark
        id: message-preparation
        if: ${{ failure() }}
        run: |
          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
          msg="Compatibility test failed with $container, please visit $url for details"
          # Quote the expansions so the message and the webhook URL are not
          # word-split or glob-expanded by the shell.
          echo "$msg"
          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u "$WEBHOOK_URL"
        env:
          SERVER_URL: ${{github.server_url }}
          REPO: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
          container: ${{ matrix.container }}