ColossalAI/.github/workflows/build_gpu_8.yml
ver217 c415240db6 [nvme] CPUAdam and HybridAdam support NVMe offload (#1360) (2022-07-26 17:25:24 +08:00)

* impl nvme optimizer
* update cpu adam
* add unit test
* update hybrid adam
* update docstr
* add TODOs
* update CI
* fix CI
* fix CI
* fix CI path
* fix CI path
* fix CI path
* fix install tensornvme
* fix CI
* fix CI path
* fix CI env variables
* test CI
* test CI
* fix CI
* fix nvme optim __del__
* fix adam __del__
* fix nvme optim
* fix CI env variables
* fix nvme optim import
* test CI
* test CI
* fix CI
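For context: the commit above adds NVMe offload of optimizer states to CPUAdam and HybridAdam, which is the feature this workflow exercises on real hardware. A minimal sketch of how the feature might be used (the nvme_offload_fraction and nvme_offload_dir keyword names are assumptions based on the commit description, not something this file confirms):

# Sketch only: exercising the NVMe offload path this CI job tests.
# The two nvme_offload_* keyword names below are assumptions.
import torch
from colossalai.nn.optimizer import HybridAdam

model = torch.nn.Linear(1024, 1024)  # CPU parameters; offload targets CPU optimizer states
optimizer = HybridAdam(
    model.parameters(),
    lr=1e-3,
    nvme_offload_fraction=1.0,  # assumed: fraction of optimizer states kept on NVMe
    nvme_offload_dir='/tmp',    # assumed: directory on an NVMe-backed filesystem
)

loss = model(torch.randn(4, 1024)).sum()
loss.backward()
optimizer.step()  # optimizer states are paged in from, and back out to, NVMe here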


name: Build on 8 GPUs

on:
  schedule:
    # run at 00:00 (UTC) every day
    - cron: '0 0 * * *'
  workflow_dispatch:

jobs:
  build:
    name: Build and Test Colossal-AI
    if: github.repository == 'hpcaitech/ColossalAI'
    runs-on: [self-hosted, 8-gpu]
    container:
      image: hpcaitech/pytorch-cuda:1.11.0-11.3.0
      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
    timeout-minutes: 40
    steps:
      - uses: actions/checkout@v2
        with:
          repository: hpcaitech/TensorNVMe
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
          path: TensorNVMe
      - name: Install tensornvme
        run: |
          cd TensorNVMe
          conda install cmake
          pip install -r requirements.txt
          pip install -v .
      - uses: actions/checkout@v2
        with:
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
      - name: Install Colossal-AI
        run: |
          [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
          pip install -r requirements/requirements.txt
          pip install -v -e .
          cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
          cp /__w/ColossalAI/ColossalAI/*.so /github/home/cuda_ext_cache/
          pip install -r requirements/requirements-test.txt
      - name: Unit Testing
        run: |
          gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
          [ "$gpu_used" -le "100" ] && PYTHONPATH=$PWD pytest tests
        env:
          DATA: /data/scratch/cifar-10
          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
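The Install tensornvme step builds TensorNVMe from source, and its native runtime library lands under /github/home/.tensornvme/lib, which is why the Unit Testing step extends LD_LIBRARY_PATH with that directory. A minimal smoke test for the install (a sketch only: DiskOffloader and its sync_write/sync_read methods are assumed from the TensorNVMe project, not shown in this file):

# Sketch: verify the tensornvme install can round-trip a tensor through disk.
# DiskOffloader and its method names are assumptions.
import torch
from tensornvme import DiskOffloader

offloader = DiskOffloader('./offload')  # assumed: scratch directory as backing store
x = torch.rand(4, 4)
offloader.sync_write(x)  # write the tensor's contents out to disk
offloader.sync_read(x)   # page the contents back into the tensor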
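The Unit Testing step only launches pytest when GPU 0 reports at most 100 MiB of memory in use, so a job landing on a busy runner skips the suite rather than producing flaky failures. A rough Python equivalent of that gate (a sketch using the third-party pynvml package, which this workflow does not install):

# Sketch: reproduce the nvidia-smi idle check in Python via NVML.
import subprocess
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
used_mib = pynvml.nvmlDeviceGetMemoryInfo(handle).used // (1024 * 1024)
pynvml.nvmlShutdown()

if used_mib <= 100:  # same threshold the workflow applies via nvidia-smi
    subprocess.run(['pytest', 'tests'], check=True)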