[devops] update torch version of CI (#3725)

* [test] fix flop tensor test

* [test] fix autochunk test

* [test] fix lazyinit test

* [devops] update torch version of CI

* [devops] enable testmon

* [devops] fix ci

* [devops] fix ci

* [test] fix checkpoint io test

* [test] fix cluster test

* [test] fix timm test

* [devops] fix ci

* [devops] fix ci

* [devops] fix ci

* [devops] fix ci

* [devops] force sync to test ci

* [test] skip fsdp test
This commit is contained in:
Hongxin Liu
2023-05-15 17:20:56 +08:00
committed by GitHub
parent b37797ed3d
commit afb239bbf8
17 changed files with 74 additions and 46 deletions

View File

@@ -68,9 +68,9 @@ jobs:
needs: detect
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.11.0-11.3.0
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 40
timeout-minutes: 60
defaults:
run:
shell: bash
@@ -120,15 +120,26 @@ jobs:
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
- name: Restore Testmon Cache
run: |
if [ -d /github/home/testmon_cache ]; then
[ ! -z "$(ls -A /github/home/testmon_cache)" ] && cp -p -r /github/home/testmon_cache/.testmondata /__w/ColossalAI/ColossalAI/
fi
- name: Execute Unit Testing
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
run: |
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --cov=. --cov-report xml tests/
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. tests/
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
- name: Store Testmon Cache
run: |
[ -d /github/home/testmon_cache ] || mkdir /github/home/testmon_cache
cp -p -r /__w/ColossalAI/ColossalAI/.testmondata /github/home/testmon_cache/
- name: Collate artifact
env:
PR_NUMBER: ${{ github.event.number }}
@@ -140,7 +151,7 @@ jobs:
echo $PR_NUMBER > ./report/pr_number
# generate coverage.xml if any
if [ "$anyLibraryFileChanged" == "true" ]; then
if [ "$anyLibraryFileChanged" == "true" ] && [ -e .coverage ]; then
allFiles=""
for file in $changedLibraryFiles; do
if [ "$allFiles" == "" ]; then

View File

@@ -12,7 +12,7 @@ jobs:
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, 8-gpu]
container:
image: hpcaitech/pytorch-cuda:1.11.0-11.3.0
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 40
steps: