mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-01 17:17:05 +00:00
[devops] update torch version of CI (#3725)
* [test] fix flop tensor test
* [test] fix autochunk test
* [test] fix lazyinit test
* [devops] update torch version of CI
* [devops] enable testmon
* [devops] fix ci
* [devops] fix ci
* [test] fix checkpoint io test
* [test] fix cluster test
* [test] fix timm test
* [devops] fix ci
* [devops] fix ci
* [devops] fix ci
* [devops] fix ci
* [devops] force sync to test ci
* [test] skip fsdp test
This commit is contained in:
19
.github/workflows/build_on_pr.yml
vendored
19
.github/workflows/build_on_pr.yml
vendored
@@ -68,9 +68,9 @@ jobs:
|
||||
needs: detect
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.11.0-11.3.0
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
|
||||
timeout-minutes: 40
|
||||
timeout-minutes: 60
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -120,15 +120,26 @@ jobs:
|
||||
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
|
||||
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
|
||||
|
||||
- name: Restore Testmon Cache
|
||||
run: |
|
||||
if [ -d /github/home/testmon_cache ]; then
|
||||
[ ! -z "$(ls -A /github/home/testmon_cache)" ] && cp -p -r /github/home/testmon_cache/.testmondata /__w/ColossalAI/ColossalAI/
|
||||
fi
|
||||
|
||||
- name: Execute Unit Testing
|
||||
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
|
||||
run: |
|
||||
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --cov=. --cov-report xml tests/
|
||||
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. tests/
|
||||
env:
|
||||
DATA: /data/scratch/cifar-10
|
||||
NCCL_SHM_DISABLE: 1
|
||||
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
|
||||
|
||||
- name: Store Testmon Cache
|
||||
run: |
|
||||
[ -d /github/home/testmon_cache ] || mkdir /github/home/testmon_cache
|
||||
cp -p -r /__w/ColossalAI/ColossalAI/.testmondata /github/home/testmon_cache/
|
||||
|
||||
- name: Collate artifact
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.number }}
|
||||
@@ -140,7 +151,7 @@ jobs:
|
||||
echo $PR_NUMBER > ./report/pr_number
|
||||
|
||||
# generate coverage.xml if any
|
||||
if [ "$anyLibraryFileChanged" == "true" ]; then
|
||||
if [ "$anyLibraryFileChanged" == "true" ] && [ -e .coverage ]; then
|
||||
allFiles=""
|
||||
for file in $changedLibraryFiles; do
|
||||
if [ "$allFiles" == "" ]; then
|
||||
|
2
.github/workflows/build_on_schedule.yml
vendored
2
.github/workflows/build_on_schedule.yml
vendored
@@ -12,7 +12,7 @@ jobs:
|
||||
if: github.repository == 'hpcaitech/ColossalAI'
|
||||
runs-on: [self-hosted, 8-gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.11.0-11.3.0
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
|
||||
timeout-minutes: 40
|
||||
steps:
|
||||
|
Reference in New Issue
Block a user