mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-02 09:38:05 +00:00
Merge branch 'main' into sync/npu
This commit is contained in:
138
.github/workflows/build_on_pr.yml
vendored
138
.github/workflows/build_on_pr.yml
vendored
@@ -22,57 +22,6 @@ on:
|
||||
delete:
|
||||
|
||||
jobs:
|
||||
prepare_cache:
|
||||
name: Prepare testmon cache
|
||||
if: |
|
||||
github.event_name == 'create' &&
|
||||
github.event.ref_type == 'branch' &&
|
||||
github.event.repository.full_name == 'hpcaitech/ColossalAI'
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --rm
|
||||
timeout-minutes: 5
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Copy testmon cache
|
||||
run: | # branch name may contain slash, we need to replace it with space
|
||||
export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
|
||||
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
|
||||
cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
|
||||
fi
|
||||
env:
|
||||
MAIN_BRANCH: ${{ github.event.master_branch }}
|
||||
|
||||
prepare_cache_for_pr:
|
||||
name: Prepare testmon cache for PR
|
||||
if: |
|
||||
github.event_name == 'pull_request' &&
|
||||
(github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
|
||||
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --rm
|
||||
timeout-minutes: 5
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache
|
||||
cancel-in-progress: true
|
||||
steps:
|
||||
- name: Copy testmon cache
|
||||
run: | # branch name may contain slash, we need to replace it with space
|
||||
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
|
||||
if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
|
||||
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
|
||||
fi
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.number }}
|
||||
|
||||
detect:
|
||||
name: Detect file change
|
||||
if: |
|
||||
@@ -140,8 +89,8 @@ jobs:
|
||||
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
|
||||
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
|
||||
options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
|
||||
timeout-minutes: 60
|
||||
defaults:
|
||||
run:
|
||||
@@ -174,6 +123,7 @@ jobs:
|
||||
run: |
|
||||
cd TensorNVMe
|
||||
cp -p -r ./build /github/home/tensornvme_cache/
|
||||
cp -p -r ./cmake-build /github/home/tensornvme_cache/
|
||||
|
||||
- name: Checkout Colossal-AI
|
||||
uses: actions/checkout@v2
|
||||
@@ -198,31 +148,24 @@ jobs:
|
||||
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
|
||||
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
|
||||
|
||||
- name: Restore Testmon Cache
|
||||
run: |
|
||||
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
|
||||
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
|
||||
fi
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.number }}
|
||||
|
||||
- name: Execute Unit Testing
|
||||
run: |
|
||||
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
|
||||
CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
|
||||
-m "not largedist" \
|
||||
--durations=0 \
|
||||
--ignore tests/test_analyzer \
|
||||
--ignore tests/test_auto_parallel \
|
||||
--ignore tests/test_fx \
|
||||
--ignore tests/test_autochunk \
|
||||
--ignore tests/test_gptq \
|
||||
--ignore tests/test_infer_ops \
|
||||
--ignore tests/test_legacy \
|
||||
--ignore tests/test_smoothquant \
|
||||
tests/
|
||||
env:
|
||||
DATA: /data/scratch/cifar-10
|
||||
NCCL_SHM_DISABLE: 1
|
||||
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
|
||||
TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
|
||||
LLAMA_PATH: /data/scratch/llama-tiny
|
||||
|
||||
- name: Store Testmon Cache
|
||||
run: |
|
||||
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
|
||||
cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.number }}
|
||||
|
||||
- name: Collate artifact
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.number }}
|
||||
@@ -259,54 +202,3 @@ jobs:
|
||||
with:
|
||||
name: report
|
||||
path: report/
|
||||
|
||||
store_cache:
|
||||
name: Store testmon cache for PR
|
||||
if: |
|
||||
github.event_name == 'pull_request' &&
|
||||
github.event.action == 'closed' &&
|
||||
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --rm
|
||||
timeout-minutes: 5
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Store testmon cache if possible
|
||||
if: github.event.pull_request.merged == true
|
||||
run: | # branch name may contain slash, we need to replace it with space
|
||||
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
|
||||
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
|
||||
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
|
||||
fi
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Remove testmon cache
|
||||
run: |
|
||||
rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
remove_cache:
|
||||
name: Remove testmon cache
|
||||
if: |
|
||||
github.event_name == 'delete' &&
|
||||
github.event.ref_type == 'branch' &&
|
||||
github.event.repository.full_name == 'hpcaitech/ColossalAI'
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --rm
|
||||
timeout-minutes: 5
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Remove testmon cache
|
||||
run: | # branch name may contain slash, we need to replace it with space
|
||||
export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
|
||||
rm -rf "/github/home/testmon_cache/${BASE}"
|
||||
|
23
.github/workflows/build_on_schedule.yml
vendored
23
.github/workflows/build_on_schedule.yml
vendored
@@ -10,20 +10,22 @@ jobs:
|
||||
build:
|
||||
name: Build and Test Colossal-AI
|
||||
if: github.repository == 'hpcaitech/ColossalAI'
|
||||
runs-on: [self-hosted, 8-gpu]
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
|
||||
timeout-minutes: 40
|
||||
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
|
||||
options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Check GPU Availability # ensure all GPUs have enough memory
|
||||
id: check-avai
|
||||
run: |
|
||||
avai=true
|
||||
for i in $(seq 0 7);
|
||||
ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
|
||||
endIndex=$(($ngpu-1))
|
||||
for i in $(seq 0 $endIndex);
|
||||
do
|
||||
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
|
||||
[ "$gpu_used" -gt "10000" ] && avai=false
|
||||
[ "$gpu_used" -gt "2000" ] && avai=false
|
||||
done
|
||||
|
||||
echo "GPU is available: $avai"
|
||||
@@ -60,9 +62,12 @@ jobs:
|
||||
- name: Unit Testing
|
||||
if: steps.check-avai.outputs.avai == 'true'
|
||||
run: |
|
||||
PYTHONPATH=$PWD pytest --durations=0 tests
|
||||
PYTHONPATH=$PWD pytest \
|
||||
-m "not largedist" \
|
||||
--durations=0 \
|
||||
tests/
|
||||
env:
|
||||
DATA: /data/scratch/cifar-10
|
||||
NCCL_SHM_DISABLE: 1
|
||||
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
|
||||
LLAMA_PATH: /data/scratch/llama-tiny
|
||||
|
||||
@@ -71,7 +76,7 @@ jobs:
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
|
||||
msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
|
||||
msg="Scheduled Build and Test failed, please visit $url for details"
|
||||
echo $msg
|
||||
python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
|
||||
env:
|
||||
|
2
.github/workflows/doc_test_on_pr.yml
vendored
2
.github/workflows/doc_test_on_pr.yml
vendored
@@ -56,7 +56,7 @@ jobs:
|
||||
needs: detect-changed-doc
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
|
||||
options: --gpus all --rm
|
||||
timeout-minutes: 20
|
||||
defaults:
|
||||
|
2
.github/workflows/doc_test_on_schedule.yml
vendored
2
.github/workflows/doc_test_on_schedule.yml
vendored
@@ -12,7 +12,7 @@ jobs:
|
||||
name: Test the changed Doc
|
||||
runs-on: [self-hosted, gpu]
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
|
||||
options: --gpus all --rm
|
||||
timeout-minutes: 60
|
||||
steps:
|
||||
|
@@ -45,7 +45,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
|
||||
options: --gpus all --rm -v /data/scratch/examples-data:/data/
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
|
4
.github/workflows/example_check_on_pr.yml
vendored
4
.github/workflows/example_check_on_pr.yml
vendored
@@ -77,9 +77,9 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
|
||||
options: --gpus all --rm -v /data/scratch/examples-data:/data/
|
||||
timeout-minutes: 15
|
||||
timeout-minutes: 20
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example-${{ matrix.directory }}
|
||||
cancel-in-progress: true
|
||||
|
@@ -34,8 +34,8 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
timeout-minutes: 15
|
||||
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: 📚 Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
Reference in New Issue
Block a user