diff --git a/.cuda_ext.json b/.cuda_ext.json index 01a30a9c1..c83f633f7 100644 --- a/.cuda_ext.json +++ b/.cuda_ext.json @@ -2,11 +2,11 @@ "build": [ { "torch_command": "pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121", - "cuda_image": "hpcaitech/cuda-conda:12.1" + "cuda_image": "image-cloud.luchentech.com/hpcaitech/cuda-conda:12.1" }, { "torch_command": "pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124", - "cuda_image": "hpcaitech/cuda-conda:12.4" + "cuda_image": "image-cloud.luchentech.com/hpcaitech/cuda-conda:12.4" } ] } diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index fd7dc42e5..607013851 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -12,7 +12,7 @@ jobs: if: github.repository == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/ timeout-minutes: 90 steps: diff --git a/.github/workflows/close_inactive.yml b/.github/workflows/close_inactive.yml index e7dec4430..a175661b4 100644 --- a/.github/workflows/close_inactive.yml +++ b/.github/workflows/close_inactive.yml @@ -7,7 +7,7 @@ on: jobs: close-issues: if: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] permissions: issues: write pull-requests: write diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index 1534fa7f6..7f74f83c6 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ 
-15,7 +15,7 @@ on: jobs: matrix_preparation: name: Prepare Container List - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -31,7 +31,7 @@ jobs: do for cv in $CUDA_VERSIONS do - DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"") + DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tv}-${cv}\"") done done @@ -44,7 +44,7 @@ jobs: name: Test for PyTorch Compatibility needs: matrix_preparation if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, 8-gpu] + runs-on: [self-hosted, ubuntu-latest] strategy: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index c2cc85b3f..bf9fb6ecb 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -9,7 +9,7 @@ on: jobs: matrix_preparation: name: Prepare Container List - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} concurrency: @@ -23,7 +23,7 @@ jobs: DOCKER_IMAGE=() while read tag; do - DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"") + DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tag}\"") done <.compatibility container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" ) @@ -35,7 +35,7 @@ jobs: name: Test for PyTorch Compatibility needs: matrix_preparation if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, 8-gpu] + runs-on: [self-hosted, ubuntu-latest] strategy: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index 1bd24b0a2..04928d7ac 100644 --- a/.github/workflows/compatiblity_test_on_schedule.yml +++ 
b/.github/workflows/compatiblity_test_on_schedule.yml @@ -9,7 +9,7 @@ on: jobs: matrix_preparation: name: Prepare Container List - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -20,7 +20,7 @@ jobs: DOCKER_IMAGE=() while read tag; do - DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"") + DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tag}\"") done <.compatibility container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" ) @@ -32,7 +32,7 @@ jobs: name: Test for PyTorch Compatibility needs: matrix_preparation if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, 8-gpu] + runs-on: [self-hosted, ubuntu-latest] strategy: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} diff --git a/.github/workflows/doc_build_on_schedule_after_release.yml b/.github/workflows/doc_build_on_schedule_after_release.yml index 62dfdc672..863c216e7 100644 --- a/.github/workflows/doc_build_on_schedule_after_release.yml +++ b/.github/workflows/doc_build_on_schedule_after_release.yml @@ -11,7 +11,7 @@ jobs: build-doc: name: Trigger Documentation Build Workflow if: github.repository == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] steps: - name: trigger workflow in ColossalAI-Documentation run: | diff --git a/.github/workflows/doc_check_on_pr.yml b/.github/workflows/doc_check_on_pr.yml index 68e13a971..91fc16148 100644 --- a/.github/workflows/doc_check_on_pr.yml +++ b/.github/workflows/doc_check_on_pr.yml @@ -15,7 +15,7 @@ jobs: if: | github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-i18n cancel-in-progress: true @@ -33,7 +33,7 @@ jobs: if: | 
github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-doc cancel-in-progress: true diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index 99a3f18a0..04a4c044f 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -15,7 +15,7 @@ jobs: if: | github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: any_changed: ${{ steps.changed-files.outputs.any_changed }} changed_files: ${{ steps.changed-files.outputs.all_changed_files }} @@ -56,7 +56,7 @@ jobs: needs: detect-changed-doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm timeout-minutes: 30 defaults: diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index 902aba774..42ec8a9de 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -12,7 +12,7 @@ jobs: name: Test the changed Doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm timeout-minutes: 60 steps: diff --git a/.github/workflows/draft_github_release_post_after_merge.yml b/.github/workflows/draft_github_release_post_after_merge.yml index 53bfa9e8d..fbd7f735e 100644 --- a/.github/workflows/draft_github_release_post_after_merge.yml +++ b/.github/workflows/draft_github_release_post_after_merge.yml @@ -12,7 +12,7 @@ 
jobs: release: name: Draft Release Post if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] steps: - uses: actions/checkout@v2 with: diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml index 7039ed9c2..e5b0ec0ec 100644 --- a/.github/workflows/example_check_on_dispatch.yml +++ b/.github/workflows/example_check_on_dispatch.yml @@ -14,7 +14,7 @@ jobs: github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' name: Check the examples user want - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -45,7 +45,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 15 steps: diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index af8da0383..ff6e62b72 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -17,7 +17,7 @@ jobs: if: | github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.setup-matrix.outputs.matrix }} anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }} @@ -90,7 +90,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: 
image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 30 concurrency: diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml index db55c305b..cc17e9a30 100644 --- a/.github/workflows/example_check_on_schedule.yml +++ b/.github/workflows/example_check_on_schedule.yml @@ -10,7 +10,7 @@ jobs: matrix_preparation: if: github.repository == 'hpcaitech/ColossalAI' name: Prepare matrix for weekly check - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.setup-matrix.outputs.matrix }} steps: @@ -34,7 +34,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 30 steps: diff --git a/.github/workflows/release_docker_after_publish.yml b/.github/workflows/release_docker_after_publish.yml index 23aac9b54..fe37dfcbf 100644 --- a/.github/workflows/release_docker_after_publish.yml +++ b/.github/workflows/release_docker_after_publish.yml @@ -46,7 +46,7 @@ jobs: notify: name: Notify Lark via webhook needs: release - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] if: ${{ always() }} steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/release_nightly_on_schedule.yml b/.github/workflows/release_nightly_on_schedule.yml index 072a943ae..b54a3859d 100644 --- a/.github/workflows/release_nightly_on_schedule.yml +++ b/.github/workflows/release_nightly_on_schedule.yml @@ -9,7 +9,7 @@ jobs: publish: if: github.repository == 'hpcaitech/ColossalAI' name: Build and publish Python 🐍 distributions 📦 to PyPI - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] timeout-minutes: 20 outputs: 
status: ${{ steps.publish.outcome }} @@ -36,7 +36,7 @@ jobs: notify: name: Notify Lark via webhook needs: publish - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] if: ${{ always() }} && github.repository == 'hpcaitech/ColossalAI' steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/release_pypi_after_merge.yml b/.github/workflows/release_pypi_after_merge.yml index b987b4397..e60c3ce6f 100644 --- a/.github/workflows/release_pypi_after_merge.yml +++ b/.github/workflows/release_pypi_after_merge.yml @@ -12,7 +12,7 @@ jobs: build-n-publish: if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI' && github.event.pull_request.merged == true && github.base_ref == 'main' name: Build and publish Python 🐍 distributions 📦 to PyPI - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] timeout-minutes: 20 steps: - uses: actions/checkout@v2 @@ -35,7 +35,7 @@ jobs: notify: name: Notify Lark via webhook needs: build-n-publish - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] if: ${{ always() }} steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/release_test_pypi_before_merge.yml b/.github/workflows/release_test_pypi_before_merge.yml index 3d3cfc696..a7f53c649 100644 --- a/.github/workflows/release_test_pypi_before_merge.yml +++ b/.github/workflows/release_test_pypi_before_merge.yml @@ -9,7 +9,7 @@ jobs: build-n-publish: if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI' name: Build and publish Python 🐍 distributions 📦 to Test PyPI - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] timeout-minutes: 20 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/report_leaderboard_to_lark.yml b/.github/workflows/report_leaderboard_to_lark.yml index 00d8e9e1f..70a13270d 100644 --- a/.github/workflows/report_leaderboard_to_lark.yml +++ b/.github/workflows/report_leaderboard_to_lark.yml @@ -10,7 
+10,7 @@ jobs: generate-and-publish: if: github.repository == 'hpcaitech/ColossalAI' name: Generate leaderboard report and publish to Lark - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] timeout-minutes: 20 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml index c9dc541b8..1c17b63a9 100644 --- a/.github/workflows/report_test_coverage.yml +++ b/.github/workflows/report_test_coverage.yml @@ -8,7 +8,7 @@ on: jobs: report-test-coverage: - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] if: ${{ github.event.workflow_run.conclusion == 'success' }} steps: - name: "Download artifact" diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml index c6ac2b7bd..5f580e4c1 100644 --- a/.github/workflows/run_chatgpt_examples.yml +++ b/.github/workflows/run_chatgpt_examples.yml @@ -19,7 +19,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data --shm-size=10.24gb timeout-minutes: 60 defaults: diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml index 21545098a..a67335690 100644 --- a/.github/workflows/run_chatgpt_unit_tests.yml +++ b/.github/workflows/run_chatgpt_unit_tests.yml @@ -19,7 +19,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data timeout-minutes: 30 defaults: diff --git 
a/.github/workflows/run_colossalqa_unit_tests.yml b/.github/workflows/run_colossalqa_unit_tests.yml index 326ef4526..f08831e5f 100644 --- a/.github/workflows/run_colossalqa_unit_tests.yml +++ b/.github/workflows/run_colossalqa_unit_tests.yml @@ -19,7 +19,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 volumes: - /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa - /data/scratch/llama-tiny:/data/scratch/llama-tiny diff --git a/.github/workflows/submodule.yml b/.github/workflows/submodule.yml index 4ffb26118..14d85d1d9 100644 --- a/.github/workflows/submodule.yml +++ b/.github/workflows/submodule.yml @@ -7,7 +7,7 @@ on: jobs: sync-submodule: - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] if: github.repository == 'hpcaitech/ColossalAI' steps: - name: Checkout diff --git a/.github/workflows/translate_comment.yml b/.github/workflows/translate_comment.yml index 83c127b3c..36113aaad 100644 --- a/.github/workflows/translate_comment.yml +++ b/.github/workflows/translate_comment.yml @@ -7,7 +7,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] steps: - uses: usthe/issues-translate-action@v2.7 with: