diff --git a/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml b/.github/workflows/auto_example_check.yml similarity index 62% rename from .github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml rename to .github/workflows/auto_example_check.yml index 2b7ec3125..7f1e357e3 100644 --- a/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml +++ b/.github/workflows/auto_example_check.yml @@ -1,7 +1,7 @@ name: Test Example on: pull_request: - # So only the changes in examples folder will trigger jobs below. + # any change in the examples folder will trigger check for the corresponding example. paths: - 'examples/**' # run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00 @@ -17,12 +17,14 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' runs-on: ubuntu-latest outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - name: Check out all files + matrix: ${{ steps.setup-matrix.outputs.matrix }} + anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }} + name: Detect changed example files steps: - uses: actions/checkout@v3 with: - fetch-depth: 2 + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha }} - name: Get all changed example files id: changed-files uses: tj-actions/changed-files@v35 @@ -30,46 +32,53 @@ jobs: with: since_last_remote_commit: true - name: setup matrix - id: set-matrix + id: setup-matrix run: | changedFileName="" for file in ${{ steps.changed-files.outputs.all_changed_files }}; do changedFileName="${file}:${changedFileName}" done echo "$changedFileName was changed" - res=`python .github/workflows/scripts/changed_example.py --fileNameList $changedFileName` - echo "All changed files are $res" - loc=$( IFS=',' ; echo "${res[*]}" ) - echo "$loc" - echo "::set-output name=matrix::{\"loc\":$(echo "$loc")}" + res=`python .github/workflows/scripts/example_checks/detect_changed_example.py 
--fileNameList $changedFileName` + echo "All changed examples are $res" + + if [ "$res" = "[]" ]; then + echo "anyChanged=false" >> $GITHUB_OUTPUT + echo "matrix=null" >> $GITHUB_OUTPUT + else + dirs=$( IFS=',' ; echo "${res[*]}" ) + echo "anyChanged=true" >> $GITHUB_OUTPUT + echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT + fi # If no file is changed, it will prompt an error and shows the matrix do not have value. - check-all-changed-files: + check-changed-example: # Add this condition to avoid executing this job if the trigger event is workflow_dispatch. if: | github.event.pull_request.draft == false && github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' && needs.detect-changed-example.outputs.anyChanged == 'true' - name: Test each changed example files + name: Test the changed example needs: detect-changed-example runs-on: [self-hosted, gpu] strategy: matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --gpus all --rm -v /data/scratch/examples-data:/data/ + timeout-minutes: 10 steps: - uses: actions/checkout@v3 - with: - fetch-depth: 2 - - name: Install dependancies + - name: Install Colossal-AI run: | - pip install -r ./requirements/requirements.txt - pip install colossalai - - name: List all changed example files + pip install -v . + - name: Test the example run: | - res=${{ matrix.loc }} - cd "${PWD}/examples/${res}" + example_dir=${{ matrix.directory }} + cd "${PWD}/examples/${example_dir}" bash test_ci.sh + env: + NCCL_SHM_DISABLE: 1 # This is for all files' weekly check. Specifically, this job is to find all the directories. 
matrix_preparation: @@ -77,20 +86,20 @@ jobs: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule' - name: Prepare Directory List for All files + name: Prepare matrix for weekly check runs-on: ubuntu-latest outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} + matrix: ${{ steps.setup-matrix.outputs.matrix }} steps: - name: 📚 Checkout uses: actions/checkout@v3 - name: setup matrix - id: set-matrix + id: setup-matrix run: | - res=`python .github/workflows/scripts/weekly_check_example.py` + res=`python .github/workflows/scripts/example_checks/check_example_weekly.py` all_loc=$( IFS=',' ; echo "${res[*]}" ) - echo "$all_loc" - echo "::set-output name=matrix::{\"all_loc\":$(echo "$all_loc")}" + echo "Found the examples: $all_loc" + echo "matrix={\"directory\":$(echo "$all_loc")}" >> $GITHUB_OUTPUT weekly_check: if: | @@ -104,16 +113,18 @@ jobs: matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + timeout-minutes: 10 steps: - name: 📚 Checkout uses: actions/checkout@v3 - - name: Install the requirements + - name: Install Colossal-AI run: | - pip install -r ./requirements/requirements.txt - pip install colossalai + pip install -v . 
- name: Traverse all files run: | - dir=${{ matrix.all_loc }} - echo "${dir} is current directory" - cd "${PWD}/examples/${dir}" + example_dir=${{ matrix.directory }} + echo "Testing ${example_dir} now" + cd "${PWD}/examples/${example_dir}" bash test_ci.sh + env: + NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/workflow_dispatch_example.yml b/.github/workflows/dispatch_example_check.yml similarity index 57% rename from .github/workflows/workflow_dispatch_example.yml rename to .github/workflows/dispatch_example_check.yml index d9d576910..e0333422f 100644 --- a/.github/workflows/workflow_dispatch_example.yml +++ b/.github/workflows/dispatch_example_check.yml @@ -16,31 +16,24 @@ jobs: name: Check the examples user want runs-on: ubuntu-latest outputs: - matrix: ${{ steps.set-matrix-1.outputs.matrix }} + matrix: ${{ steps.set-matrix.outputs.matrix }} steps: - name: 📚 Checkout uses: actions/checkout@v3 - - name: Get manual directories - id: set-matrix-1 + - name: Set up matrix + id: set-matrix env: check_dir: ${{ inputs.example_directory }} run: | - all_mannual_check_dir=() - for cdi in $check_dir - do - all_mannual_check_dir+=("\"${cdi}\"") - done - man_loc=$( IFS=',' ; echo "${all_mannual_check_dir[*]}" ) - res=`python .github/workflows/scripts/input_check_example.py --fileNameList $man_loc` - echo "${res} is file existance. 1 for all exist, -1 for at least one file not exist." 
- if [ res == -1 ];then - exit(1) + res=`python .github/workflows/scripts/example_checks/check_dispatch_inputs.py --fileNameList "$check_dir"` + if [ "$res" = "failure" ]; then + exit 1 fi - man_loc="[${man_loc}]" - echo "$man_loc" - echo "::set-output name=matrix::{\"man_loc\":$(echo "$man_loc")}" + dirs=$(python -c 'import json, sys; print(json.dumps(sys.argv[1].split(",")))' "$check_dir") + echo "Testing examples in $dirs" + echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT - manual_check: + test_example: if: | github.event.pull_request.draft == false && github.base_ref == 'main' && @@ -52,16 +45,19 @@ jobs: matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --gpus all --rm -v /data/scratch/examples-data:/data/ + timeout-minutes: 10 steps: - name: 📚 Checkout uses: actions/checkout@v3 - - name: Install the requirements + - name: Install Colossal-AI run: | - pip install -r ./requirements/requirements.txt - pip install colossalai - - name: Traverse all files + pip install -v . 
+ - name: Test the example run: | - dir=${{ matrix.man_loc }} - echo "${dir} is current directory" + dir=${{ matrix.directory }} + echo "Testing ${dir} now" cd "${PWD}/examples/${dir}" bash test_ci.sh + env: + NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/scripts/example_checks/check_dispatch_inputs.py b/.github/workflows/scripts/example_checks/check_dispatch_inputs.py new file mode 100644 index 000000000..04d2063ec --- /dev/null +++ b/.github/workflows/scripts/example_checks/check_dispatch_inputs.py @@ -0,0 +1,27 @@ +import argparse +import os + + +def check_inputs(input_list): + for path in input_list: + real_path = os.path.join('examples', path) + if not os.path.exists(real_path): + return False + return True + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fileNameList', type=str, help="List of file names") + args = parser.parse_args() + name_list = args.fileNameList.split(",") + is_correct = check_inputs(name_list) + + if is_correct: + print('success') + else: + print('failure') + + +if __name__ == '__main__': + main() diff --git a/.github/workflows/scripts/weekly_check_example.py b/.github/workflows/scripts/example_checks/check_example_weekly.py similarity index 76% rename from .github/workflows/scripts/weekly_check_example.py rename to .github/workflows/scripts/example_checks/check_example_weekly.py index dfedc4628..941e90901 100644 --- a/.github/workflows/scripts/weekly_check_example.py +++ b/.github/workflows/scripts/example_checks/check_example_weekly.py @@ -5,9 +5,9 @@ def show_files(path, all_files): # Traverse all the folder/file in current directory file_list = os.listdir(path) # Determine the element is folder or file. If file, pass it into list, if folder, recurse. - for file in file_list: + for file_name in file_list: # Get the abs directory using os.path.join() and store into cur_path. 
- cur_path = os.path.join(path, file) + cur_path = os.path.join(path, file_name) # Determine whether folder if os.path.isdir(cur_path): show_files(cur_path, all_files) @@ -26,9 +26,8 @@ def main(): for file_loc in contents: split_loc = file_loc.split('/') # must have two sub-folder levels after examples folder, such as examples/images/vit is acceptable, examples/images/README.md is not, examples/requirements.txt is not. - if len(split_loc) - split_loc.index('examples') >= 3: - tmp_loc = split_loc[(split_loc.index('examples') + 1):(split_loc.index('examples') + 3)] - re_loc = join(tmp_loc, '/') + if len(split_loc) >= 4: + re_loc = '/'.join(split_loc[1:3]) if re_loc not in all_loc: all_loc.append(re_loc) print(all_loc) diff --git a/.github/workflows/scripts/changed_example.py b/.github/workflows/scripts/example_checks/detect_changed_example.py similarity index 52% rename from .github/workflows/scripts/changed_example.py rename to .github/workflows/scripts/example_checks/detect_changed_example.py index ac2f0864e..df4fd6736 100644 --- a/.github/workflows/scripts/changed_example.py +++ b/.github/workflows/scripts/example_checks/detect_changed_example.py @@ -3,14 +3,19 @@ import argparse def main(): parser = argparse.ArgumentParser() - parser.add_argument('--fileNameList', type=str) + parser.add_argument('-f', '--fileNameList', type=str, help="The list of changed files") args = parser.parse_args() name_list = args.fileNameList.split(":") folder_need_check = set() for loc in name_list: - # Find only the sub-folder of 'example' folder + # Find only the sub-sub-folder of 'example' folder + # the examples folder structure is like + # - examples + # - area + # - application + # - file if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4: - folder_need_check.add(loc.split("/")[1] + "/" + loc.split("/")[2]) + folder_need_check.add('/'.join(loc.split("/")[1:3])) # Output the result using print. Then the shell can get the values. 
print(list(folder_need_check)) diff --git a/.github/workflows/scripts/input_check_example.py b/.github/workflows/scripts/input_check_example.py deleted file mode 100644 index 5602d8f09..000000000 --- a/.github/workflows/scripts/input_check_example.py +++ /dev/null @@ -1,23 +0,0 @@ -import argparse -import os - - -def detect_correct(loc_li): - for loc in loc_li: - real_loc = 'examples/' + eval(loc) - if not os.path.exists(real_loc): - return -1 - return 1 - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--fileNameList', type=str) - args = parser.parse_args() - name_list = args.fileNameList.split(",") - result = detect_correct(name_list) - print(result) - - -if __name__ == '__main__': - main() diff --git a/examples/tutorial/hybrid_parallel/config.py b/examples/tutorial/hybrid_parallel/config.py index 2450ab1c7..ac273c305 100644 --- a/examples/tutorial/hybrid_parallel/config.py +++ b/examples/tutorial/hybrid_parallel/config.py @@ -6,8 +6,8 @@ from colossalai.amp import AMP_TYPE BATCH_SIZE = 256 LEARNING_RATE = 3e-3 WEIGHT_DECAY = 0.3 -NUM_EPOCHS = 10 -WARMUP_EPOCHS = 3 +NUM_EPOCHS = 2 +WARMUP_EPOCHS = 1 # model config IMG_SIZE = 224 diff --git a/examples/tutorial/hybrid_parallel/requirements.txt b/examples/tutorial/hybrid_parallel/requirements.txt index 137a69e80..dbf6aaf3e 100644 --- a/examples/tutorial/hybrid_parallel/requirements.txt +++ b/examples/tutorial/hybrid_parallel/requirements.txt @@ -1,2 +1,3 @@ colossalai >= 0.1.12 torch >= 1.8.1 +titans \ No newline at end of file diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh new file mode 100644 index 000000000..8860b72a2 --- /dev/null +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -euxo pipefail + +pip install -r requirements.txt +torchrun --standalone --nproc_per_node 4 train.py --config config.py -s diff --git a/examples/tutorial/hybrid_parallel/train.py 
b/examples/tutorial/hybrid_parallel/train.py index 0f2a207cb..2a8576db7 100644 --- a/examples/tutorial/hybrid_parallel/train.py +++ b/examples/tutorial/hybrid_parallel/train.py @@ -98,9 +98,9 @@ def main(): root = os.environ.get('DATA', '../data') if args.synthetic: # if we use synthetic dataset - # we train for 30 steps and eval for 10 steps per epoch - train_dataloader = DummyDataloader(length=30, batch_size=gpc.config.BATCH_SIZE) - test_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE) + # we train for 10 steps and eval for 5 steps per epoch + train_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE) + test_dataloader = DummyDataloader(length=5, batch_size=gpc.config.BATCH_SIZE) else: train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True)