diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 8eb358c4f..50417ac8a 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -22,57 +22,6 @@ on:
   delete:
 
 jobs:
-  prepare_cache:
-    name: Prepare testmon cache
-    if: |
-      github.event_name == 'create' &&
-      github.event.ref_type == 'branch' &&
-      github.event.repository.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Copy testmon cache
-        run: | # branch name may contain slash, we need to replace it with space
-          export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
-          if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
-            cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
-          fi
-        env:
-          MAIN_BRANCH: ${{ github.event.master_branch }}
-
-  prepare_cache_for_pr:
-    name: Prepare testmon cache for PR
-    if: |
-      github.event_name == 'pull_request' &&
-      (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
-      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache
-      cancel-in-progress: true
-    steps:
-      - name: Copy testmon cache
-        run: | # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
-          if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
-            mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
   detect:
     name: Detect file change
     if: |
@@ -140,7 +89,7 @@ jobs:
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 60
     defaults:
@@ -174,6 +123,7 @@ jobs:
         run: |
           cd TensorNVMe
           cp -p -r ./build /github/home/tensornvme_cache/
+          cp -p -r ./cmake-build /github/home/tensornvme_cache/
 
       - name: Checkout Colossal-AI
         uses: actions/checkout@v2
@@ -198,31 +148,27 @@ jobs:
           # -p flag is required to preserve the file timestamp to avoid ninja rebuild
           cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
 
-      - name: Restore Testmon Cache
-        run: |
-          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
-            cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
       - name: Execute Unit Testing
         run: |
-          CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
+          CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
+            -m "not largedist" \
+            --durations=0 \
+            --ignore tests/test_analyzer \
+            --ignore tests/test_auto_parallel \
+            --ignore tests/test_fx \
+            --ignore tests/test_autochunk \
+            --ignore tests/test_gptq \
+            --ignore tests/test_infer_ops \
+            --ignore tests/test_legacy \
+            --ignore tests/test_moe \
+            --ignore tests/test_smoothquant \
+            --ignore tests/test_checkpoint_io \
+            tests/
         env:
-          DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-          TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
           LLAMA_PATH: /data/scratch/llama-tiny
 
-      - name: Store Testmon Cache
-        run: |
-          mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
-          cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
       - name: Collate artifact
         env:
           PR_NUMBER: ${{ github.event.number }}
@@ -260,53 +206,3 @@ jobs:
           name: report
           path: report/
-
-  store_cache:
-    name: Store testmon cache for PR
-    if: |
-      github.event_name == 'pull_request' &&
-      github.event.action == 'closed' &&
-      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Store testmon cache if possible
-        if: github.event.pull_request.merged == true
-        run: | # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
-          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
-            cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-      - name: Remove testmon cache
-        run: |
-          rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-  remove_cache:
-    name: Remove testmon cache
-    if: |
-      github.event_name == 'delete' &&
-      github.event.ref_type == 'branch' &&
-      github.event.repository.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Remove testmon cache
-        run: | # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
-          rm -rf "/github/home/testmon_cache/${BASE}"
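Note: the new invocation leans on two selection mechanisms. FAST_TEST=1 is read by tests/kit/model_zoo/__init__.py (changed later in this diff) to shrink the model zoo, and -m "not largedist" uses pytest marker selection to drop the multi-GPU tier. For reference, a test opts into that tier as the checkpoint-io tests below do (the test name here is illustrative only):

    import pytest

    @pytest.mark.largedist  # deselected by -m "not largedist" on PR builds
    def test_needs_eight_gpus():
        ...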
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index e5afe9622..3bee3b4f9 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -10,20 +10,20 @@ jobs:
   build:
     name: Build and Test Colossal-AI
    if: github.repository == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, 8-gpu]
+    runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
-    timeout-minutes: 40
+    timeout-minutes: 90
     steps:
       - name: Check GPU Availability # ensure all GPUs have enough memory
         id: check-avai
         run: |
           avai=true
-          for i in $(seq 0 7);
+          for i in $(seq 0 3);
           do
             gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
-            [ "$gpu_used" -gt "10000" ] && avai=false
+            [ "$gpu_used" -gt "2000" ] && avai=false
           done
 
           echo "GPU is available: $avai"
@@ -60,9 +60,12 @@ jobs:
       - name: Unit Testing
         if: steps.check-avai.outputs.avai == 'true'
         run: |
-          PYTHONPATH=$PWD pytest --durations=0 tests
+          PYTHONPATH=$PWD pytest \
+            -m "not largedist" \
+            --durations=0 \
+            tests/
         env:
-          DATA: /data/scratch/cifar-10
+          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
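The availability gate above now probes GPUs 0-3 and treats more than 2000 MiB of used memory on any of them as busy. A rough Python equivalent of the bash loop, for illustration only (the workflow itself shells out to nvidia-smi; the function and parameter names are ours):

    import subprocess

    def gpus_available(num_gpus: int = 4, max_used_mib: int = 2000) -> bool:
        # Mirror the workflow's check: every probed GPU must be nearly idle.
        for i in range(num_gpus):
            used = subprocess.check_output(
                ["nvidia-smi", "-i", str(i),
                 "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
                text=True,
            )
            if int(used.strip()) > max_used_mib:
                return False
        return True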
-z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then - cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/" - fi - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - - - name: Remove testmon cache - run: | - rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER} - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - - remove_cache: - name: Remove testmon cache - if: | - github.event_name == 'delete' && - github.event.ref_type == 'branch' && - github.event.repository.full_name == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] - container: - image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 - options: --rm - timeout-minutes: 5 - defaults: - run: - shell: bash - steps: - - name: Remove testmon cache - run: | # branch name may contain slash, we need to replace it with space - export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /") - rm -rf "/github/home/testmon_cache/${BASE}" diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index e5afe9622..3bee3b4f9 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -10,20 +10,20 @@ jobs: build: name: Build and Test Colossal-AI if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, 8-gpu] + runs-on: [self-hosted, gpu] container: image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny - timeout-minutes: 40 + timeout-minutes: 90 steps: - name: Check GPU Availability # ensure all GPUs have enough memory id: check-avai run: | avai=true - for i in $(seq 0 7); + for i in $(seq 0 3); do gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits) - [ "$gpu_used" -gt "10000" ] && avai=false + [ "$gpu_used" -gt "2000" ] && avai=false done echo "GPU is available: $avai" @@ -60,9 +60,12 @@ jobs: - name: Unit Testing if: steps.check-avai.outputs.avai == 'true' run: | - PYTHONPATH=$PWD pytest --durations=0 tests + PYTHONPATH=$PWD pytest \ + -m "not largedist" \ + --durations=0 \ + tests/ env: - DATA: /data/scratch/cifar-10 + NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index b4c776747..b3536184d 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -12,7 +12,7 @@ jobs: name: Test the changed Doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 + image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 options: --gpus all --rm timeout-minutes: 60 steps: diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py index 62b9123b5..5f6789ff3 100644 --- a/tests/kit/model_zoo/__init__.py +++ b/tests/kit/model_zoo/__init__.py @@ -1,5 +1,33 @@ -from . import custom, diffusers, timm, torchaudio, torchrec, torchvision, transformers +import os +from . 
diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py
index bb522778b..44a0adc6a 100644
--- a/tests/kit/model_zoo/registry.py
+++ b/tests/kit/model_zoo/registry.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
 from dataclasses import dataclass
-from typing import Callable
+from typing import Callable, List, Union
 
 __all__ = ["ModelZooRegistry", "ModelAttribute", "model_zoo"]
@@ -61,7 +61,7 @@ class ModelZooRegistry(dict):
         """
         self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute)
 
-    def get_sub_registry(self, keyword: str):
+    def get_sub_registry(self, keyword: Union[str, List[str]]):
         """
         Get a sub registry with models that contain the keyword.
 
@@ -70,12 +70,15 @@ class ModelZooRegistry(dict):
         """
         new_dict = dict()
 
+        if isinstance(keyword, str):
+            keyword_list = [keyword]
+        else:
+            keyword_list = keyword
+        assert isinstance(keyword_list, (list, tuple))
+
         for k, v in self.items():
-            if keyword == "transformers_gpt":
-                if keyword in k and not "gptj" in k: # ensure GPT2 does not retrieve GPTJ models
-                    new_dict[k] = v
-            else:
-                if keyword in k:
+            for kw in keyword_list:
+                if kw in k:
                     new_dict[k] = v
 
         assert len(new_dict) > 0, f"No model found with keyword {keyword}"
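With the widened signature, get_sub_registry accepts either a single keyword or a list of keywords and returns the union of substring matches. Note that dropping the hard-coded special case means "transformers_gpt" now also retrieves GPTJ models. Usage, with model names taken from COMMON_MODELS above (the variable names are illustrative):

    from tests.kit.model_zoo import model_zoo

    # Single keyword, as before: every model whose name contains the substring.
    bert_models = model_zoo.get_sub_registry("transformers_bert")

    # New: a list of keywords selects the union of all matches. An empty
    # result still trips the trailing assertion.
    fast_subset = model_zoo.get_sub_registry(["timm_resnet", "torchaudio_conformer"])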
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index d4205e1f9..3462d5dde 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -13,7 +13,7 @@ from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
 
 
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
@@ -66,7 +66,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t
 
 
 # @parameterize('init_method', ['lazy', 'none', 'colo'])
-@parameterize("subset", ["torchvision", "transformers", "diffusers"])
+@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"])
 @parameterize("init_method", ["none"])
 @parameterize("zero_size", [2])
 @parameterize("tp_size", [2])
diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
index 3eaaf882c..bcdcc1470 100644
--- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
+++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
@@ -11,7 +11,7 @@ from colossalai.booster.plugin import LowLevelZeroPlugin
 
 # from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 # These models are not compatible with AMP
 _AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"]
@@ -62,7 +62,12 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
     ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS
     skipped_models = []
 
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         # FIXME(ver217): fix these models
         if name in ignore_models:
             skipped_models.append(name)
diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
index 1a7ca6f2a..fa32feb2f 100644
--- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
@@ -11,7 +11,7 @@ from colossalai.booster import Booster
 from colossalai.booster.plugin import TorchDDPPlugin
 from colossalai.interface import OptimizerWrapper
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 
 def run_fn(model_fn, data_gen_fn, output_transform_fn):
@@ -40,7 +40,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
 
 
 def check_torch_ddp_plugin():
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         if name == "dlrm_interactionarch":
             continue
         run_fn(model_fn, data_gen_fn, output_transform_fn)
diff --git a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
index 8bcbffdd0..8a14d7cf8 100644
--- a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
@@ -12,7 +12,7 @@ if version.parse(torch.__version__) >= version.parse("1.12.0"):
 
 from colossalai.interface import OptimizerWrapper
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 
 # test basic fsdp function
@@ -42,7 +42,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
 
 
 def check_torch_fsdp_plugin():
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         if any(
             element in name
             for element in [
diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
index 8343c5f07..49fd85ffb 100644
--- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
@@ -7,6 +7,7 @@ from transformers import LlamaForCausalLM
 from utils import shared_tempdir
 
 import colossalai
+from colossalai.testing import skip_if_not_enough_gpus
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin
 from colossalai.lazy import LazyInitContext
@@ -68,7 +69,7 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
 @clear_cache_before_run()
 @parameterize("placement_config", OPTIM_PLACEMENT_CONFIGS)
 @parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 @parameterize("size_per_shard", [32])
 @parameterize("tp_size", [1, 2])
 @parameterize("zero_size", [2])
@@ -156,13 +157,12 @@ def run_dist(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.parametrize("world_size", [4])
 @rerun_if_address_is_in_use()
-def test_gemini_ckpIO(world_size):
-    spawn(run_dist, world_size)
+def test_gemini_ckpIO():
+    spawn(run_dist, 4)
 
 
 @pytest.mark.largedist
-@pytest.mark.parametrize("world_size", [8])
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_if_address_is_in_use()
-def test_gemini_ckpIO_3d(world_size):
-    spawn(run_dist, world_size)
\ No newline at end of file
+def test_gemini_ckpIO_3d():
+    spawn(run_dist, 8)
\ No newline at end of file
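skip_if_not_enough_gpus comes from colossalai.testing (imported at the top of this file) and replaces the fixed world_size parametrization for the 8-GPU test. A minimal sketch of the shape such a guard can take, assuming it only compares torch.cuda.device_count() against the requirement — see colossalai/testing for the actual implementation:

    import functools

    import pytest
    import torch

    def skip_if_not_enough_gpus(min_gpus: int):
        # Sketch only: skip the decorated test when fewer than `min_gpus`
        # GPUs are visible on the host.
        def decorator(fn):
            @functools.wraps(fn)
            def wrapper(*args, **kwargs):
                if torch.cuda.device_count() < min_gpus:
                    pytest.skip(f"requires at least {min_gpus} GPUs")
                return fn(*args, **kwargs)
            return wrapper
        return decorator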
["ddp", "zero", "gemini"]) def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32): (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next( diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py index a1b5763d4..ee50e5b61 100644 --- a/tests/test_lazy/test_models.py +++ b/tests/test_lazy/test_models.py @@ -1,11 +1,11 @@ import pytest from lazy_init_utils import SUPPORT_LAZY, check_lazy_init -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS @pytest.mark.skipif(not SUPPORT_LAZY, reason="requires torch >= 1.12.0") -@pytest.mark.parametrize("subset", ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"]) +@pytest.mark.parametrize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"]) @pytest.mark.parametrize("default_device", ["cpu", "cuda"]) def test_torchvision_models_lazy_init(subset, default_device): sub_model_zoo = model_zoo.get_sub_registry(subset)