Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-06-27 15:57:16 +00:00

[workflow] fixed build CI (#5240)

* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish

parent 41e52c1c6e
commit edf94a35c3
.github/workflows/build_on_pr.yml (vendored, 136 lines changed)

@@ -22,57 +22,6 @@ on:
   delete:
 
 jobs:
-  prepare_cache:
-    name: Prepare testmon cache
-    if: |
-      github.event_name == 'create' &&
-      github.event.ref_type == 'branch' &&
-      github.event.repository.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Copy testmon cache
-        run: | # branch name may contain slash, we need to replace it with space
-          export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
-          if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
-            cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
-          fi
-        env:
-          MAIN_BRANCH: ${{ github.event.master_branch }}
-
-  prepare_cache_for_pr:
-    name: Prepare testmon cache for PR
-    if: |
-      github.event_name == 'pull_request' &&
-      (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
-      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache
-      cancel-in-progress: true
-    steps:
-      - name: Copy testmon cache
-        run: | # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
-          if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
-            mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
   detect:
     name: Detect file change
     if: |
@@ -140,7 +89,7 @@ jobs:
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 60
     defaults:
@@ -174,6 +123,7 @@ jobs:
         run: |
           cd TensorNVMe
           cp -p -r ./build /github/home/tensornvme_cache/
+          cp -p -r ./cmake-build /github/home/tensornvme_cache/
 
       - name: Checkout Colossal-AI
         uses: actions/checkout@v2
@@ -198,31 +148,27 @@ jobs:
           # -p flag is required to preserve the file timestamp to avoid ninja rebuild
           cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
 
-      - name: Restore Testmon Cache
-        run: |
-          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
-            cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
       - name: Execute Unit Testing
         run: |
-          CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
+          CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
+            -m "not largedist" \
+            --durations=0 \
+            --ignore tests/test_analyzer \
+            --ignore tests/test_auto_parallel \
+            --ignore tests/test_fx \
+            --ignore tests/test_autochunk \
+            --ignore tests/test_gptq \
+            --ignore tests/test_infer_ops \
+            --ignore tests/test_legacy \
+            --ignore tests/test_moe \
+            --ignore tests/test_smoothquant \
+            --ignore tests/test_checkpoint_io \
+            tests/
         env:
-          DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-          TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
           LLAMA_PATH: /data/scratch/llama-tiny
 
-      - name: Store Testmon Cache
-        run: |
-          mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
-          cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
       - name: Collate artifact
         env:
           PR_NUMBER: ${{ github.event.number }}
@@ -260,53 +206,3 @@ jobs:
           name: report
           path: report/
 
-  store_cache:
-    name: Store testmon cache for PR
-    if: |
-      github.event_name == 'pull_request' &&
-      github.event.action == 'closed' &&
-      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Store testmon cache if possible
-        if: github.event.pull_request.merged == true
-        run: | # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
-          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
-            cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-      - name: Remove testmon cache
-        run: |
-          rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-  remove_cache:
-    name: Remove testmon cache
-    if: |
-      github.event_name == 'delete' &&
-      github.event.ref_type == 'branch' &&
-      github.event.repository.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Remove testmon cache
-        run: | # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
-          rm -rf "/github/home/testmon_cache/${BASE}"
.github/workflows/build_on_schedule.yml (vendored, 15 lines changed)

@@ -10,20 +10,20 @@ jobs:
   build:
     name: Build and Test Colossal-AI
     if: github.repository == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, 8-gpu]
+    runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
-    timeout-minutes: 40
+    timeout-minutes: 90
     steps:
       - name: Check GPU Availability # ensure all GPUs have enough memory
        id: check-avai
        run: |
          avai=true
-          for i in $(seq 0 7);
+          for i in $(seq 0 3);
          do
            gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
-            [ "$gpu_used" -gt "10000" ] && avai=false
+            [ "$gpu_used" -gt "2000" ] && avai=false
          done
 
          echo "GPU is available: $avai"
@@ -60,9 +60,12 @@ jobs:
       - name: Unit Testing
         if: steps.check-avai.outputs.avai == 'true'
         run: |
-          PYTHONPATH=$PWD pytest --durations=0 tests
+          PYTHONPATH=$PWD pytest \
+            -m "not largedist" \
+            --durations=0 \
+            tests/
         env:
-          DATA: /data/scratch/cifar-10
+          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
 
.github/workflows/doc_test_on_schedule.yml (vendored, 2 lines changed)

@@ -12,7 +12,7 @@ jobs:
     name: Test the changed Doc
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
       options: --gpus all --rm
     timeout-minutes: 60
     steps:
Test suite changes:

@@ -1,5 +1,33 @@
-from . import custom, diffusers, timm, torchaudio, torchrec, torchvision, transformers
+import os
+
+from . import custom, diffusers, timm, torchaudio, torchvision, transformers
 from .executor import run_fwd, run_fwd_bwd
 from .registry import model_zoo
 
-__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd"]
+# We pick a subset of models for fast testing in order to reduce the total testing time
+COMMON_MODELS = [
+    'custom_hanging_param_model',
+    'custom_nested_model',
+    'custom_repeated_computed_layers',
+    'custom_simple_net',
+    'diffusers_clip_text_model',
+    'diffusers_auto_encoder_kl',
+    'diffusers_unet2d_model',
+    'timm_densenet',
+    'timm_resnet',
+    'timm_swin_transformer',
+    'torchaudio_wav2vec2_base',
+    'torchaudio_conformer',
+    'transformers_bert_for_masked_lm',
+    'transformers_bloom_for_causal_lm',
+    'transformers_falcon_for_causal_lm',
+    'transformers_chatglm_for_conditional_generation',
+    'transformers_llama_for_casual_lm',
+    'transformers_vit_for_masked_image_modeling',
+    'transformers_mistral_for_casual_lm'
+]
+
+IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1'
+
+
+__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST']
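The FAST_TEST switch introduced above is a plain environment-variable gate that the CI job enables by exporting FAST_TEST=1. A minimal sketch of how a test module can consume it, mirroring the pattern used in the plugin tests below (the helper name pick_registry is illustrative, not part of this commit):

    from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo


    def pick_registry():
        # With FAST_TEST=1 set by the CI job, only the curated COMMON_MODELS
        # subset is exercised; otherwise the full model zoo is iterated.
        if IS_FAST_TEST:
            return model_zoo.get_sub_registry(COMMON_MODELS)
        return model_zoo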
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 from dataclasses import dataclass
-from typing import Callable
+from typing import Callable, List, Union
 
 __all__ = ["ModelZooRegistry", "ModelAttribute", "model_zoo"]
 
@@ -61,7 +61,7 @@ class ModelZooRegistry(dict):
         """
         self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute)
 
-    def get_sub_registry(self, keyword: str):
+    def get_sub_registry(self, keyword: Union[str, List[str]]):
         """
         Get a sub registry with models that contain the keyword.
 
@@ -70,12 +70,15 @@ class ModelZooRegistry(dict):
         """
         new_dict = dict()
 
-        for k, v in self.items():
-            if keyword == "transformers_gpt":
-                if keyword in k and not "gptj" in k:  # ensure GPT2 does not retrieve GPTJ models
-                    new_dict[k] = v
-            else:
-                if keyword in k:
+        if isinstance(keyword, str):
+            keyword_list = [keyword]
+        else:
+            keyword_list = keyword
+        assert isinstance(keyword_list, (list, tuple))
+
+        for k, v in self.items():
+            for kw in keyword_list:
+                if kw in k:
                     new_dict[k] = v
 
         assert len(new_dict) > 0, f"No model found with keyword {keyword}"
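With the widened signature, get_sub_registry accepts either a single keyword or a list of names. A short usage sketch under that new signature (the model names come from COMMON_MODELS defined earlier; the variable names here are only for illustration):

    from tests.kit.model_zoo import COMMON_MODELS, model_zoo

    # Single keyword, as before: selects every model whose registered name contains it.
    bert_models = model_zoo.get_sub_registry("transformers_bert")

    # New in this commit: a list of names selects the union of all matches,
    # which is how the fast-test subset is pulled out of the full zoo.
    fast_models = model_zoo.get_sub_registry(COMMON_MODELS)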
@@ -13,7 +13,7 @@ from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
 
 
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
@@ -66,7 +66,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t
 # @parameterize('init_method', ['lazy', 'none', 'colo'])
 
 
-@parameterize("subset", ["torchvision", "transformers", "diffusers"])
+@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"])
 @parameterize("init_method", ["none"])
 @parameterize("zero_size", [2])
 @parameterize("tp_size", [2])
@@ -11,7 +11,7 @@ from colossalai.booster.plugin import LowLevelZeroPlugin
 
 # from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 # These models are not compatible with AMP
 _AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"]
@@ -62,7 +62,12 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
     ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS
     skipped_models = []
 
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         # FIXME(ver217): fix these models
         if name in ignore_models:
             skipped_models.append(name)
@@ -11,7 +11,7 @@ from colossalai.booster import Booster
 from colossalai.booster.plugin import TorchDDPPlugin
 from colossalai.interface import OptimizerWrapper
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 
 def run_fn(model_fn, data_gen_fn, output_transform_fn):
@@ -40,7 +40,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
 
 
 def check_torch_ddp_plugin():
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         if name == "dlrm_interactionarch":
             continue
         run_fn(model_fn, data_gen_fn, output_transform_fn)
@@ -12,7 +12,7 @@ if version.parse(torch.__version__) >= version.parse("1.12.0"):
 
 from colossalai.interface import OptimizerWrapper
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 
 # test basic fsdp function
@@ -42,7 +42,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
 
 
 def check_torch_fsdp_plugin():
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         if any(
             element in name
             for element in [
@@ -7,6 +7,7 @@ from transformers import LlamaForCausalLM
 from utils import shared_tempdir
 
 import colossalai
+from colossalai.testing import skip_if_not_enough_gpus
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin
 from colossalai.lazy import LazyInitContext
@@ -68,7 +69,7 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
 @clear_cache_before_run()
 @parameterize("placement_config", OPTIM_PLACEMENT_CONFIGS)
 @parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 @parameterize("size_per_shard", [32])
 @parameterize("tp_size", [1, 2])
 @parameterize("zero_size", [2])
@@ -156,13 +157,12 @@ def run_dist(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.parametrize("world_size", [4])
 @rerun_if_address_is_in_use()
-def test_gemini_ckpIO(world_size):
-    spawn(run_dist, world_size)
+def test_gemini_ckpIO():
+    spawn(run_dist, 4)
 
 @pytest.mark.largedist
-@pytest.mark.parametrize("world_size", [8])
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_if_address_is_in_use()
-def test_gemini_ckpIO_3d(world_size):
-    spawn(run_dist, world_size)
+def test_gemini_ckpIO_3d():
+    spawn(run_dist, 8)
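The hunk above replaces the world-size parametrization with an explicit size passed to spawn, and guards the 8-GPU variant with skip_if_not_enough_gpus so it is skipped rather than failed on smaller runners. A minimal sketch of the resulting pattern, assuming a run_dist(rank, world_size, port) entry point like the one in this test file (the test name below is hypothetical):

    import pytest

    from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus, spawn


    @pytest.mark.largedist
    @skip_if_not_enough_gpus(min_gpus=8)  # skip on runners with fewer than 8 GPUs
    @rerun_if_address_is_in_use()
    def test_example_8gpu():
        spawn(run_dist, 8)  # world size passed explicitly instead of parametrized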
@@ -20,7 +20,7 @@ from tests.kit.model_zoo import model_zoo
 
 @clear_cache_before_run()
 @parameterize("shard", [False, True])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 def exam_torch_load_from_gemini(shard: bool, model_name: str):
     (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
     criterion = lambda x: x.mean()
@@ -40,7 +40,7 @@ else:
 
 @clear_cache_before_run()
 @parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 @parameterize("size_per_shard", [32])
 @parameterize("test_config", TEST_CONFIGS)
 def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict):
@@ -18,7 +18,7 @@ from tests.kit.model_zoo import model_zoo
 
 
 @clear_cache_before_run()
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 @parameterize("plugin_type", ["ddp", "zero", "gemini"])
 def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32):
     (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
@@ -1,11 +1,11 @@
 import pytest
 from lazy_init_utils import SUPPORT_LAZY, check_lazy_init
 
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 
 @pytest.mark.skipif(not SUPPORT_LAZY, reason="requires torch >= 1.12.0")
-@pytest.mark.parametrize("subset", ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"])
+@pytest.mark.parametrize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"])
 @pytest.mark.parametrize("default_device", ["cpu", "cuda"])
 def test_torchvision_models_lazy_init(subset, default_device):
     sub_model_zoo = model_zoo.get_sub_registry(subset)