Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-09 21:09:18 +00:00
[test] skip tests when not enough GPUs are detected (#1090)
* [test] skip tests when not enough GPUs are detected
* polish code
* polish code
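The change replaces the previous unconditional @pytest.mark.skip with a skip_if_not_enough_gpus decorator that skips only when fewer CUDA devices are visible than the test needs. As a rough illustration of the idea, a decorator with this behavior could look like the sketch below; this is a hypothetical implementation, not the actual code added to colossalai.testing:

# Hypothetical sketch of a GPU-count guard; the real
# colossalai.testing.skip_if_not_enough_gpus may be implemented differently.
import pytest
import torch


def skip_if_not_enough_gpus(min_gpus: int):
    """Skip the decorated test when fewer than `min_gpus` CUDA devices are visible."""
    def decorator(func):
        available = torch.cuda.device_count()
        return pytest.mark.skipif(
            available < min_gpus,
            reason=f"not enough GPUs: {available} detected, {min_gpus} required",
        )(func)
    return decorator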
@@ -15,7 +15,7 @@ from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.utils import free_port, is_using_pp
 from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint
-from colossalai.testing import rerun_on_exception
+from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
 
 
 def build_pipeline(model):
@@ -67,7 +67,7 @@ def check_checkpoint_1d(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip("This test should be invoked with 8 GPUs")
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_1d():
     world_size = 8
@@ -15,7 +15,7 @@ from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.utils import free_port, get_current_device, is_using_pp
 from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint
-from colossalai.testing import rerun_on_exception
+from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
 
 
 def build_pipeline(model):
@@ -67,7 +67,7 @@ def check_checkpoint_2d(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip("This test should be invoked with 8 GPUs")
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_2d():
     world_size = 8
@@ -15,7 +15,7 @@ from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.utils import free_port, get_current_device, is_using_pp
 from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint
-from colossalai.testing import rerun_on_exception
+from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
 
 
 def build_pipeline(model):
@@ -37,7 +37,6 @@ def build_pipeline(model):
 def check_equal(A, B):
     assert torch.allclose(A, B, rtol=1e-3, atol=1e-2)
 
 
-
 def check_checkpoint_2p5d(rank, world_size, port):
     config = dict(parallel=dict(pipeline=dict(size=2), tensor=dict(size=4, depth=1, mode="2.5d")),)
@@ -67,7 +66,7 @@ def check_checkpoint_2p5d(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip("This test should be invoked with 8 GPUs")
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_2p5d():
     world_size = 8
@@ -15,7 +15,7 @@ from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.utils import free_port, get_current_device, is_using_pp
 from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint
-from colossalai.testing import rerun_on_exception
+from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
 
 
 def build_pipeline(model):
@@ -37,7 +37,6 @@ def build_pipeline(model):
 def check_equal(A, B):
     assert torch.allclose(A, B, rtol=1e-3, atol=1e-2)
 
 
-
 def check_checkpoint_3d(rank, world_size, port):
     config = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=8, mode="3d")),)
@@ -67,7 +66,7 @@ def check_checkpoint_3d(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip("This test requires 8 GPUs to execute")
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_3d():
     world_size = 8
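For context, the decorated tests follow the usual distributed-test pattern: each test_checkpoint_* entry point spawns world_size processes, and every process runs the matching check_checkpoint_*(rank, world_size, port) worker. The test bodies are elided from this diff, so the harness below is a minimal sketch inferred from the visible signatures (run_dist_test and run_func are illustrative names, not names from the diff):

# Minimal sketch of the spawn harness implied by the signatures above;
# the actual test bodies may differ.
from functools import partial

import torch.multiprocessing as mp

from colossalai.utils import free_port


def run_dist_test(worker, world_size: int):
    # mp.spawn calls run_func(rank) in each child process; world_size and
    # port are bound via partial, matching worker(rank, world_size, port).
    run_func = partial(worker, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)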