[test] refactored with the new rerun decorator (#763)

* [test] refactored with the new rerun decorator
* polish test case
2025-09-13 05:01:44 +00:00 · 2022-04-15 00:33:04 +08:00
parent deaf99f4c9
commit 5a1a095b92
34 changed files with 80 additions and 75 deletions
--- a/tests/test_utils/test_commons.py
+++ b/tests/test_utils/test_commons.py
@@ -1,6 +1,6 @@
 from colossalai.zero.sharded_param.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
 from colossalai.utils import free_port
-from colossalai.testing import rerun_on_exception
+from colossalai.testing import rerun_if_address_is_in_use
 from colossalai.zero.sharded_param import ShardedTensor
 import colossalai

@@ -35,7 +35,7 @@ def run_tensor_move(rank):
    assert (tgt_t.device.type == 'cpu')


-@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
+@rerun_if_address_is_in_use()
 def test_tensor_move():
    mp.spawn(run_tensor_move, nprocs=1)

--- a/tests/test_utils/test_gradient_accumluation.py
+++ b/tests/test_utils/test_gradient_accumluation.py
@@ -3,6 +3,7 @@ from functools import partial
 from pathlib import Path

 import colossalai
+from colossalai.testing.utils import rerun_if_address_is_in_use
 import pytest
 import torch
 import torch.multiprocessing as mp
@@ -10,7 +11,7 @@ import torch.nn as nn
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.utils import free_port, get_dataloader
-from colossalai.testing import rerun_on_exception
+from colossalai.testing import rerun_if_address_is_in_use
 from torch.optim import Adam
 from torchvision import transforms
 from torchvision.datasets import CIFAR10
@@ -87,7 +88,7 @@ def run_no_pipeline(rank, world_size, port):


@pytest.mark.dist
-@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
+@rerun_if_address_is_in_use()
 def test_engine():
    world_size = 4
    func = partial(run_no_pipeline, world_size=world_size, port=free_port())
--- a/tests/test_utils/test_zero_gradient_clippling.py
+++ b/tests/test_utils/test_zero_gradient_clippling.py
@@ -16,7 +16,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
 from colossalai.zero.shard_utils.tensor_shard_strategy import TensorShardStrategy
 from functools import partial
-from colossalai.testing import parameterize, rerun_on_exception
+from colossalai.testing import parameterize, rerun_if_address_is_in_use


 def checkpoint_wrapper(module, enable=True):
@@ -102,7 +102,7 @@ def run_dist(rank, world_size, port):


@pytest.mark.dist
-@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
+@rerun_if_address_is_in_use()
 def test_zero_clip_grad():
    world_size = 4
    run_func = partial(run_dist, world_size=world_size, port=free_port())