mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-16 22:52:25 +00:00
[checkpointio] support debug log (#6153)
* [checkpointio] support debug log
* [checkpointio] refactor async writer api
* fix test
* fix test
This commit is contained in:
@@ -10,7 +10,7 @@ from colossalai.logging import disable_existing_loggers
|
||||
from colossalai.nn.optimizer import DistributedLamb, Lamb
|
||||
from colossalai.tensor.d_tensor import get_shard_dim_1d, is_distributed_tensor
|
||||
from colossalai.tensor.d_tensor.api import clear_layout_converter
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
|
||||
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
|
||||
from colossalai.testing.random import seed_all
|
||||
from colossalai.zero import LowLevelZeroOptimizer
|
||||
from tests.kit.model_zoo import model_zoo
|
||||
@@ -108,6 +108,7 @@ def set_dist_grad(
|
||||
@parameterize("p_g_dtype", _ALLOWED_P_G_TYPES)
|
||||
@parameterize("bias_correction", [False, True])
|
||||
@parameterize("tp_zero_size", [(1, 4), (4, 1), (2, 2)])
|
||||
@clear_cache_before_run()
|
||||
def run_dist_lamb_basic(
|
||||
bias_correction: bool, p_g_dtype: tuple[torch.dtype, torch.dtype], tp_zero_size: tuple[int, int]
|
||||
) -> None:
|
||||
@@ -177,6 +178,7 @@ def run_dist_lamb_basic(
|
||||
@parameterize("p_g_dtype", _ALLOWED_P_G_TYPES)
|
||||
@parameterize("bias_correction", [False, True])
|
||||
@parameterize("tp_zero_size", [(2, 2), (4, 1), (1, 4)])
|
||||
@clear_cache_before_run()
|
||||
def run_dist_lamb_fwd_bwd(
|
||||
bias_correction: bool, p_g_dtype: tuple[torch.dtype, torch.dtype], tp_zero_size: tuple[int, int]
|
||||
) -> None:
|
||||
|
Reference in New Issue
Block a user