Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-20 00:55:29 +00:00
[misc] update torch version (#6206)
* [misc] update torch version
* fix test
* fix test
* fix test
* fix test
@@ -1,7 +1,7 @@
 from colossalai.cluster.device_mesh_manager import DeviceMeshInfo, DeviceMeshManager
 from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
-from colossalai.testing import spawn
+from colossalai.testing import rerun_if_address_is_in_use, spawn


 def check_device_mesh_manager(rank, world_size, port):
@@ -24,6 +24,7 @@ def check_device_mesh_manager(rank, world_size, port):
     assert device_mesh_with_shape._logical_mesh_id.tolist() == [[0, 1], [2, 3]]


+@rerun_if_address_is_in_use()
 def test_device_mesh_manager():
     spawn(check_device_mesh_manager, 4)

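The first two hunks wrap the spawn-based device mesh test with colossalai.testing.rerun_if_address_is_in_use, so the whole test is retried when the rendezvous port left over from a previous run is still bound. A minimal sketch of that pattern, assuming an illustrative check function and launch arguments (host/backend are assumptions, and the exact launch signature can differ across ColossalAI versions):

from colossalai.initialize import launch
from colossalai.testing import rerun_if_address_is_in_use, spawn


def check_example(rank, world_size, port):
    # Each spawned worker joins the same rendezvous port; launch() initializes the
    # distributed backend for this rank (host/backend values here are assumptions).
    launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="gloo")
    assert world_size == 4  # placeholder check standing in for the real assertions


@rerun_if_address_is_in_use()  # retry the test if the port is already in use
def test_example():
    spawn(check_example, 4)  # run check_example on 4 worker processes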
@@ -51,7 +51,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     if test_config["precision"] == "fp32":
         atol, rtol = 1e-5, 1e-3
     else:
-        atol, rtol = 5e-2, 5e-2
+        atol, rtol = 9e-2, 0
     if (stage_manager is None or stage_manager.is_first_stage()) and booster.plugin.zero_stage == 0:
         row_layer_grads = get_grad_tensors_for_check(
             t5, sharded_t5, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0
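The last hunk relaxes the non-fp32 gradient check from atol=5e-2, rtol=5e-2 to atol=9e-2, rtol=0, i.e. a flat absolute bound that no longer scales with the reference magnitude. A hedged illustration of the difference using PyTorch's allclose criterion |a - b| <= atol + rtol * |b|; the tensors are made up, only the tolerance values come from the diff:

import torch

grad_ref = torch.tensor([1.00, 2.00, 3.00])    # made-up reference gradients
grad_shard = torch.tensor([1.08, 2.05, 2.93])  # made-up sharded-model gradients

# Old bound: 0.05 + 0.05 * |ref|, so the allowed error grows with the reference value.
print(torch.allclose(grad_shard, grad_ref, atol=5e-2, rtol=5e-2))  # True
# New bound: flat 0.09 regardless of magnitude, stricter for large values and
# slightly looser for small ones.
print(torch.allclose(grad_shard, grad_ref, atol=9e-2, rtol=0.0))   # True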