[fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)

* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
2025-09-05 11:02:05 +00:00 · 2024-09-14 10:40:01 +08:00
parent 696fced0d7
commit f20b066c59
8 changed files with 43 additions and 147 deletions
--- a/tests/test_fp8/test_fp8_all_to_all.py
+++ b/tests/test_fp8/test_fp8_all_to_all.py
@@ -5,7 +5,7 @@ from torch.testing import assert_close

 from colossalai import launch
 from colossalai.accelerator import get_accelerator
-from colossalai.quantization.fp8 import all_to_all_fp8
+from colossalai.quantization.fp8 import _all_to_all_fp8
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn


@@ -20,7 +20,7 @@ def check_4gpu(shape, scatter_dim, dtype, fp8_format):
    input_tensor_list = [x.contiguous() for x in input_tensor_list]
    output_tensor_list_fp8 = [torch.empty_like(x) for x in input_tensor_list]
    output_tensor_list = [torch.empty_like(x) for x in input_tensor_list]
-    all_to_all_fp8(output_tensor_list_fp8, input_tensor_list, group=_get_default_group(), fp8_format=fp8_format)
+    _all_to_all_fp8(output_tensor_list_fp8, input_tensor_list, group=_get_default_group(), fp8_format=fp8_format)
    dist.all_to_all(output_tensor_list, input_tensor_list, group=_get_default_group())
    assert_close(output_tensor_list_fp8, output_tensor_list, rtol=0.1, atol=0.1)

--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -5,22 +5,13 @@ from torch.testing import assert_close

 from colossalai import launch
 from colossalai.accelerator import get_accelerator
-from colossalai.quantization.fp8 import gather_fp8
+from colossalai.quantization.fp8 import _all_gather_fp8
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn


@parameterize(
    "shape",
-    [
-        (3, 7),
-        (2, 1),
-        (1, 2),
-        (2, 2),
-        (4, 2),
-        (5,),
-        (4,),
-        (2,),
-    ],
+    [(3, 7, 16)],
 )
@parameterize("dtype", [torch.bfloat16, torch.float16])
@parameterize("fp8_format", ["e4m3", "e5m2"])
@@ -30,7 +21,9 @@ def check_4gpu(shape, dtype, fp8_format, async_op):
    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
    output_list = [torch.empty_like(x) for _ in range(world_size)]
    output_list_fp8 = [torch.empty_like(x) for _ in range(world_size)]
-    fp8_handle = gather_fp8(output_list_fp8, x, group=_get_default_group(), fp8_format=fp8_format, async_op=async_op)
+    fp8_handle = _all_gather_fp8(
+        output_list_fp8, x, group=_get_default_group(), fp8_format=fp8_format, async_op=async_op
+    )
    origin_hanle = dist.all_gather(output_list, x, group=_get_default_group(), async_op=async_op)
    if async_op:
        fp8_handle.wait()
--- a/tests/test_fp8/test_fp8_allgather_flat.py
+++ b/tests/test_fp8/test_fp8_allgather_flat.py
@@ -1,43 +0,0 @@
-import torch
-import torch.distributed as dist
-import torch.nn.functional as F
-from torch.distributed.distributed_c10d import _get_default_group
-from torch.testing import assert_close
-
-from colossalai import launch
-from colossalai.accelerator import get_accelerator
-from colossalai.quantization.fp8 import all_gather_into_tensor_flat_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-
-
-@parameterize("shape", [(3, 7), (2, 1), (1, 2), (2, 2), (4, 2), (5,), (4,), (2,)])
-@parameterize("dtype", [torch.bfloat16, torch.float16])
-@parameterize("async_op", [True, False])
-def check_4gpu(shape, dtype, async_op):
-    world_size = dist.get_world_size()
-    rank = dist.get_rank()
-    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
-    flat_padded_x = x.view(-1)
-    if flat_padded_x.size(0) % world_size != 0:
-        pad_size = world_size - flat_padded_x.size(0) % world_size
-        flat_padded_x = F.pad(flat_padded_x, (0, pad_size))
-    output = torch.empty_like(flat_padded_x)
-    chunk = flat_padded_x.chunk(world_size)[rank].clone()
-    handle = all_gather_into_tensor_flat_fp8(output, chunk, x.shape, group=_get_default_group(), async_op=async_op)
-    if async_op:
-        handle.wait()
-    assert_close(output[: x.numel()], x.view(-1), rtol=0.1, atol=0.1)
-
-
-def run_dist(rank, world_size, port):
-    launch(rank=rank, world_size=world_size, port=port, host="localhost")
-    check_4gpu()
-
-
-@rerun_if_address_is_in_use()
-def test_all_gather_flat():
-    spawn(run_dist, 4)
-
-
-if __name__ == "__main__":
-    test_all_gather_flat()