[gemini] fixes for benchmarking (#5847)

* [gemini] fix missing return

* [gemini] fix missing arg pass

* [gemini] use gather tensor instead of list (see the sketch below)

* [test] enable flash attention for benchmark by default

* [test] enable flash attention for benchmark by default

---------
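Note: the "use gather tensor instead of list" item lands in a hunk that is not shown on this page, so the snippet below is only a general illustration of that pattern in torch.distributed, not the actual Gemini change; all names here are assumptions. Gathering into one preallocated tensor avoids building a Python list of per-rank tensors plus the extra concatenation copy.

# Sketch only: gather into a single preallocated tensor instead of a list of tensors.
import torch
import torch.distributed as dist

def gather_as_list(local: torch.Tensor, world_size: int) -> torch.Tensor:
    # List-based pattern: world_size temporary tensors plus a concat copy afterwards.
    bufs = [torch.empty_like(local) for _ in range(world_size)]
    dist.all_gather(bufs, local)
    return torch.cat(bufs)

def gather_as_tensor(local: torch.Tensor, world_size: int) -> torch.Tensor:
    # Tensor-based pattern: one contiguous output buffer, no per-rank list.
    out = torch.empty(world_size * local.numel(), dtype=local.dtype, device=local.device)
    dist.all_gather_into_tensor(out, local)
    return out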

Co-authored-by: genghaozhe <939857490@qq.com>
botbw
2024-06-26 15:52:09 +08:00
committed by GitHub
parent 2a25a2aff7
commit 8e718a1421
5 changed files with 27 additions and 15 deletions


@@ -387,6 +387,7 @@ class GeminiDDP(ModelWrapper):
         p: nn.Parameter,
         async_reduce_stream: Optional[torch.cuda.Stream] = None,
     ):
+        async_reduce_scatter = async_reduce_stream is not None
         setattr(p, "_gemini_reduced", True)
         empty_grad = torch.empty_like(grad)
         free_storage(empty_grad)
@@ -426,7 +427,7 @@ class GeminiDDP(ModelWrapper):
             async_reduce_stream.wait_stream(torch.cuda.current_stream())

         with torch.cuda.stream(async_reduce_stream):
-            reduced = chunk_manager.reduce_chunk(grad_chunk, async_op=(async_reduce_stream is not None))
+            reduced = chunk_manager.reduce_chunk(grad_chunk, async_op=async_reduce_scatter)
         if reduced:
             grad_chunk.wait_async_reduce()
             if not chunk_manager.reuse_fp16_chunk:
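For context, the flag hoisted in the first hunk is what keeps the async_op value here consistent with the surrounding stream handling: the reduce only runs asynchronously when a side stream was actually supplied. A minimal sketch of that pattern with plain torch.distributed follows; the tensors, group, and helper name are assumptions, and the ChunkManager internals are not shown on this page.

# Sketch, not the ChunkManager implementation: reduce-scatter on an optional side stream.
import torch
import torch.distributed as dist

def reduce_scatter_on_stream(full_grad, stream=None, group=None):
    async_reduce = stream is not None              # mirrors async_reduce_scatter above
    world_size = dist.get_world_size(group)
    shard = torch.empty(full_grad.numel() // world_size,
                        dtype=full_grad.dtype, device=full_grad.device)
    if async_reduce:
        stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(stream):                # stream=None keeps the current stream
        work = dist.reduce_scatter_tensor(shard, full_grad, group=group, async_op=async_reduce)
    if async_reduce:
        work.wait()                                # counterpart of grad_chunk.wait_async_reduce()
    return shard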
@@ -447,9 +448,13 @@ class GeminiDDP(ModelWrapper):
             # record l2 norm for gradient clipping. flag is bound to fp16 chunk
             if chunk.l2_norm_flag:
                 grad_chunk.set_l2_norm()
-            chunk_manager.move_chunk(grad_chunk, grads_device[p], force_copy=True)
+            chunk_manager.move_chunk(
+                grad_chunk, grads_device[p], force_copy=True, async_move=async_reduce_scatter
+            )
             if not (master_weights) or (enable_gradient_accumulation):
-                chunk_manager.move_chunk(chunk, grads_device[p], force_copy=True)
+                chunk_manager.move_chunk(
+                    chunk, grads_device[p], force_copy=True, async_move=async_reduce_scatter
+                )
         return empty_grad

     def zero_grad(self, set_to_none: bool = False) -> None:
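The same flag is also forwarded to move_chunk as async_move, so the post-reduce device placement does not have to block the default stream. The move_chunk internals are not part of this page; as a rough, assumption-level sketch, an "async move" typically amounts to a non-blocking copy issued on a side stream that the consumer later waits on.

# Illustrative only (not the ChunkManager API): non-blocking tensor move on a side stream.
import torch

def async_move(t: torch.Tensor, device: torch.device, stream: torch.cuda.Stream) -> torch.Tensor:
    stream.wait_stream(torch.cuda.current_stream())   # start after the producer kernels
    with torch.cuda.stream(stream):
        # Overlap needs a CUDA destination or pinned host memory; otherwise this is synchronous.
        moved = t.to(device, non_blocking=True)
    t.record_stream(stream)                           # keep the source buffer alive for the copy
    return moved

# Before the default stream reads `moved`:
#     torch.cuda.current_stream().wait_stream(stream)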