[Hotfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2025-09-04 18:40:28 +00:00 · 2024-07-05 20:02:36 +08:00
parent 3420921101
commit 8ec24b6a4d
4 changed files with 7 additions and 6 deletions
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -3,6 +3,12 @@

 import os

+# set CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that when overlapping communication and computation,
+# the order of kernel launches on GPUs is the same as on the CPU so that comm is launched first.
+# see https://github.com/NVIDIA/Megatron-LM/issues/533
+# https://forums.developer.nvidia.com/t/how-many-streams-maximum-number-of-streams/6571/16
+os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+
 import torch.distributed as dist

 from colossalai.accelerator import get_accelerator