[fp8] hotfix backward hook (#6053)

* [fp8] hotfix backward hook

* [fp8] hotfix pipeline loss accumulation
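The commit body gives no detail beyond the two bullets. As background for the first bullet, here is a minimal sketch of the mechanism it refers to: a module-level backward hook registered via PyTorch's register_full_backward_hook. The hook body (upcasting gradients) and all names are illustrative assumptions, not the commit's actual fix.

import torch
import torch.nn as nn

# Illustrative only: a full backward hook that upcasts incoming gradients,
# roughly the kind of hook an fp8 integration would register.
def _upcast_grad_input(module, grad_input, grad_output):
    # Returning a tuple replaces grad_input; None entries must be preserved.
    return tuple(g.float() if g is not None else None for g in grad_input)

linear = nn.Linear(4, 4)
handle = linear.register_full_backward_hook(_upcast_grad_input)

linear(torch.randn(2, 4)).sum().backward()
handle.remove()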
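Likewise, a hedged sketch of the pattern the second bullet touches: in a pipeline schedule, each microbatch's loss is detached before being added to a running total, so the accumulator stays out of the autograd graph. All names and shapes here are assumptions for illustration.

import torch

# Stand-in parameters and a running loss accumulator (illustrative names).
params = torch.randn(4, requires_grad=True)
accum_loss = torch.zeros(())

for _ in range(4):  # one iteration per microbatch
    loss = (params * torch.randn(4)).sum()
    loss.backward()  # gradients accumulate into params.grad
    accum_loss.add_(loss.detach())  # detach: keep the accumulator graph-free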
Author:    Hongxin Liu
Date:      2024-09-11 16:11:25 +08:00
Committed: GitHub
Parent:    c54c4fcd15
Commit:    13946c4448

6 changed files with 31 additions and 17 deletions


@@ -9,6 +9,7 @@ import os
 # https://forums.developer.nvidia.com/t/how-many-streams-maximum-number-of-streams/6571/16
 os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
 
+import torch
 import torch.distributed as dist
 
 from colossalai.accelerator import get_accelerator
@@ -64,6 +65,11 @@ def launch(
     set_seed(seed)
 
+    try:
+        torch._dynamo.config.optimize_ddp = world_size > 1
+    except AttributeError:
+        pass
+
     if verbose:
         logger = get_dist_logger()
         logger.info(f"Distributed environment is initialized, world size: {dist.get_world_size()}", ranks=[0])
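A note on the guarded assignment above: torch._dynamo was only merged into core PyTorch for the 2.0 release, so touching torch._dynamo.config on older installs raises AttributeError; the try/except keeps launch() working on both. A standalone sketch of the same pattern as a reusable helper (the helper name is illustrative):

import torch

def _maybe_enable_dynamo_ddp_optimization(world_size: int) -> None:
    """Enable torch.compile's DDP graph splitting only where it exists (PyTorch 2.x)."""
    try:
        # optimize_ddp splits compiled graphs along DDP bucket boundaries;
        # only meaningful when there is more than one rank.
        torch._dynamo.config.optimize_ddp = world_size > 1
    except AttributeError:
        # Pre-2.0 PyTorch has no torch._dynamo: silently skip.
        pass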