[hotfix] fix unsafe async comm in zero (#4404)

* improve stability of zero

* fix wrong index

* add record stream
This commit is contained in:
LuGY
2023-08-11 15:09:24 +08:00
committed by GitHub
parent 6ccecc0c69
commit d86ddd9b29
3 changed files with 46 additions and 20 deletions

View File

@@ -137,7 +137,7 @@ def exam_zero_1_torch_ddp(world_size, dtype: torch.dtype):
zero_optimizer = LowLevelZeroOptimizer(zero_optimizer,
overlap_communication=True,
initial_scale=1,
reduce_bucket_size=262144)
reduce_bucket_size=1024 * 1024)
torch_optimizer = torch.optim.SGD(torch_model.parameters(), lr=1)