From f1fa1a675fbf54a389abd198d0e68edae431092e Mon Sep 17 00:00:00 2001
From: ver217
Date: Mon, 18 Apr 2022 14:07:39 +0800
Subject: [PATCH] fix grad offload when enabling reuse_fp16_shard

---
 colossalai/zero/sharded_optim/sharded_optim_v2.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/colossalai/zero/sharded_optim/sharded_optim_v2.py b/colossalai/zero/sharded_optim/sharded_optim_v2.py
index c4fbf1b7c..5649ba521 100644
--- a/colossalai/zero/sharded_optim/sharded_optim_v2.py
+++ b/colossalai/zero/sharded_optim/sharded_optim_v2.py
@@ -290,6 +290,9 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
                 if p.colo_attr.saved_grad.is_null():
                     continue
                 p.colo_attr.saved_grad.trans_state(TensorState.COMPUTE)
+                # If reuse_fp16_shard is enabled, an fp16 grad that wasn't offloaded may have been evicted to CPU
+                if not p.colo_attr.offload_grad:
+                    colo_model_data_tensor_move_inline(p.colo_attr.grad_payload, torch.cuda.current_device())
                 # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful infomation
                 # If we change p.grad directly
                 # it may raise error because of different shape/dtype/device of p.data and p.grad
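
Note (illustration, not part of the patch): the added lines ensure that when reuse_fp16_shard is enabled and the fp16 gradient shard has been evicted to CPU by the memory manager, a gradient that is not configured for CPU offload is moved back to the current CUDA device before the optimizer reads it. Below is a minimal self-contained sketch of that behaviour in plain PyTorch; ColoAttr and maybe_restore_grad_to_cuda are hypothetical stand-ins for the real p.colo_attr attributes and colo_model_data_tensor_move_inline, not ColossalAI APIs.

import torch


class ColoAttr:
    """Hypothetical stand-in for the per-parameter attributes used in the patch."""

    def __init__(self, grad_payload: torch.Tensor, offload_grad: bool):
        self.grad_payload = grad_payload  # fp16 gradient shard, may live on CPU or GPU
        self.offload_grad = offload_grad  # True if the grad is meant to stay on CPU


def maybe_restore_grad_to_cuda(attr: ColoAttr) -> None:
    # Mirrors the patched logic: a grad that is NOT configured for CPU offload
    # but currently sits on CPU (e.g. evicted when reuse_fp16_shard is on) is
    # moved back to the current CUDA device before the optimizer step.
    if not attr.offload_grad and attr.grad_payload.device.type == 'cpu':
        attr.grad_payload.data = attr.grad_payload.data.to(
            torch.device('cuda', torch.cuda.current_device()))


if torch.cuda.is_available():
    attr = ColoAttr(torch.randn(4, dtype=torch.float16), offload_grad=False)
    maybe_restore_grad_to_cuda(attr)
    print(attr.grad_payload.device)  # cuda:<current device>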