[hotfix] fix CPUAdam kernel nullptr (#1410)

Author: ver217
Date: 2022-08-05 19:45:45 +08:00
Committed by: GitHub
Parent: 1e5eb0874c
Commit: 12b4887097
10 changed files with 58 additions and 125 deletions
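
In short: the patch drops the process-global CPU_ADAM_CNT id registry (along with the create_adam/destroy_adam calls and the simd_log flag) and binds one cpu_adam.CPUAdamOptimizer object to each HybridAdam instance, removing the id-based kernel lookup that the nullptr in the title apparently stemmed from. A minimal before/after sketch of the binding, assuming a from-source colossalai install so the cpu_adam extension is compiled; tensor shapes and hyperparameter values are illustrative only:

    import torch
    import cpu_adam  # C++ extension, built when colossalai is installed from source

    # old API (removed by this commit): kernel state addressed by a global integer id
    # cpu_adam.create_adam(opt_id, lr, beta1, beta2, eps, weight_decay, adamw_mode, simd_log)
    # cpu_adam.adam_update(opt_id, step, lr, beta1, beta2, eps, weight_decay,
    #                      bias_correction, param, grad, exp_avg, exp_avg_sq, loss_scale)

    # new API: each Python optimizer owns its kernel object directly
    lr, beta1, beta2, eps, weight_decay = 1e-3, 0.9, 0.999, 1e-8, 0.0
    op = cpu_adam.CPUAdamOptimizer(lr, beta1, beta2, eps, weight_decay, True)  # adamw_mode=True
    param = torch.zeros(4)
    grad, exp_avg, exp_avg_sq = torch.ones(4), torch.zeros(4), torch.zeros(4)
    op.step(1, lr, beta1, beta2, eps, weight_decay, True,  # step=1, bias_correction=True
            param, grad, exp_avg, exp_avg_sq, -1)          # trailing -1 mirrors the call in the diff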

colossalai/nn/optimizer/hybrid_adam.py

@@ -2,7 +2,6 @@ import torch
 from colossalai.utils import multi_tensor_applier
 from colossalai.registry import OPTIMIZERS
-from colossalai.nn.optimizer import CPU_ADAM_CNT
 from typing import Optional
 from .nvme_optimizer import NVMeOptimizer
@@ -68,13 +67,11 @@ class HybridAdam(NVMeOptimizer):
                  eps=1e-8,
                  weight_decay=0,
                  adamw_mode=True,
-                 simd_log=False,
                  nvme_offload_fraction: float = 0.0,
                  nvme_offload_dir: Optional[str] = None):
         default_args = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correction=bias_correction)
         super(HybridAdam, self).__init__(model_params, default_args, nvme_offload_fraction, nvme_offload_dir)
-        self.opt_id = CPU_ADAM_CNT()
         self.adamw_mode = adamw_mode
         try:
             import cpu_adam
@@ -82,17 +79,11 @@ class HybridAdam(NVMeOptimizer):
         except ImportError:
             raise ImportError('Please install colossalai from source code to use HybridAdam')
-        self.cpu_adam_op = cpu_adam
-        self.cpu_adam_op.create_adam(self.opt_id, lr, betas[0], betas[1], eps, weight_decay, adamw_mode, simd_log)
+        self.cpu_adam_op = cpu_adam.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay, adamw_mode)
         self.gpu_adam_op = colossal_C.multi_tensor_adam
         self._dummy_overflow_buf = torch.cuda.IntTensor([0])
-
-    def __del__(self):
-        super().__del__()
-        if getattr(self, 'cpu_adam_op', None):
-            self.cpu_adam_op.destroy_adam(self.opt_id)

     @torch.no_grad()
     def step(self, closure=None):
         loss = None
@@ -129,9 +120,9 @@ class HybridAdam(NVMeOptimizer):
                 assert state['exp_avg'].device.type == 'cpu', "exp_avg should stay on cpu"
                 assert state['exp_avg_sq'].device.type == 'cpu', "exp_avg should stay on cpu"
                 self._pre_update(p, 'exp_avg', 'exp_avg_sq')
-                self.cpu_adam_op.adam_update(self.opt_id, state['step'], group['lr'], beta1, beta2, group['eps'],
-                                             group['weight_decay'], group['bias_correction'], p.data, p.grad.data,
-                                             state['exp_avg'], state['exp_avg_sq'], -1)
+                self.cpu_adam_op.step(state['step'], group['lr'], beta1, beta2, group['eps'], group['weight_decay'],
+                                      group['bias_correction'], p.data, p.grad.data, state['exp_avg'],
+                                      state['exp_avg_sq'], -1)
                 self._post_update(p, 'exp_avg', 'exp_avg_sq')
             elif target_device.type == 'cuda':
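
For callers, the only visible signature change is the dropped simd_log keyword. A minimal end-to-end sketch, assuming colossalai is installed from source (so the cpu_adam and colossal_C extensions are compiled) and that HybridAdam is importable from colossalai.nn.optimizer as in this file's package:

    import torch
    from colossalai.nn.optimizer import HybridAdam

    model = torch.nn.Linear(16, 16)
    # simd_log is no longer accepted here after this commit
    optimizer = HybridAdam(model.parameters(), lr=1e-3, adamw_mode=True)

    loss = model(torch.randn(4, 16)).sum()
    loss.backward()
    optimizer.step()  # CPU params take the cpu_adam.CPUAdamOptimizer.step path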