Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-25 11:44:03 +00:00)

[hotfix] adapt ProcessGroup and Optimizer to ColoTensor (#1388)
@@ -143,9 +143,9 @@ class CPUAdam(NVMeOptimizer):
                     state['step'] = 0
 
                     # gradient momentums
-                    state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     # gradient variances
-                    state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     self._post_state_init(p)
 
                 state['step'] += 1
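Every hunk in this commit makes the same one-line change: optimizer state is now created with torch.zeros_like(p) instead of torch.zeros_like(p.data). The change reads as making sure the state buffers come out as ColoTensors: going through .data strips the tensor subclass, while passing the parameter itself lets zeros_like dispatch through the subclass's __torch_function__ and preserve it. A minimal sketch of that mechanism, using a hypothetical stand-in class rather than ColossalAI's ColoTensor:

    import torch

    # Hypothetical stand-in for ColoTensor: a plain torch.Tensor subclass
    # that keeps the default __torch_function__ behavior.
    class TaggedTensor(torch.Tensor):
        pass

    p = torch.randn(4).as_subclass(TaggedTensor)

    # zeros_like dispatches on its tensor argument, so the fresh state
    # buffer keeps the subclass type (and, for ColoTensor, its metadata).
    state = torch.zeros_like(p)
    print(type(state).__name__)    # TaggedTensor

The same reasoning applies to every hunk below.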
@@ -122,9 +122,9 @@ class FusedAdam(torch.optim.Optimizer):
                 # State initialization
                 if len(state) == 0:
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)
 
                 if p.dtype not in [torch.float16, torch.float32]:
                     raise RuntimeError('FusedAdam only support fp16 and fp32.')
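Aside (not part of the diff): for ordinary torch.nn.Parameter objects the rewrite should be behavior-preserving, since recent PyTorch builds disable __torch_function__ dispatch on Parameter; a quick sanity check:

    import torch

    p = torch.nn.Parameter(torch.randn(4))

    # Both calls return a plain fp32 zeros tensor with requires_grad=False,
    # so dropping ".data" changes nothing for vanilla parameters.
    a = torch.zeros_like(p.data)
    b = torch.zeros_like(p)
    print(type(a) is type(b), a.requires_grad, b.requires_grad)    # True False False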
@@ -162,9 +162,9 @@ class FusedLAMB(torch.optim.Optimizer):
                 # State initialization
                 if len(state) == 0:
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)
 
                 if p.dtype == torch.float16:
                     g_16.append(p.grad.data)
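The g_16.append(p.grad.data) context line belongs to the dtype bucketing that the fused optimizers use: tensors are grouped by dtype so a single multi-tensor kernel launch can update a whole bucket. Schematically (the p_16/m_16/v_16 lists are assumed from the surrounding apex-style code, not shown in this hunk):

    # fp16 bucket for one multi_tensor_applier launch (schematic):
    if p.dtype == torch.float16:
        g_16.append(p.grad.data)             # gradients
        p_16.append(p.data)                  # parameters
        m_16.append(state['exp_avg'])        # first moments
        v_16.append(state['exp_avg_sq'])     # second moments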
@@ -104,7 +104,7 @@ class FusedSGD(Optimizer):
                     # momentum application can be skipped in the main kernel.
                     if 'momentum_buffer' not in param_state:
                         first_run = True
-                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
+                        buf = param_state['momentum_buffer'] = torch.zeros_like(p)
                         momentums.append(buf)
                     else:
                         first_run = False
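For reference, the buffer initialized above feeds the standard SGD-with-momentum update, which the fused kernel performs on-GPU; a plain-PyTorch sketch of the math (ignoring dampening and Nesterov):

    import torch

    def sgd_momentum_step(p: torch.Tensor, grad: torch.Tensor,
                          buf: torch.Tensor, lr: float, momentum: float) -> None:
        # v <- mu * v + g ;  p <- p - lr * v
        buf.mul_(momentum).add_(grad)
        p.add_(buf, alpha=-lr)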
@@ -116,9 +116,9 @@ class HybridAdam(NVMeOptimizer):
                     state['step'] = 0
 
                     # gradient momentums
-                    state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     # gradient variances
-                    state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     self._post_state_init(p)
 
                 state['step'] += 1
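Note that both NVMe-backed optimizers (CPUAdam above and HybridAdam here) pass explicit dtype and device overrides: torch.zeros_like accepts both, so the fp32 state can be placed per parameter (target_device) even when the parameter itself is fp16. A quick illustration, with the device choice assumed for the sketch:

    import torch

    p = torch.randn(4, dtype=torch.float16)
    target_device = torch.device('cpu')    # assumed placement for the sketch

    # Same call shape as the hunks above: fp32 state on an explicit device.
    exp_avg = torch.zeros_like(p, dtype=torch.float, device=target_device)
    print(exp_avg.dtype, exp_avg.device)   # torch.float32 cpu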
@@ -67,9 +67,9 @@ class Lamb(Optimizer):
                 if len(state) == 0:
                     state['step'] = 0
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)
 
                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                 beta1, beta2 = group['betas']
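Just below this hunk the fresh buffers feed the standard Adam-style moment updates driven by group['betas']; the usual in-place idiom, shown as a sketch rather than a verbatim quote of the file:

    import torch

    def update_moments(exp_avg: torch.Tensor, exp_avg_sq: torch.Tensor,
                       grad: torch.Tensor, beta1: float, beta2: float) -> None:
        # m <- b1 * m + (1 - b1) * g ;  v <- b2 * v + (1 - b2) * g^2
        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)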