Hi. I want to report a strange behavior of Adam: a value in the running average of squared gradients suddenly turns negative after an iteration. Concretely, I added print statements to the step() method as follows:
class Adam(Optimizer):
    r"""Implements Adam algorithm.

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            print(self.state[group['params'][70]])  # print
            for idx, p in enumerate(group['params']):
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data, memory_format=torch.preserve_format)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    grad.add_(group['weight_decay'], p.data)

                if idx == 70:
                    print(grad)
                    print(beta2)
                    print(exp_avg_sq)

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

                if idx == 70:
                    print(exp_avg_sq)  # print

                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)

        print(self.state[self.param_groups[0]['params'][70]])  # print
        return loss
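For context, the line that updates exp_avg_sq cannot produce a negative entry on its own: with beta2 in [0, 1) and grad * grad >= 0, the result stays non-negative as long as the previous exp_avg_sq was non-negative. A tiny standalone sketch of just that update (the gradient values are made up), using the same PyTorch 1.4-style addcmul_ call as in the code above:

import torch

beta2 = 0.999
grad = torch.tensor([-0.0496, 0.1954])   # made-up gradient entries
exp_avg_sq = torch.zeros_like(grad)      # state starts at zero, as in the init above

# v <- beta2 * v + (1 - beta2) * grad * grad, the in-place update from step()
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
print(exp_avg_sq)                        # every entry is >= 0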
The first iteration looks normal:
{}
tensor([-0.0496, -0.6419, -0.6217, 0.2978, 0.1158, 1.1376, -0.8590, 0.6209],
device='cuda:0')
0.999
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([2.4593e-06, 4.1199e-04, 3.8654e-04, 8.8691e-05, 1.3419e-05, 1.2942e-03,
7.3781e-04, 3.8557e-04], device='cuda:0')
{'step': 1, 'exp_avg': tensor([-0.0050, -0.0642, -0.0622, 0.0298, 0.0116, 0.1138, -0.0859, 0.0621],
device='cuda:0'), 'exp_avg_sq': tensor([2.4593e-06, 4.1199e-04, 3.8654e-04, 8.8691e-05, 1.3419e-05, 1.2942e-03,
7.3781e-04, 3.8557e-04], device='cuda:0')}
The problem happens in the second iteration.
{'step': 1, 'exp_avg': tensor([-0.0050, -0.0642, -0.0622, 0.0298, 0.0116, 0.1138, -0.0859, 0.0621],
device='cuda:0'), 'exp_avg_sq': tensor([-2.3110e-04, 4.1199e-04, 3.8654e-04, 8.8691e-05, 1.3419e-05,
1.2942e-03, 7.3781e-04, 3.8557e-04], device='cuda:0')}
tensor([ 0.1954, 0.3056, 0.1321, 0.1082, -0.2105, -0.4178, -0.0048, -0.1158],
device='cuda:0')
0.999
tensor([-2.3110e-04, 4.1199e-04, 3.8654e-04, 8.8691e-05, 1.3419e-05,
1.2942e-03, 7.3781e-04, 3.8557e-04], device='cuda:0')
tensor([-1.9270e-04, 5.0496e-04, 4.0361e-04, 1.0032e-04, 5.7697e-05,
1.4675e-03, 7.3710e-04, 3.9860e-04], device='cuda:0')
{'step': 2, 'exp_avg': tensor([ 0.0151, -0.0272, -0.0427, 0.0376, -0.0106, 0.0606, -0.0778, 0.0443],
device='cuda:0'), 'exp_avg_sq': tensor([-1.9270e-04, 5.0496e-04, 4.0361e-04, 1.0032e-04, 5.7697e-05,
1.4675e-03, 7.3710e-04, 3.9860e-04], device='cuda:0')}
As you can see, the first value in exp_avg_sq has turned negative. It must happen at some point during the forward pass and gradient computation, but so far I have not been able to figure out where.
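In case it is useful, this is roughly the kind of check I can sprinkle between loss.backward() and optimizer.step() to narrow down where the sign flips; the helper below is only an illustration, not part of my actual training code:

def assert_second_moments_nonnegative(optimizer):
    # Scan every parameter's Adam state and fail as soon as any exp_avg_sq
    # entry has gone negative, so the corrupting operation can be bracketed.
    for group in optimizer.param_groups:
        for p in group['params']:
            state = optimizer.state.get(p, {})
            v = state.get('exp_avg_sq')
            if v is not None and (v < 0).any():
                raise RuntimeError('negative exp_avg_sq found: {}'.format(v))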
Could you please let me know if there is any scenario that could cause this problem?
My environment is PyTorch 1.4 with CUDA 10.1 on a Linux server.
Thank you.
EDIT 1: My project is large, and this might be a very specific corner case, so I don't think I can provide a minimal example that reproduces it.