Negative running average of squared gradients in Adam

Hi. I want to report a strange behavior of Adam: a value in the running average of squared gradients suddenly turns negative after an iteration. Concretely, I added print statements to the step function as follows:

import math

import torch
from torch.optim.optimizer import Optimizer


class Adam(Optimizer):
    r"""Implements Adam algorithm.

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            print(self.state[group['params'][70]])  # print
            for idx, p in enumerate(group['params']):
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data, memory_format=torch.preserve_format)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    grad.add_(group['weight_decay'], p.data)

                if idx == 70:
                    print(grad)        # print
                    print(beta2)       # print
                    print(exp_avg_sq)  # print

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                if idx == 70:
                    print(exp_avg_sq)  # print

                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)
        print(self.state[self.param_groups[0]['params'][70]])  # print
        return loss

The first iteration looks normal:

{}
tensor([-0.0496, -0.6419, -0.6217,  0.2978,  0.1158,  1.1376, -0.8590,  0.6209],
       device='cuda:0')
0.999
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([2.4593e-06, 4.1199e-04, 3.8654e-04, 8.8691e-05, 1.3419e-05, 1.2942e-03,
        7.3781e-04, 3.8557e-04], device='cuda:0')
{'step': 1, 'exp_avg': tensor([-0.0050, -0.0642, -0.0622,  0.0298,  0.0116,  0.1138, -0.0859,  0.0621],
       device='cuda:0'), 'exp_avg_sq': tensor([2.4593e-06, 4.1199e-04, 3.8654e-04, 8.8691e-05, 1.3419e-05, 1.2942e-03,
        7.3781e-04, 3.8557e-04], device='cuda:0')}

The problem happens in the second iteration:

{'step': 1, 'exp_avg': tensor([-0.0050, -0.0642, -0.0622,  0.0298,  0.0116,  0.1138, -0.0859,  0.0621],
       device='cuda:0'), 'exp_avg_sq': tensor([-2.3110e-04,  4.1199e-04,  3.8654e-04,  8.8691e-05,  1.3419e-05,
         1.2942e-03,  7.3781e-04,  3.8557e-04], device='cuda:0')}
tensor([ 0.1954,  0.3056,  0.1321,  0.1082, -0.2105, -0.4178, -0.0048, -0.1158],
       device='cuda:0')
0.999
tensor([-2.3110e-04,  4.1199e-04,  3.8654e-04,  8.8691e-05,  1.3419e-05,
         1.2942e-03,  7.3781e-04,  3.8557e-04], device='cuda:0')
tensor([-1.9270e-04,  5.0496e-04,  4.0361e-04,  1.0032e-04,  5.7697e-05,
         1.4675e-03,  7.3710e-04,  3.9860e-04], device='cuda:0')
{'step': 2, 'exp_avg': tensor([ 0.0151, -0.0272, -0.0427,  0.0376, -0.0106,  0.0606, -0.0778,  0.0443],
       device='cuda:0'), 'exp_avg_sq': tensor([-1.9270e-04,  5.0496e-04,  4.0361e-04,  1.0032e-04,  5.7697e-05,
         1.4675e-03,  7.3710e-04,  3.9860e-04], device='cuda:0')}
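For what it's worth, the numbers after the second update are consistent with the usual formula applied to the already-negative stored value, so the arithmetic inside step() itself seems fine. A quick check with values copied from the printout above (again only accurate to printing precision):

beta2 = 0.999
v_prev = -2.3110e-04   # stored exp_avg_sq[0], already negative when step() starts
g0 = 0.1954            # grad[0] in the second iteration

v_new = beta2 * v_prev + (1 - beta2) * g0 * g0
print(v_new)  # ~ -1.927e-04, matching the printed exp_avg_sq[0] after the update

If the stored value had still been the 2.4593e-06 from the first step, this would come out to roughly 4.06e-05 instead.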

As you can see, the first value in exp_avg_sq has turned negative. In fact, the state is already negative when step() is entered for the second time (the first print above), so it must get overwritten some time during the forward pass and gradient calculation, but I have not been able to figure out where yet.
Could you please let me know if there is any scenario that could cause this problem?
My environment is PyTorch 1.4 and CUDA 10.1 on a Linux server.
Thank you.

EDIT1: My project is large, and this might be a very specific corner case, so I don’t think I can provide a minimal example to reproduce it.

Hi,

Did you change the beta values from the default by any chance?

Thanks for your response. No, I used all the default hyperparameters except for lr.

I guess you can add a bit more printing in the Adam step function to see exactly where the negative value appears.
My guess is that it is due to a numerical precision problem…
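For example, something along these lines; here I put the check outside the optimizer so it can also run between the forward and backward passes, and check_state plus the commented loop structure are just a sketch you would need to adapt to your training code:

import torch

def check_state(optimizer, tag):
    # Scan every parameter's Adam state for negative or non-finite exp_avg_sq entries.
    for group in optimizer.param_groups:
        for p in group['params']:
            state = optimizer.state.get(p, {})
            v = state.get('exp_avg_sq')
            if v is not None and ((v < 0).any() or not torch.isfinite(v).all()):
                print(tag, 'bad exp_avg_sq:', v)

# Hypothetical training loop, only to show where the checks would go:
#
#     check_state(optimizer, 'before forward')
#     loss = criterion(model(inputs), targets)
#     check_state(optimizer, 'after forward')
#     optimizer.zero_grad()
#     loss.backward()
#     check_state(optimizer, 'after backward')
#     optimizer.step()
#     check_state(optimizer, 'after step')

If 'after step' is clean but the check at the start of the next iteration is not, then something outside the optimizer is writing into that state tensor.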