Hi,
I have SharedAdam code written for PyTorch 0.1.12 and I have ported it to PyTorch 1.6.
Can someone please check whether the migration is OK?
PyTorch 0.1.12 code:
import math
import torch
import torch.optim as optim

# Implementing the Adam optimizer with shared states
class SharedAdam(optim.Adam): # object that inherits from optim.Adam
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)  # inheriting the tools of optim.Adam
        for group in self.param_groups:  # self.param_groups contains all the attributes of the optimizer, including the parameters to optimize (the weights of the network) contained in self.param_groups['params']
            for p in group['params']:  # for each tensor p of weights to optimize
                state = self.state[p]  # at the beginning, self.state is an empty dictionary so state = {} and self.state = {p: {}} = {p: state}
                state['step'] = torch.zeros(1)  # counting the steps: state = {'step': tensor([0])}
                state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()  # the Adam update is based on an exponential moving average of the gradient (moment 1)
                state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()  # the Adam update is also based on an exponential moving average of the square of the gradient (moment 2)

    # Sharing the memory
    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()  # tensor.share_memory_() acts a little bit like tensor.cuda()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    # Performing a single optimization step of the Adam algorithm (see Algorithm 1 in https://arxiv.org/pdf/1412.6980.pdf)
    def step(self):
        loss = None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'][0]
                bias_correction2 = 1 - beta2 ** state['step'][0]
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(-step_size, exp_avg, denom)
        return loss
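For reference, the main API difference the migration has to handle is the argument order of the in-place tensor ops used in step(): the scalar moved from the first positional argument to a keyword argument. A quick sketch of the old vs. new call forms (x and g are just example tensors, not from the code above):

import torch

x = torch.zeros(3)
g = torch.ones(3)

# 0.1.12 style (scalar first; deprecated in recent releases):
#   x.add_(0.1, g)
#   x.addcmul_(0.1, g, g)
#   x.addcdiv_(-0.1, g, g + 1)

# 1.6 style (tensors first, scalar as a keyword argument):
x.add_(g, alpha=0.1)               # x += 0.1 * g
x.addcmul_(g, g, value=0.1)        # x += 0.1 * g * g
x.addcdiv_(g, g + 1, value=-0.1)   # x += -0.1 * g / (g + 1)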
PyTorch 1.6 code:
import math
import torch
import torch.optim as optim

class SharedAdam(optim.Adam):
    """Implements Adam algorithm with shared states."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)                  # kept as a tensor so it can live in shared memory
                state['exp_avg'] = torch.zeros_like(p.data)     # instead of the deprecated p.data.new().resize_as_(p.data).zero_()
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                step = state['step'].item()                     # convert the shared step tensor to a Python number for the scalar math
                bias_correction1 = 1 - beta1 ** step
                bias_correction2 = 1 - beta2 ** step
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss
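For context, the optimizer is meant to be shared across worker processes (A3C-style). Below is a rough usage sketch only; the Linear model, the number of workers, and the train function are placeholders, not part of the actual project:

import torch.multiprocessing as mp
import torch.nn as nn

def train(rank, model, optimizer):
    # each worker would compute gradients on the shared model and call optimizer.step()
    pass

if __name__ == '__main__':
    model = nn.Linear(10, 2)                               # placeholder model
    model.share_memory()                                   # put the model parameters in shared memory
    optimizer = SharedAdam(model.parameters(), lr=1e-3)
    optimizer.share_memory()                               # put step / exp_avg / exp_avg_sq in shared memory

    processes = []
    for rank in range(4):
        p = mp.Process(target=train, args=(rank, model, optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()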