Hi,

I have a sharedAdam code for pytorch 0.1.12 and I have changed it to pytorch 1.6.

Pytorch 0.1.12 code

import math
import torch
import torch.optim as optim

# Implementing the Adam optimizer with shared states

``````def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) # inheriting from the tools of optim.Adam
for group in self.param_groups: # self.param_groups contains all the attributes of the optimizer, including the parameters to optimize (the weights of the network) contained in self.param_groups['params']
for p in group['params']: # for each tensor p of weights to optimize
state = self.state[p] # at the beginning, self.state is an empty dictionary so state = {} and self.state = {p:{}} = {p: state}
state['step'] = torch.zeros(1) # counting the steps: state = {'step' : tensor([0])}
state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() # the update of the adam optimizer is based on an exponential moving average of the gradient (moment 1)
state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() # the update of the adam optimizer is also based on an exponential moving average of the squared of the gradient (moment 2)

# Sharing the memory
def share_memory(self):
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
state['step'].share_memory_() # tensor.share_memory_() acts a little bit like tensor.cuda()
state['exp_avg'].share_memory_() # tensor.share_memory_() acts a little bit like tensor.cuda()
state['exp_avg_sq'].share_memory_() # tensor.share_memory_() acts a little bit like tensor.cuda()

# Performing a single optimization step of the Adam algorithm (see algorithm 1 in https://arxiv.org/pdf/1412.6980.pdf)
def step(self):
loss = None
for group in self.param_groups:
for p in group['params']:
continue
state = self.state[p]
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
if group['weight_decay'] != 0:
bias_correction1 = 1 - beta1 ** state['step'][0]
bias_correction2 = 1 - beta2 ** state['step'][0]
step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
return loss
``````

Pytorch 1.6 Code-

“”“Implements Adam algorithm with shared states.
“””

``````def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
weight_decay=0):
super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)

for group in self.param_groups:
for p in group['params']:
state = self.state[p]
state['step'] = torch.zeros(1)
state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()

def share_memory(self):
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
state['step'].share_memory_()
state['exp_avg'].share_memory_()
state['exp_avg_sq'].share_memory_()

def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()

for group in self.param_groups:
for p in group['params']:
continue
state = self.state[p]

exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']

state['step'] += 1

if group['weight_decay'] != 0:

# Decay the first and second moment running average coefficient