Loss is nan after optimizer.step

Hey,
I’m trying to use the following optimizer that I implemented (RMSprop), but after the first step of the optimizer, the loss calculated in my main is nan.

  def __init__(self, parameters, lr=0.001, beta=0.999, epsilon=sys.float_info.epsilon):
    self.layers_data_list = []
    self.epsilon = epsilon
    self.beta = beta
    for layer_params in list(parameters):
      # per-parameter state: the parameter itself, its lr, and the running average
      layer_dict = dict()
      layer_dict['params'] = layer_params
      layer_dict['lr'] = lr
      layer_dict['average_derrevative'] = None
      self.layers_data_list.append(layer_dict)

  def step(self):
    for layer_data in self.layers_data_list:
      p = layer_data['params']
      lr = layer_data['lr']
      d_p = p.grad.data
      if layer_data['average_derrevative'] is None:
        # first step: initialize the running average from the current gradient
        layer_data['average_derrevative'] = torch.clone(d_p).detach()
      else:
        # running average of squared gradients: avg = beta * avg + (1 - beta) * grad^2
        layer_data['average_derrevative'].mul_(self.beta).addcmul_(1 - self.beta, d_p, d_p)

      avg_root = torch.clone(layer_data['average_derrevative']).detach().sqrt().add_(self.epsilon)  # tried to add .clone().detach(), didn't help...
      layer_data['params'].data.addcdiv_(-lr, d_p, avg_root)
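
For context, this is roughly how the optimizer is driven from my main; a minimal sketch with stand-ins, assuming the class above is called MyRMSProp (the real model, data, and loss function are different):

import torch
import torch.nn as nn

model = nn.Linear(10, 1)                    # stand-in for the real model
criterion = nn.MSELoss()                    # stand-in for the real loss
optimizer = MyRMSProp(model.parameters())   # the class defined above

x = torch.randn(32, 10)
y = torch.randn(32, 1)

for _ in range(5):
  model.zero_grad()                         # clear old gradients
  loss = criterion(model(x), y)
  loss.backward()
  optimizer.step()
  print(loss.item())                        # turns into nan after the first step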

Not sure why, but replacing the following line:
layer_data['average_derrevative'] = torch.clone(d_p).detach()

with the following:
layer_data['average_derrevative'] = torch.zeros_like(layer_data['params'].data)

solved my problem. To be honest, I’m not sure why the first version was a mistake. If someone can explain it, that would be great.
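
For completeness, here is the whole class as it runs for me now; a minimal sketch with only that one line swapped (MyRMSProp is just the name I use locally, and I’ve written the addcmul_/addcdiv_ calls with the value= keyword, since passing the scalar positionally is deprecated in current PyTorch):

import sys
import torch

class MyRMSProp:
  def __init__(self, parameters, lr=0.001, beta=0.999, epsilon=sys.float_info.epsilon):
    self.layers_data_list = []
    self.epsilon = epsilon
    self.beta = beta
    for layer_params in list(parameters):
      layer_dict = dict()
      layer_dict['params'] = layer_params
      layer_dict['lr'] = lr
      layer_dict['average_derrevative'] = None
      self.layers_data_list.append(layer_dict)

  def step(self):
    for layer_data in self.layers_data_list:
      p = layer_data['params']
      lr = layer_data['lr']
      d_p = p.grad.data
      if layer_data['average_derrevative'] is None:
        # the changed line: start the running average of squared gradients at zero
        layer_data['average_derrevative'] = torch.zeros_like(layer_data['params'].data)
      else:
        # avg = beta * avg + (1 - beta) * grad^2
        layer_data['average_derrevative'].mul_(self.beta).addcmul_(d_p, d_p, value=1 - self.beta)

      avg_root = torch.clone(layer_data['average_derrevative']).detach().sqrt().add_(self.epsilon)
      # param = param - lr * grad / (sqrt(avg) + eps)
      layer_data['params'].data.addcdiv_(d_p, avg_root, value=-lr)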