Getting an error when using a scheduler for the learning rate

Hello all, I get an error when using two schedulers to automate the learning-rate changes for two optimizers. One of the schedulers raises an error and I have no clue why. Can you help? Thanks.

import torch.nn as nn
import torch
import numpy as np
import torchvision
from torch.autograd import Variable, Function
from torch.optim.lr_scheduler import ReduceLROnPlateau

train_data = torchvision.datasets.MNIST(
    root='/home/data/input/mnist',
    transform=torchvision.transforms.ToTensor(),
    download=True,
)

# hyper parameters
batch_size = 100 # batch size of images
img_h = 28
Dic_size = 50
ld = 4 # sparse penalty
lr = 0.0001 # learning rate
mom = 0.2
EPOCH = 100

# depends on size of the dictionary, number of atoms.
D = Variable(torch.from_numpy(np.random.normal(0,1,(Dic_size,img_h,img_h))).type(torch.FloatTensor), requires_grad=True)

# hx sparse representation
ht = Variable(torch.from_numpy(np.random.normal(0,1,(batch_size,Dic_size,1,1))).type(torch.FloatTensor), requires_grad=True)

# Dictionary loss function
def loss_Dictionary(x,D,ht):
    holder = []
    for i in range(len(ht)):
        holder.append((0.5*torch.norm((x[i]-(D*ht[i]).sum(dim=0)),p=2)**2))
    return torch.mean(torch.stack(holder))

# customized shrink (soft-threshold) applied to ht: sign(x) * max(|x| - lr*ld, 0)
shrink_ht = lambda x: torch.stack([torch.sign(i)*torch.clamp(torch.abs(i)-lr*ld, min=0) for i in x])

optimizer_ht = torch.optim.RMSprop([ht], lr=lr)
optimizer_D = torch.optim.RMSprop([D], lr=lr)

scheduler_ht = ReduceLROnPlateau(optimizer_ht,'min')
scheduler_D = ReduceLROnPlateau(optimizer_D,'min')

## random selection of the number
np.random.seed(0)
sample_holder = []
sampler = np.random.choice(60000,100,replace=False)
for sam_id in sampler:
    sample_holder.append(train_data.__getitem__(sam_id)[0][0])
train_batch = torch.stack(sample_holder, dim=0).view(-1,28,28)  
x = Variable(train_batch.view(-1,28,28))

## optimization step
for i in range(EPOCH):
    ## image update sparse representation
    holder_loss_ht = [] # to hold sparse representation loss value
    for idx in range(len(x)): 
        optimizer_ht.zero_grad() # clear up gradients
        loss_ht = 0.5*torch.norm((x[idx]-(D*ht[idx]).sum(dim=0)),p=2)**2
        loss_ht.backward() # backpropagation and calculate gradients
        optimizer_ht.step() # update parameters with gradients
        ht.data[idx] = shrink_ht(ht.data[idx])  # customized shrink function. update the data (ht.data) only.
        holder_loss_ht.append(loss_ht.data)
        
    val_ht_loss = torch.mean(torch.stack(holder_loss_ht))    
    scheduler_ht.step(val_ht_loss)
    print('Epoc: ', i, ' Sparse loss: ', val_ht_loss)
  
        
    ## batch update from resconstruction
    optimizer_D.zero_grad() # clear up gradients
    loss_D = loss_Dictionary(x, D, ht) # x and ht are batch
    loss_D.backward() # backpropagation and calculate gradients
    optimizer_D.step() # update parameters with gradients
    scheduler_D.step(loss_D)
    ## normalize it unit norm
    for ii in range(len(D)):
        D.data[ii,:,:] = D.data[ii,:,:]/torch.norm(D.data[ii,:,:],p=2) # Atom normalization.
        #D.data[:,ii,ii] = D.data[:,ii,ii]/torch.norm(D.data[:,ii,ii],p=2) # column normalization - cross features.
    print('Epoc: ', i, ' Dictionary loss: ', loss_D)

I get the error on scheduler_D.step(loss_D), as shown below:

Epoc: 0 Sparse loss: 1.592491530776024

RuntimeError Traceback (most recent call last)
in ()
27 loss_D.backward() # back propogation and calculate gradients
28 optimizer_D.step() # update parameters with gradients
---> 29 scheduler_D.step(loss_D)
30 ## normalize it unit norm
31 for ii in range(len(D)):

/home/miniconda3/lib/python3.6/site-packages/torch/optim/lr_scheduler.py in step(self, metrics, epoch)
294 self.last_epoch = epoch
295
--> 296 if self.is_better(current, self.best):
297 self.best = current
298 self.num_bad_epochs = 0

/home/miniconda3/lib/python3.6/site-packages/torch/optim/lr_scheduler.py in <lambda>(a, best)
330 if mode == 'min' and threshold_mode == 'rel':
331 rel_epsilon = 1. - threshold
--> 332 self.is_better = lambda a, best: a < best * rel_epsilon
333 self.mode_worse = float('Inf')
334 elif mode == 'min' and threshold_mode == 'abs':

RuntimeError: value cannot be converted to type float without overflow: inf

Could you print loss_D just before passing it to the scheduler?
It seems loss_D is inf.
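For example, something along these lines right before the scheduler call (using the names from your code):

print('loss_D:', loss_D)                # the Variable wrapping the loss
print('loss_D value:', loss_D.data[0])  # the underlying Python float
scheduler_D.step(loss_D)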

It is not infinite. The code also runs fine without the scheduler. Thanks.

loss_D: Variable containing:
39222.4219
[torch.FloatTensor of size 1]

val_ht_loss = torch.mean(torch.stack(holder_loss_ht))
scheduler_ht.step(val_ht_loss)
print('Epoc: ', i, ' Sparse loss: ', val_ht_loss)

-->

val_ht_loss = torch.mean(torch.stack(holder_loss_ht))
scheduler_ht.step(val_ht_loss.data[0])
print('Epoc: ', i, ' Sparse loss: ', val_ht_loss)
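The scheduler expects a plain number as the metric; passing the Variable makes its internal comparison against the initial best of float('Inf') fail, which matches the overflow error in your traceback. On newer PyTorch releases (0.4+), where Variable and Tensor are merged, the equivalent fix would use .item() (a sketch, not tested against your exact setup):

scheduler_ht.step(val_ht_loss.item())  # .item() extracts the Python float
scheduler_D.step(loss_D.item())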

Thanks mate! It works. loss_D is also a Variable, so I needed to pass .data into the scheduler as well: scheduler_D.step(loss_D.data[0]).
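So the dictionary part of the loop now ends with:

optimizer_D.step()                # update parameters with gradients
scheduler_D.step(loss_D.data[0])  # pass the plain float, not the Variable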

The pleasure is all mine :slight_smile:
