# Retain_variables in the loss function

Hi everyone,

I am playing with an RNN in PyTorch. I made the following example:

``````
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import numpy as np
import matplotlib.pyplot as plt

batch_size = 10
n_examples = 200
sequence_len = 15  # This is equivalent to time steps of the sequence in keras
sequence_size = 13
hidden_size = 30

class my_rnn(nn.Module):
    def __init__(self, input_size=2, hidden_size=20, num_layers=3, output_size=1,
                 batch_size=10):
        super(my_rnn, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.batch_size = batch_size

        self.rnn = nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size,
                          num_layers=self.num_layers, batch_first=True)
        # The last layer is applied on the last output only of the RNN (not like
        # TimeDistributedDense in Keras)
        self.linear_layer = nn.Linear(self.hidden_size, self.output_size)

        self.hidden = Variable(torch.zeros((self.num_layers, self.batch_size, self.hidden_size)))

    def forward(self, input_sequence):
        out_rnn, self.hidden = self.rnn(input_sequence, self.hidden)
        in_linear = out_rnn[:, -1, :]
        final_output = self.linear_layer(in_linear)
        return final_output

    def init_hidden(self):
        self.hidden = Variable(torch.zeros((self.num_layers, self.batch_size, self.hidden_size)))

rnn = my_rnn(input_size=sequence_size, hidden_size=hidden_size, num_layers=3, batch_size=batch_size)
#input_data = Variable(torch.randn((batch_size, sequence_len, sequence_size)))
h0 = Variable(torch.zeros((3, batch_size, hidden_size)))
demo_target = Variable(torch.randn((batch_size, 1)))

loss_fn = nn.MSELoss()
optimizer = optim.SGD(rnn.parameters(), lr=0.001, momentum=0.9)
for i in range(10):
    input_data = Variable(torch.randn((batch_size, sequence_len, sequence_size)))
    output = rnn(input_data)
    #print output
    loss = loss_fn(output, demo_target)
    loss.backward()
    #loss.backward(retain_variables=True)
    optimizer.step()
    print loss.data
``````

However, this gives me the following error:
`RuntimeError: Trying to backward through the graph second time, but the buffers have already been freed. Please specify retain_variables=True when calling backward for the first time.`

When I set `retain_variables=True` in the call to `backward()`, it seems to work (though I am not yet sure whether it trains correctly).

My questions are:

1. Why do I get this error on the second iteration of the loop?
2. Is setting `retain_variables=True` the correct fix, or just a workaround that hides the real problem?

Thank you!

I guess the problem is caused by `self.hidden`. You should build a new Variable for the hidden data and feed it to the network in every forward step. For example:

``````
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import numpy as np
import matplotlib.pyplot as plt

batch_size = 10
n_examples = 200
sequence_len = 15  # This is equivalent to time steps of the sequence in keras
sequence_size = 13
hidden_size = 30

class my_rnn(nn.Module):
    def __init__(self, input_size=2, hidden_size=20, num_layers=3, output_size=1,
                 batch_size=10):
        super(my_rnn, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.batch_size = batch_size

        self.rnn = nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size,
                          num_layers=self.num_layers, batch_first=True)
        # The last layer is applied on the last output only of the RNN (not like
        # TimeDistributedDense in Keras)
        self.linear_layer = nn.Linear(self.hidden_size, self.output_size)

        self.hidden = Variable(torch.zeros((self.num_layers, self.batch_size, self.hidden_size)))

    def forward(self, input_sequence):
        out_rnn, self.hidden = self.rnn(input_sequence, self.hidden)
        in_linear = out_rnn[:, -1, :]
        final_output = self.linear_layer(in_linear)
        return final_output

    def init_hidden(self):
        self.hidden = Variable(torch.zeros((self.num_layers, self.batch_size, self.hidden_size)))

rnn = my_rnn(input_size=sequence_size, hidden_size=hidden_size, num_layers=3, batch_size=batch_size)
#input_data = Variable(torch.randn((batch_size, sequence_len, sequence_size)))
h0 = Variable(torch.zeros((3, batch_size, hidden_size)))
demo_target = Variable(torch.randn((batch_size, 1)))

loss_fn = nn.MSELoss()
optimizer = optim.SGD(rnn.parameters(), lr=0.001, momentum=0.9)
for i in range(10):
    input_data = Variable(torch.randn((batch_size, sequence_len, sequence_size)))
    rnn.init_hidden()  # init hidden Variable before every forward step
    output = rnn(input_data)
    #print output
    loss = loss_fn(output, demo_target)
    loss.backward()
    #loss.backward(retain_variables=True)
    optimizer.step()
    print loss.data
``````

However, the better way is to keep the hidden data, rebuild a Variable containing that data, and feed it to the forward function again. Like this (a minimal sketch, assuming `forward` is changed to take and return the hidden state, as in the next post):
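``````
hidden = rnn.init_hidden()
for i in range(10):
    input_data = Variable(torch.randn((batch_size, sequence_len, sequence_size)))
    output, hidden = rnn(input_data, hidden)  # forward takes and returns hidden
    hidden = Variable(hidden.data)            # re-wrap the data in a fresh Variable
    loss = loss_fn(output, demo_target)
    optimizer.zero_grad()                     # clear gradients from the previous step
    loss.backward()
    optimizer.step()
``````

This keeps the hidden values across batches while cutting the autograd graph, so `backward()` never reaches into a previous iteration.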


Thank you for your reply @cyyyyc123. I tried moving the `hidden` matrix out of the class, as you mentioned, but the problem is still the same.

This is my new code:

``````
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import numpy as np
import matplotlib.pyplot as plt

batch_size = 32
n_examples = 200
sequence_len = 30  # This is equivalent to time steps of the sequence in keras
sequence_size = 1
hidden_size = 50
nb_layers = 2
target_size = 1

# Generate noisy sine-wave
# When I want to complexify (many to one), just swap the x and y
NSAMPLE = 100000
f = 2  # the frequency of the signal
x_data = np.float32(np.arange(NSAMPLE))
r_data = np.float32(np.random.uniform(-0.2, 0.2, NSAMPLE))
y_data = np.float32(np.sin(2 * np.pi * f * (x_data / NSAMPLE)) + r_data)

# Build the training data
X = []
y = []
for i in range(0, y_data.shape[0], sequence_len):
    if i + sequence_len < y_data.shape[0]:
        X.append(x_data[i:i+sequence_len])
        y.append(y_data[i+sequence_len])  # next point
X = np.array(X)
y = np.array(y)

class my_rnn_1(nn.Module):
    def __init__(self, input_size=2, hidden_size=20, num_layers=3, output_size=1,
                 batch_size=10):
        super(my_rnn_1, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.batch_size = batch_size

        self.rnn = nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size,
                          num_layers=self.num_layers, batch_first=True)
        # The last layer is applied on the last output only of the RNN (not like
        # TimeDistributedDense in Keras)
        self.linear_layer = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input_sequence, hidden):
        out_rnn, hidden = self.rnn(input_sequence, hidden)
        in_linear = out_rnn[:, -1, :]
        final_output = self.linear_layer(in_linear)
        return final_output, hidden

    def init_hidden(self):
        return Variable(torch.zeros((self.num_layers, self.batch_size, self.hidden_size))).cuda()

def get_batch(X, Y, i, evaluation=False):
    global batch_size
    seq_len = min(batch_size, len(X) - 1 - i)
    data = X[i:i+seq_len, :]
    data = data.view(data.size(0), data.size(1), 1)
    target = Y[i+1:i+1+seq_len].view(-1, 1)
    return data, target

rnn = my_rnn_1(input_size=sequence_size, hidden_size=hidden_size, num_layers=3, batch_size=batch_size).cuda()
demo_target = Variable(torch.randn((batch_size, 1))).cuda()
X = Variable(torch.FloatTensor(X)).cuda()
y = Variable(torch.FloatTensor(y)).cuda()
loss_fn = nn.MSELoss()
# optimizer = optim.SGD(rnn.parameters(), lr=0.0001, momentum=0.9)
optimizer = optim.RMSprop(rnn.parameters())

for epoch in range(20):
    hidden = rnn.init_hidden()
    total_loss = 0
    for batch, i in enumerate(range(0, X.size(0) - 1, batch_size)):
        data, targets = get_batch(X, y, i)
        if data.size(0) < batch_size:
            break
        output, hidden = rnn(data, hidden)
        #print output
        loss = loss_fn(output, targets)
        loss.backward()
        # loss.backward(retain_variables=True)
        optimizer.step()
        total_loss += loss.data
        if i % 10 == 0:
            print "Loss = ", total_loss
    print "Epoch " + str(epoch) + " -- loss = " + str(total_loss)
    print "-" * 100
``````

This still gives the same error:

``````
Traceback (most recent call last):
  File "/localdata/mohammom/gipsa-lig/experiments/tensorflow_tutorial/temp.py", line 132, in <module>
    loss.backward()
  File "/localdata/mohammom/anaconda2/lib/python2.7/site-packages/torch/autograd/variable.py", line 146, in backward
  File "/localdata/mohammom/anaconda2/lib/python2.7/site-packages/torch/autograd/function.py", line 209, in _do_backward
  File "/localdata/mohammom/anaconda2/lib/python2.7/site-packages/torch/autograd/function.py", line 217, in backward
  File "/localdata/mohammom/anaconda2/lib/python2.7/site-packages/torch/nn/_functions/rnn.py", line 275, in backward_extended
    input, hx, weight, output = self.saved_tensors
  File "/localdata/mohammom/anaconda2/lib/python2.7/site-packages/torch/autograd/function.py", line 235, in saved_tensors
    flat_tensors = super(NestedIOFunction, self).saved_tensors
RuntimeError: Trying to backward through the graph second time, but the buffers have already been freed. Please specify retain_variables=True when calling backward for the first time.
``````

Just a one-line change: move

``````
hidden = rnn.init_hidden()
``````

into the inner loop.
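That is, roughly:

``````
for batch, i in enumerate(range(0, X.size(0) - 1, batch_size)):
    hidden = rnn.init_hidden()  # fresh hidden state per batch, with no history
    data, targets = get_batch(X, y, i)
    if data.size(0) < batch_size:
        break
    output, hidden = rnn(data, hidden)
    loss = loss_fn(output, targets)
    loss.backward()
    optimizer.step()
    total_loss += loss.data
``````

This way, each `backward()` only sees the graph built by the current batch.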

Your linear layer is not doing the same thing as a TimeDistributedDense in Keras. You are only using the last time step and ditching everything else.

Have a look at my TimeDistributed wrapper here:
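The general idea of such a wrapper is to fold the time dimension into the batch dimension, apply the wrapped module to every step, and unfold again. A minimal sketch of that idea (not necessarily the linked code):

``````
class TimeDistributed(nn.Module):
    """Applies `module` to every time step of a (batch, time, features) input."""
    def __init__(self, module):
        super(TimeDistributed, self).__init__()
        self.module = module

    def forward(self, x):
        batch, time = x.size(0), x.size(1)
        # Merge batch and time dims, apply the module, then split them again
        out = self.module(x.contiguous().view(batch * time, -1))
        return out.view(batch, time, -1)
``````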

@cyyyyc123 Should I reset the hidden state of the RNN after every batch?
In the example you mentioned before (word_language_model), the hidden state is reset at the beginning of each epoch.

@miguelvr Thank you for mentioning this issue; I am aware of it.
I took a look at your TimeDistributed wrapper implementation, many thanks! Very helpful.

It depends on your purpose: if you want the hidden state to keep information from the whole epoch, you can initialize it once at the beginning of the epoch; if you want every mini-batch in an epoch to start from the same initial hidden state, you should initialize it at the beginning of every mini-batch. However, the reason your code fails while the reference code succeeds is the use of `hidden = repackage_hidden(hidden)` in the reference code: it reconstructs a fresh Variable for the next iteration, detached from the old graph. You can add the line

``````
hidden = Variable(hidden.data)
``````

below

``````
output, hidden = rnn(data, hidden)
``````
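For reference, `repackage_hidden` in the word_language_model example is essentially the following (it recurses so it also handles the LSTM case, where the hidden state is a tuple):

``````
def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)
``````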

Aha, I see the problem now. Thank you for your time @cyyyyc123.