Hi everyone,
I am playing with an RNN in PyTorch and made the following example:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
batch_size = 10
n_examples = 200
sequence_len = 15  # This is equivalent to the time steps of the sequence in Keras
sequence_size = 13
hidden_size = 30
class my_rnn(nn.Module):
    def __init__(self, input_size=2, hidden_size=20, num_layers=3, output_size=1,
                 batch_size=10):
        super(my_rnn, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.batch_size = batch_size
        self.rnn = nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size,
                          num_layers=self.num_layers, batch_first=True)
        # The last layer is applied on the last output of the RNN only (not like
        # TimeDistributedDense in Keras)
        self.linear_layer = nn.Linear(self.hidden_size, self.output_size)
        self.hidden = Variable(torch.zeros((self.num_layers, self.batch_size, self.hidden_size)))

    def forward(self, input_sequence):
        out_rnn, self.hidden = self.rnn(input_sequence, self.hidden)
        in_linear = out_rnn[:, -1, :]
        final_output = self.linear_layer(in_linear)
        return final_output

    def init_hidden(self):
        self.hidden = Variable(torch.zeros((self.num_layers, self.batch_size, self.hidden_size)))
rnn = my_rnn(input_size=sequence_size, hidden_size=hidden_size, num_layers=3, batch_size=batch_size)
#input_data = Variable(torch.randn((batch_size, sequence_len, sequence_size)))
h0 = Variable(torch.zeros((3, batch_size, hidden_size)))
demo_target = Variable(torch.randn((batch_size, 1)))
loss_fn = nn.MSELoss()
optimizer = optim.SGD(rnn.parameters(), lr=0.001, momentum=0.9)
for i in range(10):
    input_data = Variable(torch.randn((batch_size, sequence_len, sequence_size)))
    output = rnn(input_data)
    # print(output)
    optimizer.zero_grad()
    loss = loss_fn(output, demo_target)
    loss.backward()
    # loss.backward(retain_variables=True)
    optimizer.step()
    print(loss.data[0])
However, this gives me the following error:
RuntimeError: Trying to backward through the graph second time, but the buffers have already been freed. Please specify retain_variables=True when calling backward for the first time.
When I pass retain_variables=True to the backward call, it seems to work (I am not yet sure whether it works correctly or not).
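Concretely, the variant that runs without the error only swaps in the commented-out backward call from the loop above:

for i in range(10):
    input_data = Variable(torch.randn((batch_size, sequence_len, sequence_size)))
    output = rnn(input_data)
    optimizer.zero_grad()
    loss = loss_fn(output, demo_target)
    # keep the graph buffers after backward instead of freeing them
    loss.backward(retain_variables=True)
    optimizer.step()
    print(loss.data[0])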
My questions are:
- What does retain_variables=True mean?
- Is my code correct in general, or am I missing something (that gave rise to this error)?
- Assuming my code is correct, why is the retain_variables=True option not set in https://github.com/pytorch/examples/blob/master/word_language_model/main.py?
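For completeness, the class above already has an init_hidden() method that I never call. One alternative I considered (I am not sure whether it is the intended pattern) is to reset the stored hidden state at the start of every iteration, so that each backward only sees the graph built in that iteration:

for i in range(10):
    # re-create self.hidden as a fresh zero Variable with no history,
    # analogous to what (I think) repackage_hidden does in the linked example
    rnn.init_hidden()
    input_data = Variable(torch.randn((batch_size, sequence_len, sequence_size)))
    output = rnn(input_data)
    optimizer.zero_grad()
    loss = loss_fn(output, demo_target)
    loss.backward()  # no retain_variables needed in this variant
    optimizer.step()
    print(loss.data[0])

With this version the error goes away without retain_variables, but then the hidden state obviously does not carry over between batches, and I am not sure that is what I want.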
Thank you