Hi, I’m trying to call backward() on a Variable as part of training an RNN, and I’m getting this error message:
RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.
Here is the code I’m running:
#
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm
def train_rnn_model(network, criterion, optimizer, trainLoader, valLoader, n_epochs = 10, use_gpu = False):
    if use_gpu:
        network = network.cuda()
        criterion = criterion.cuda()
    # Training loop.
    for epoch in range(0, n_epochs):
        correct = 0.0
        cum_loss = 0.0
        counter = 0
        # Make a pass over the training data.
        t = tqdm(trainLoader, desc = 'Training epoch %d' % epoch)
        # Initialize the hidden state.
        index, (imgIds, paddedSeqs, seqLengths) = next(enumerate(t))
        hiddenState = Variable(torch.Tensor(1, paddedSeqs.size(1), 512).zero_(), requires_grad=True)
        network.train()  # This is important to call before training!
        for (i, (imgIds, paddedSeqs, seqLengths)) in enumerate(t):
            batch_size = paddedSeqs.size(1)  # batch size
            batchSequenceLength = paddedSeqs.size(0)  # length of the longest sequence
            # Wrap inputs and labels into torch.autograd.Variable types.
            tmpseqs = paddedSeqs.clone()
            inputs = Variable(tmpseqs)  # just the initial input from the training data
            # Build the targets/labels: remove [START] and append [END].
            # Go through numpy first.
            tmp = paddedSeqs.clone().numpy()
            tmp = np.delete(tmp, (0), axis=0)
            # 5001 is [START], 5002 is [END].
            newrow = np.array([5002] * batch_size)
            tmp = np.vstack([tmp, newrow])
            # Convert back to a tensor.
            paddedSeqs = torch.from_numpy(tmp)
            # Create the label Variable for later.
            labels = Variable(paddedSeqs)
            if use_gpu:
                inputs = inputs.cuda()
                hiddenState = hiddenState.cuda()
            # Forward pass:
            predictions, hiddenState = network(inputs, hiddenState)
            # Collapse the batch and sequence-length dimensions
            # before passing to the loss (needed to use nn.Linear).
            predictions = predictions.view(batch_size * batchSequenceLength, -1)
            labels = labels.permute(1, 0)
            labels = labels.contiguous().view(-1)
            if use_gpu:
                predictions = predictions.cuda()
                labels = labels.cuda()
            # print('preds', predictions)
            # print('labels', labels)
            loss = criterion(predictions, labels)
            # Backward pass:
            optimizer.zero_grad()
            # Loss is a Variable, and calling backward on a Variable
            # computes all the gradients that lead to its current value.
            loss.backward()
            # Weight and bias updates.
            optimizer.step()
            # Logging information.
            cum_loss += loss.data[0]
            max_scores, max_labels = predictions.data.max(1)
            correct += (max_labels == labels.data).sum()
            counter += (inputs.size(0) * inputs.size(1))
            t.set_postfix(loss = cum_loss / (1 + i), accuracy = 100 * correct / counter)

vocabularySize = len(trainData.vocabulary['word2id'])
model = TextGeneratorModel(vocabularySize)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0005)
# Train the previously defined model.
train_rnn_model(model, criterion, optimizer, trainLoader, valLoader, n_epochs = 10)
######
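To narrow things down, I think the minimal pattern I'm hitting is something like this (a simplified sketch I put together, with nn.RNN standing in for my TextGeneratorModel):

import torch
import torch.nn as nn
from torch.autograd import Variable

rnn = nn.RNN(input_size=8, hidden_size=8, num_layers=1)
hidden = Variable(torch.zeros(1, 4, 8), requires_grad=True)

for step in range(2):
    inputs = Variable(torch.randn(5, 4, 8))  # (seq_len, batch, features)
    output, hidden = rnn(inputs, hidden)     # hidden now carries the graph from this step
    loss = output.sum()
    loss.backward()  # the second iteration raises the same RuntimeError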
I have no idea why I'm getting that error message, but I'm guessing it has something to do with the hiddenState Variable. When I don't reassign hiddenState in the forward-pass call, i.e.
predictions, _ = network(inputs, hiddenState)
it trains without an error. Obviously, it doesn't update the hiddenState then, though…
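The only workaround I can think of is to "repackage" the hidden state between batches so that backward() stops there, something like this (just a sketch of what I mean; repackage_hidden is a name I made up, not a library function):

from torch.autograd import Variable

def repackage_hidden(h):
    # Wrap the raw data in a fresh Variable with no history attached,
    # so the next backward() can't reach back into the freed graph.
    return Variable(h.data)

# at the top of the batch loop:
# hiddenState = repackage_hidden(hiddenState)

But I don't know whether that's the right approach, or whether I should be using retain_graph=True instead.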
I’m completely stumped. What am I missing?
Thanks!