The buffers have already been freed error

Hi, I am trying to implement a sequence-to-sequence model, and while doing so I am running into the following error:

RuntimeError                              Traceback (most recent call last)
<ipython-input-12-cd1866fff827> in <module>()
----> 1 train(data1[0:10],data2[0:10],128,1,128,128,10000)

<ipython-input-6-2bf5208cf775> in train(data1, data2, embedding_size, n_layers, input_size, hidden_size, num_epochs)
     33             enc.zero_grad()
     34             dec.zero_grad()
---> 35             l.backward()
     36             optimizer.step()
     37 

/usr/local/lib/python2.7/dist-packages/torch/autograd/variable.pyc in backward(self, gradient, retain_variables)
    144                     'or with gradient w.r.t. the variable')
    145             gradient = self.data.new().resize_as_(self.data).fill_(1)
--> 146         self._execution_engine.run_backward((self,), (gradient,), retain_variables)
    147 
    148     def register_hook(self, hook):

/usr/local/lib/python2.7/dist-packages/torch/nn/_functions/thnn/auto.pyc in backward(self, grad_output)
     43 
     44     def backward(self, grad_output):
---> 45         input, target = self.saved_tensors
     46         grad_input = grad_output.new().resize_as_(input).zero_()
     47         getattr(self._backend, update_grad_input.name)(self._backend.library_state, input, target,

RuntimeError: Trying to backward through the graph second time, but the buffers have already been freed. Please specify retain_variables=True when calling backward for the first time.

Code:

import torch
import torch.optim as optim
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    def __init__(self,vocab_size,embedding_size,n_layers,hidden_size):
        super(Encoder,self).__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_size)
        self.lstm = nn.LSTM(embedding_size,hidden_size,n_layers)
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
    def init_hidden_cell(self):
        hidden = (Variable(torch.randn(1,1,self.hidden_size)),Variable(torch.randn(1,1,self.hidden_size)))
        return hidden
    
    def forward(self,x):
        vect = []
        for i in xrange(len(x)):
            vect.append(self.embedding(x[i].max(1)[1]))
        
        hidden = self.init_hidden_cell()
        output,hidden = self.lstm(torch.cat(vect),hidden)
        return hidden

class Decoder(nn.Module):
    def __init__(self,vocab_size,hidden_size,input_size,n_layers):
        super(Decoder,self).__init__()
        self.lstm = nn.LSTM(input_size,hidden_size,n_layers)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.fc1 = nn.Linear(hidden_size,vocab_size)
    def forward(self,hidden):
        output,hidden = self.lstm(Variable(torch.zeros(1,1,self.input_size)),hidden)
        return F.softmax(self.fc1(hidden[0].view(-1,self.hidden_size))),hidden
        

def make_corpus(data):
    corpa = {"#":0}
    for i in data:
        for j in i.split(" "):
            if j not in corpa.keys():
                corpa[j] = len(corpa)
    return corpa

def make_vect(word,corpa):
    temp = torch.FloatTensor(1,len(corpa)).zero_()
    temp[0][corpa[word]] = 1.0
    return temp


def train(data1,data2,embedding_size,n_layers,input_size,hidden_size,num_epochs):
    corpa_lang1 = make_corpus(data1)
    corpa_lang2 = make_corpus(data2)
    #print corpa_lang1
    enc = Encoder(len(corpa_lang1),embedding_size,n_layers,hidden_size)
    dec = Decoder(len(corpa_lang2),hidden_size,input_size,n_layers)
    l = 0
    loss = nn.CrossEntropyLoss()
    params = list(enc.parameters()) + list(dec.parameters())
    optimizer = optim.SGD(params,lr= 0.01)
    for i in xrange(num_epochs):
        for j in xrange(len(data1)):
            print data1[j].split(" ")
            ip_vec = [Variable(make_vect(k,corpa_lang1),requires_grad= True) for k in data1[j].split(" ")]
            ip_vec = ip_vec + [Variable(make_vect("#",corpa_lang1),requires_grad = True)]
            op1,op2 = dec(enc(ip_vec))
            for m in xrange(len(data2[j].split(" "))+1):
                if m == len(data2[j].split(" ")):
                    op_vec = Variable(torch.FloatTensor([corpa_lang2["#"]]))
                    op_vec.data = torch.Tensor.long(op_vec.data)
                    op1,op2 = dec(op2)
                    l = l + loss(op1,op_vec)
                else:
                    op_vec = Variable(torch.FloatTensor([corpa_lang2[data2[j].split(" ")[m]]]))
                    op_vec.data = torch.Tensor.long(op_vec.data)
                    if m == 0:
                        l=l+loss(op1,op_vec)
                    else:
                        op1,op2 = dec(op2)
                        l = l + loss(op1,op_vec)
                
            
            enc.zero_grad()
            dec.zero_grad()
            l.backward()
            optimizer.step()
    
    return enc,dec


lines = open('data/eng-fra.txt').read().strip()

data1 = []
data2 = []
for i in lines.split("\n"):
    #print i.split("\t")
    if len(i.split("\t")) == 2:
        data1.append(i.split("\t")[0])
        data2.append(i.split("\t")[1])

for i in xrange(len(data2)):
    data2[i] = unicode(data2[i],encoding = 'utf-8')

train(data1[0:10],data2[0:10],128,1,128,128,10000)

Can someone please help me debug this problem? Thank you.

I think the issue is that when you accumulate the loss, you should unpack it, e.g. loss.data[0]; otherwise the graph will never be freed.
See this discussion: CUDA memory continuously increases when net(images) called in every iteration
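
Something like this, as a minimal sketch (not your exact code; compute_sample_loss is a hypothetical stand-in for the part of your loop that builds the per-sample loss):

running_loss = 0.0
for j in xrange(len(data1)):
    enc.zero_grad()
    dec.zero_grad()
    l = compute_sample_loss(j)   # Variable holding this sample's loss only
    l.backward()                 # backward while this sample's graph still exists
    optimizer.step()
    running_loss += l.data[0]    # unpack to a plain float, so no old graph is kept alive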


Hi, thanks for your advice. It did remove that error, but now this one comes up instead:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-12-cd1866fff827> in <module>()
----> 1 train(data1[0:10],data2[0:10],128,1,128,128,10000)

<ipython-input-6-66e52529e52f> in train(data1, data2, embedding_size, n_layers, input_size, hidden_size, num_epochs)
     33             dec.zero_grad()
     34             print l
---> 35             l.backward()#retain_variables = True)
     36             optimizer.step()
     37 

/usr/local/lib/python2.7/dist-packages/torch/autograd/variable.pyc in backward(self, gradient, retain_variables)
    144                     'or with gradient w.r.t. the variable')
    145             gradient = self.data.new().resize_as_(self.data).fill_(1)
--> 146         self._execution_engine.run_backward((self,), (gradient,), retain_variables)
    147 
    148     def register_hook(self, hook):

RuntimeError: there are no graph nodes that require computing gradients

So I have added the requires_grad attribute to every Variable I define. Also, do we need to detach the old graph after every backward pass, or is it preserved? If we do need to detach it, how should that be done?
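
Put differently: is the expected pattern that each iteration builds a fresh loss/graph, so the old one is simply dropped once nothing references it any more, rather than having to be detached explicitly? Something roughly like this (simplified sketch; build_loss_for_sample is a hypothetical stand-in for the encoder/decoder word loop above):

for i in xrange(num_epochs):
    for j in xrange(len(data1)):
        enc.zero_grad()
        dec.zero_grad()
        # rebinding l each iteration means the previous graph becomes
        # unreachable and is freed, with no explicit detach call
        l = build_loss_for_sample(j)   # hypothetical helper wrapping the loop over target words
        l.backward()                   # walks only this iteration's graph
        optimizer.step()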