RuntimeError during loss.backward()

I am getting this error after I was told to set retain_graph=True in loss.backward(). Here is the error:

one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [100, 400]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Here is my model architecture

def create_emb_layer(weights_matrix, non_trainable=False):
    weights_matrix = torch.tensor(weights_matrix)
    num_embeddings, embedding_dim = 16404,50
    emb_layer = nn.Embedding(16404, 50)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class Sentiglove(nn.Module):
  def __init__(self,weights_matrix):
    super().__init__()
    self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
    self.conv1 = nn.Conv1d(in_channels = 48,out_channels =  32, kernel_size = 3, stride = 1)
    self.lstm = nn.LSTM(input_size = 48 , hidden_size = 100,bidirectional=True,  batch_first = True)
    self.dropout = nn.Dropout(0.3)
    #self.fc = nn.Linear(200, 5)
    self.fc = nn.Linear(6400, 5)
    if (torch.cuda.is_available()):
      self.hidden = (torch.zeros(2, 50, 100).cuda(),torch.zeros(2, 50, 100).cuda())
    else:
      self.hidden = (torch.zeros(2, 50, 100),torch.zeros(2, 50, 100))
    
  def forward(self,x):
    x = self.embedding(x)
    x = F.relu(self.conv1(x))
    lstm_out , self.hidden = self.lstm(x,self.hidden)
    lstm_out = self.dropout(lstm_out)
    lstm_out = lstm_out.contiguous().view(50, -1)
    out = F.relu(self.fc(lstm_out))
    return F.log_softmax(out,dim=1)

Here is the training code

import time
start = time.time()
epochs = 1
val_correct = []
val_loss = []
train_correct = []
train_loss = []
for i in range(epochs):
  trn_corr = 0
  trn_loss = 0
  for b, (x_train,y_train) in enumerate(train_loader):
    with torch.autograd.set_detect_anomaly(True):
      b+=1
    #y_pred = model(x_train.cuda())
      y_pred = model(x_train)
    #y_train = y_train.cuda()
      loss = criterion(y_pred,y_train)
      predicted = torch.max(y_pred.data,1)[1] #gives the indices of the highest number in each row
      trn_corr += (predicted == y_train).sum()

      optimizer.zero_grad()
      loss.backward(retain_graph=True)
      optimizer.step()

      if(b%100==0):
        print(f"Epoch:  {i}   batch:   {b}   loss   {loss.item()}   accuracy   {trn_corr.item()/(b*50)}")
    
  train_loss.append(loss)
  train_correct.append(trn_corr)


print(time.time() - start)

I hope to get a quick reply, as this is needed for my research project.

You might want to detach predicted using predicted = predicted.detach(). Since you are adding it to trn_corr, the variable’s (trn_corr) buffers are flushed when you do optimizer.step().
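For example, the relevant part of the training loop would then look something like this (just a sketch of that one change, everything else stays as it is):

    predicted = torch.max(y_pred.data, 1)[1]   # indices of the highest score in each row
    predicted = predicted.detach()             # detach as suggested, so trn_corr holds no reference to the graph
    trn_corr += (predicted == y_train).sum()

    optimizer.zero_grad()
    loss.backward(retain_graph=True)
    optimizer.step()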

I still got an error:

---------------------------------------------------------------------------

RuntimeError                              Traceback (most recent call last)

<ipython-input-46-a044eacf08de> in <module>()
     21 
     22       optimizer.zero_grad()
---> 23       loss.backward(retain_graph=True)
     24       optimizer.step()
     25 

1 frames

/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    125     Variable._execution_engine.run_backward(
    126         tensors, grad_tensors, retain_graph, create_graph,
--> 127         allow_unreachable=True)  # allow_unreachable flag
    128 
    129 

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [100, 400]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

I have changed the code to include the correction too:

import time
start = time.time()
epochs = 1
val_correct = []
val_loss = []
train_correct = []
train_loss = []
for i in range(epochs):
  trn_corr = 0
  trn_loss = 0
  for b, (x_train,y_train) in enumerate(train_loader):
    with torch.autograd.set_detect_anomaly(True):
      b+=1
    #y_pred = model(x_train.cuda())
      y_pred = model(x_train)
    #y_train = y_train.cuda()
      loss = criterion(y_pred,y_train)
      predicted = torch.max(y_pred.data,1)[1] #gives the indices of the highest number in each row
      predicted = predicted.detach()
      trn_corr += (predicted == y_train).sum()

      optimizer.zero_grad()
      loss.backward(retain_graph=True)
      optimizer.step()

      if(b%100==0):
        print(f"Epoch:  {i}   batch:   {b}   loss   {loss.item()}   accuracy   {trn_corr.item()/(b*50)}")
    
  train_loss.append(loss)
  train_correct.append(trn_corr)


print(time.time() - start)

Thanks for the response. I think the error is somewhere in the forward pass, but I am not able to figure out what it is.

I made a few changes to my model architecture, so here is the updated version:

def create_emb_layer(weights_matrix, non_trainable=False):
    weights_matrix = torch.tensor(weights_matrix)
    num_embeddings, embedding_dim = 16404,50
    emb_layer = nn.Embedding(16404, 50)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class Sentiglove(nn.Module):
  def __init__(self,weights_matrix):
    super().__init__()
    self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
    self.conv1 = nn.Conv1d(in_channels = 48,out_channels =  32, kernel_size = 3, stride = 1)
    self.lstm = nn.LSTM(input_size = 48 , hidden_size = 100,bidirectional=True,  batch_first = True)
    self.dropout = nn.Dropout(0.3)
    self.fc = nn.Linear(100, 5)
    #self.fc = nn.Linear(6400, 5)
    if (torch.cuda.is_available()):
      self.hidden = (torch.zeros(2, 50, 100).cuda(),torch.zeros(2, 50, 100).cuda())
    else:
      self.hidden = (torch.zeros(2, 50, 100),torch.zeros(2, 50, 100))
    
  def forward(self,x):
    x = self.embedding(x)
    x = F.relu(self.conv1(x))
    lstm_out , self.hidden = self.lstm(x,self.hidden)
    out = self.dropout(self.hidden[0][-1])
    out = F.relu(self.fc(out))
    return F.log_softmax(out,dim=1)

Oh…can you tell me the shape of your input tensor (x_train)?

x_train's shape is batch_size * seq_length * word_vector: the batch size is 50, the sequence length is 48, and each word is given an embedding vector of size 50. It's a sentiment analysis problem.
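For reference, tracing the shapes through my model (this assumes x_train actually holds token indices of shape (50, 48), since it goes through the embedding layer first), I think they come out roughly as:

    x = self.embedding(x)                               # (50, 48) indices -> (50, 48, 50)
    x = F.relu(self.conv1(x))                           # Conv1d over dim 1: (50, 48, 50) -> (50, 32, 48)
    lstm_out, self.hidden = self.lstm(x, self.hidden)   # lstm_out: (50, 32, 200), self.hidden[0]: (2, 50, 100)
    out = self.dropout(self.hidden[0][-1])              # (50, 100), which matches nn.Linear(100, 5)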

Ok…I think it is self.hidden that is causing the problem. I am not sure why you are passing self.hidden to self.lstm. Do you want the LSTM layer to remember the state it was in from the previous batch? If so, you can detach the state and store it on the model like this:

    x = self.embedding(x)
    x = F.relu(self.conv1(x))
    lstm_out , hidden = self.lstm(x, self.hidden)
    out = self.dropout(hidden[0][-1])
    self.hidden = (hidden[0].detach(), hidden[1].detach())  # detach so the stored state does not keep the old graph alive
    out = F.relu(self.fc(out))
    return F.log_softmax(out, dim=1)

If you don’t need the LSTM to remember the state from the previous batch (pretty common in sentiment analysis, where one piece of text occurs in only one batch and does not continue into the next), I suggest you don’t pass anything to the LSTM, as in the sketch below.
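A minimal sketch of that variant, assuming the rest of your forward pass stays the same (when no state is passed, nn.LSTM starts from zero hidden and cell states for every batch):

    x = self.embedding(x)
    x = F.relu(self.conv1(x))
    lstm_out, hidden = self.lstm(x)   # no state passed in, so it starts from zeros each batch
    out = self.dropout(hidden[0][-1])
    out = F.relu(self.fc(out))
    return F.log_softmax(out, dim=1)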

Here is my updated code with your fix:
def create_emb_layer(weights_matrix, non_trainable=False):
    weights_matrix = torch.tensor(weights_matrix)
    num_embeddings, embedding_dim = 16404,50
    emb_layer = nn.Embedding(16404, 50)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class Sentiglove(nn.Module):
  def __init__(self,weights_matrix):
    super().__init__()
    self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
    self.conv1 = nn.Conv1d(in_channels = 48,out_channels =  32, kernel_size = 3, stride = 1)
    self.lstm = nn.LSTM(input_size = 48 , hidden_size = 100,bidirectional=True,  batch_first = True)
    self.dropout = nn.Dropout(0.3)
    self.fc = nn.Linear(6400, 5)
    if (torch.cuda.is_available()):
      self.hidden = (torch.zeros(2, 50, 100).cuda(),torch.zeros(2, 50, 100).cuda())
    else:
      self.hidden = (torch.zeros(2, 50, 100),torch.zeros(2, 50, 100))
    
  def forward(self,x):
    x = self.embedding(x)
    x = F.relu(self.conv1(x))
    lstm_out , hidden = self.lstm(x,self.hidden)
    out = self.dropout(lstm_out)
    out = out.contiguous().view(50,-1)
    self.hidden = (hidden[0].detach(), hidden[1].detach())  # store the state detached from the current batch's graph
    out = F.relu(self.fc(out))
    return F.log_softmax(out,dim=1)

Thanks, your solution worked. And you were right: instead of passing lstm_out, I was passing the hidden state, which was wrong on my part. Thanks again!