Trying to compute the loss of an encoder/decoder model

I am attempting to create an encoder/decoder model with mini-batch. I continue to encounter an errors stating:

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [32, 6]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

The traceback reveals something is wrong with the y=self.linear(out) but I am unsure what exactly. Any help would be greatly appreciated. Below is the model. Thank you.

import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
from sliding_window import sliding_window
from training_datasets import get_training_datasets_batch
torch.autograd.set_detect_anomaly(True)


class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size,num_layers=num_layers,batch_first=True)
    def forward(self, x):
        flat = x.view(x.shape[0], x.shape[1], self.input_size)
        out,h = self.gru(flat)
        return out, h


class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=6, num_layers=1):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.gru = nn.GRU(input_size=input_size,hidden_size=hidden_size,num_layers=num_layers,batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        self.ReLU = nn.ReLU()
    def forward(self, x, h):
        x = x.unsqueeze(1)
        out, h = self.gru(x, h)
        out = out.squeeze(1)
        print(out.shape)
        y = self.linear(out)
        print(y.shape)
        y = self.ReLU(y)
        return y,h

class EncoderDecoder(nn.Module):
    def __init__(self, hidden_size, input_size=6, output_size=6):
        super(EncoderDecoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.encoder = Encoder(input_size=input_size, hidden_size=hidden_size)
        self.decoder = Decoder(input_size=input_size, hidden_size=hidden_size, output_size=output_size)

    def train_model(self, ts, epochs, target_len, features, batch_size=64, test_len=288, method = 'teacher_forcing', tfr = 0.5, lr = 0.01, dynamic_tf=False):
        X,Y= sliding_window(ts, features=288, target_len=target_len)

        x_train, x_val, x_test, y_train, y_val, y_test = get_training_datasets_batch(X,Y, features, test_len=test_len, batch_size=batch_size)
        losses = np.full(epochs,np.nan)
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, self.parameters()),
                                  lr=lr)
        criterion = nn.MSELoss()
        for e in range(epochs):
            print('Starting epoch {}'.format(e))
            x_train_data = iter(x_train)
            y_train_data = iter(y_train)
            x_val_data = iter(x_val)
            y_val_data = iter(y_val)
            x_train_shape = list(x_train)[0].shape
            # predicted = torch.zeros(target_len,batch_size,x_train_shape[2])
            # print(predicted.shape)
            loss=0
            for x_train_in in x_train_data:
                optimizer.zero_grad()
                x_train_in = Variable(x_train_in)
                y_train_in = Variable(next(y_train_data).transpose(0,1))
                _, enc_h = self.encoder(x_train_in)
                dec_in = x_train_in[:,-1,:]
                dec_h = enc_h
                
                if method == 'recursive':
                    for t in range(target_len):
                        dec_out, dec_h = self.decoder(dec_in, dec_h)
                        predicted = dec_out
                        dec_in = dec_out
                        loss += criterion(predicted,y_train_in[t])

                loss.backward(retain_graph=True)
                optimizer.step()

loss.backward(retain_graph=True) is often wrong as described e.g. here. Could you add more information on why you are using it as I guess its usage is causing the issue.

I’ve used it without retain_graph=True and it results in another error.

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed).Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward

In your training loop you are adding the current loss (with its computation graph) to loss via:

loss += criterion(predicted,y_train_in[t])

Assuming this addition is only needed for range(target_len) and not for x_train in x_train_data reset the loss value after the backward call.

Thank you. In the previous code, I summed the loss. If instead, I planned to average across the batches. How would I implement this? Below is an example however results in the same error occurs when I attempt to run though a second time.

for e in range(epochs):
   loss_batch = []
   predicted = torch.zeros(x_train_len, x_train_shape[0], x_train_shape[1])
   for r,x_train_in in enumerate(x_train_data):
      y_train_in = next(y_train_data)
      optimizer.zero_grad()
      _, enc_h = self.encoder(x_train_in.transpose(0,1))
      dec_in = x_train_in[-1, :]
      dec_h = enc_h
      if method == 'recursive':
         for t in range(target_len):
            dec_out, dec_h = self.decoder(dec_in, dec_h)
            predicted[r][t] = dec_out.squeeze(1)
            dec_in = dec_out.squeeze(1)

      loss = criterion(predicted[r], y_train_in)
      loss_batch.append(loss.item())
      loss.backward()
      optimizer.step()
   
   batch_average = np.mean(loss_batch)

You’ve marked by previous post as the solution so I’m unsure if you are still facing the issue or not.
If the new code snippet still fails, could you post a minimal, executable code snippet tp reproduce it, please?