Problem: RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Hi!
I am building a ladder-like autoencoder network for video prediction.
I created two classes: one for the encoder and one for the decoder.
I need these networks to stay separate at inference time so I can access some intermediate values, but I need to train both networks with the same loss:

  • I pass the parameters of both nets to the same optimizer, like:
param_S1 = list(E1.parameters()) + list(D1.parameters())
net_optimizer_S1 = torch.optim.SGD(param_S1, lr=0.01, momentum=0.9)
  • then I train the network. Inference works fine, and so does the loss calculation, but when the code gets to the .backward() call, it raises:
  File "RobNET_V2.py", line 181, in <module>
    loss_S1.backward()
  File "/usr/local/lib/python3.5/dist-packages/torch/tensor.py", line 93, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/usr/local/lib/python3.5/dist-packages/torch/autograd/__init__.py", line 90, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Can you help me, please? ^^’
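
For reference, a minimal standalone snippet reproduces the same error: when both inputs to the loss are plain tensors with no grad history, the resulting loss has no grad_fn to backpropagate through.

import torch
import torch.nn as nn

loss_fn = nn.MSELoss()
a = torch.zeros(10)   # requires_grad=False, no grad_fn
b = torch.randn(10)   # also detached from any graph
loss = loss_fn(a, b)
loss.backward()       # -> RuntimeError: element 0 of tensors does not require grad ...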

My code:

import cv2
import numpy as np

import torch
import torch.nn.init as init
import torch.nn as nn
from torch import optim
from torchvision import transforms

#-------------------------------------------------------------------------------
#                           Predictive Network
#-------------------------------------------------------------------------------
# Local network size
INPUT = 32*32
HIDDEN = 8*8
OUTPUT = INPUT
# Readout size
READ = HIDDEN
OUT_READ = 2

class INPUT_ENCODER(nn.Module):
    def __init__(self, Encoder_size, Hidden_size):
        super(INPUT_ENCODER, self).__init__()
        # first encoder input Xt
        self.E1 = nn.Linear(Encoder_size, Hidden_size)
        # secondary feedback input
        self.ES1 = nn.Linear(Encoder_size, Hidden_size)
        # recurrent input H1-1
        self.ER1 = nn.Linear(Hidden_size, Hidden_size)
        # activation Sigmoid
        self.ACT = nn.Sigmoid()

    def forward(self, Xt, FB1, Last_H1):
        # Encoder pass
        out_E1 = self.E1(Xt)
        out_ES1 = self.ES1(FB1)
        out_ER1 = self.ER1(Last_H1)  # use the argument Last_H1, not the global last_H1
        sum_H1 = out_E1 + out_ES1 + out_ER1
        H1 = self.ACT(sum_H1)
        # output stack
        return H1

class LAYER_ENCODER(nn.Module):
    def __init__(self, Encoder_size, Hidden_size):
        super(LAYER_ENCODER, self).__init__()
        # first encoder input Hl-1
        self.EN = nn.Linear(Hidden_size, Hidden_size)
        # secondary feedback input
        self.ESN = nn.Linear(Encoder_size, Hidden_size)
        # recurrent input Hn-1
        self.ERN = nn.Linear(Hidden_size, Hidden_size)
        # activation Sigmoid
        self.ACT = nn.Sigmoid()

    def forward(self, Ht, FBN, Last_Hn):
        # Encoder pass
        out_EN = self.EN(Ht)
        out_ESN = self.ESN(FBN)
        out_ERN = self.ERN(Last_Hn)
        sum_Hn = out_EN + out_ESN + out_ERN
        Hn = self.ACT(sum_Hn)
        # output stack
        return Hn

class LAYER_DECODER(nn.Module):
    def __init__(self, Decoder_size, Hidden_size):
        super(LAYER_DECODER, self).__init__()
        # first Decoder input Hn
        self.DN = nn.Linear(Hidden_size, Decoder_size)
        # secondary superior input
        self.DSN = nn.Linear(Decoder_size, Decoder_size)
        # activation Sigmoid
        self.ACT = nn.Sigmoid()

    def forward(self, Ht, superior_in):
        # Decoder pass
        out_DN = self.DN(Ht)
        out_DSN = self.DSN(superior_in)
        sum_Yn = out_DN + out_DSN
        Yn = self.ACT(sum_Yn)
        # output stack
        return Yn

#-------------------------------------------------------------------------------
#                      Motor/action network
#-------------------------------------------------------------------------------
class Readout(nn.Module):
    def __init__(self, Hidden_size, external_out):
        # supervised readout layer for a task-specific function
        super(Readout, self).__init__()
        # single linear readout, applied after a sigmoid on the hidden input
        self.readout = nn.Linear(Hidden_size, external_out)
        self.act_readout = nn.Sigmoid()

    def forward(self, input_read):
        return self.readout(self.act_readout(input_read))

#-------------------------------------------------------------------------------
#                        Initialisation part :
#-------------------------------------------------------------------------------

# init stacked autoencoder => 3 stacks (3 encoders / 3 decoders)
E1 = INPUT_ENCODER(INPUT, HIDDEN)
E2 = LAYER_ENCODER(INPUT, HIDDEN)
E3 = LAYER_ENCODER(INPUT, HIDDEN)
D1 = LAYER_DECODER(OUTPUT, HIDDEN)
D2 = LAYER_DECODER(OUTPUT, HIDDEN)
D3 = LAYER_DECODER(OUTPUT, HIDDEN)
print(E1, E2, E3, D1, D2, D3)
# init readout for the high-level command part
# here we want to control a simple robot by giving a UR/Tau command to
# the low-level trajectory/speed/torque controller
READOUT = Readout(HIDDEN, OUT_READ)
print(READOUT)

# init the loss function
loss_function = nn.MSELoss(reduction='mean')  # 'elementwise_mean' was deprecated in favor of 'mean'
# init stacked AE parameters
param_S1 = list(E1.parameters()) + list(D1.parameters())
param_S2 = list(E2.parameters()) + list(D2.parameters())
param_S3 = list(E3.parameters()) + list(D3.parameters())
# init the optimization function
net_optimizer_S1 = torch.optim.SGD(param_S1, lr=0.01, momentum=0.9)
net_optimizer_S2 = torch.optim.SGD(param_S2, lr=0.01, momentum=0.9)
net_optimizer_S3 = torch.optim.SGD(param_S3, lr=0.01, momentum=0.9)
#net_optimizer = optim.Adam(RobNET_ALPHA.parameters(), lr=1e-3)
readout_optimizer = torch.optim.SGD(READOUT.parameters(), lr=0.01, momentum=0.9)

# init the recurrent part of the network
last_H1 = torch.zeros(HIDDEN)
last_H2 = torch.zeros(HIDDEN)
last_H3 = torch.zeros(HIDDEN)
# init the feedback
last_Y1 = torch.zeros(OUTPUT)
last_Y2 = torch.zeros(OUTPUT)
last_Y3 = torch.zeros(OUTPUT)
# no superior input for decoder 3
SUP_L3 = torch.zeros(OUTPUT)

#-------------------------------------------------------------------------------
#                     Learning/inference algorithm :
#-------------------------------------------------------------------------------

cam = cv2.VideoCapture(0)

while True:

    # zero the parameter gradients
    net_optimizer_S1.zero_grad()
    net_optimizer_S2.zero_grad()
    net_optimizer_S3.zero_grad()

    # capture state Xt
    ret, frame = cam.read()
    if not ret:
        continue  # skip the iteration if no frame could be read
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    frame = cv2.resize(frame, (32,32))
    Xt = torch.from_numpy(frame)
    Xt = Xt.type(torch.FloatTensor)
    Xt = Xt.view(INPUT)

    # forward process
    # encoder pass
    H1 = E1.forward(Xt, last_Y1, last_H1)
    H2 = E2.forward(H1, last_Y2, last_H2)
    H3 = E3.forward(H2, last_Y3, last_H3)
    # decoder pass
    Y3 = D3.forward(H3, SUP_L3)
    Y2 = D2.forward(H2, Y3)
    Y1 = D1.forward(H1, Y2)

    # calculate error loss
    # train first AE

    # TODO: element 0 of tensors does not require grad and does not have a grad_fn
    # maybe => loss = Variable(loss, requires_grad = True) ?

    loss_S1 = loss_function(last_Y1, Xt)
    loss_S1.backward()
    net_optimizer_S1.step()
    # train second AE
    loss_S2 = loss_function(last_Y2, Y2)
    loss_S2.backward()
    net_optimizer_S2.step()
    # train third AE
    loss_S3 = loss_function(last_Y3, Y3)
    loss_S3.backward()
    net_optimizer_S3.step()

    # update recurrent element
    last_H1 = H1
    last_H2 = H2
    last_H3 = H3
    # update feedback
    last_Y1 = Y1
    last_Y2 = Y2
    last_Y3 = Y3

Currently you are passing two tensors to your loss function which don’t require gradients and are decoupled from your models:
last_Y1 is still the zero tensor, while Xt is the image you’ve captured using OpenCV.
Usually you would pass some model output as the first input and a target as the second input to your loss function. Could you check what should be calculated in loss_S1?
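
For example, if the first stack is meant to reconstruct the current frame, the first argument would be the decoder output Y1, which carries a grad_fn from the forward pass (a sketch, assuming that is the intended target):

# assuming the first autoencoder should reconstruct the current frame Xt
loss_S1 = loss_function(Y1, Xt)  # Y1 comes from D1, so it has a grad_fn
loss_S1.backward()
net_optimizer_S1.step()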

Also a small side note: you should call the model directly rather than its forward method: H1 = E1(Xt, last_Y1, last_H1). Calling the module goes through __call__, which also runs any registered hooks.
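
Applied to your forward pass, that would look like this (same variable names as in your code):

# encoder pass
H1 = E1(Xt, last_Y1, last_H1)
H2 = E2(H1, last_Y2, last_H2)
H3 = E3(H2, last_Y3, last_H3)
# decoder pass
Y3 = D3(H3, SUP_L3)
Y2 = D2(H2, Y3)
Y1 = D1(H1, Y2)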

Hi!
Thank you for your answer and the tips.
I solved my problem by using a Variable when I calculate the loss.
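
Presumably something along the lines of the workaround hinted at in the TODO comment in the code above, e.g.:

from torch.autograd import Variable

loss_S1 = loss_function(last_Y1, Xt)
loss_S1 = Variable(loss_S1, requires_grad=True)  # gives backward() something to call
loss_S1.backward()  # runs without the RuntimeError, but note that this new Variable
                    # is detached from the encoder/decoder graph, so no gradients
                    # actually reach the model parameters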