Cannot find gradient of a variable in Pytorch

Jimut_Bahan_Pal · October 29, 2022, 1:51pm

I was implementing a function in PyTorch and was unable to compute the gradient of the input vector input_vec. Here decoder is a normal autoencoder’s decoder, device is cuda, the input vector is input_vec = Variable(torch.zeros((1,16,16,16)), requires_grad = True), the output is an image, img = dataiter.next()[0].reshape((1,3,128,128)), and the loss function is torch.nn.CrossEntropyLoss().

import ot.utils

### Training function
def projected_grad(decoder, device, input_vec, output , loss_fn,lr=0.01):
    """Attempt one gradient step on ``input_vec`` followed by a simplex projection.

    The vector is decoded with ``decoder``, the result is scored against
    ``output`` with ``loss_fn``, the loss is back-propagated, and ``lr`` times
    the gradient is subtracted from the vector before it is passed through
    ``ot.utils.proj_simplex``.

    Args:
        decoder: module called on the input vector; switched to train mode here.
        device: accepted but not read in this body (the tensor is moved with
            ``.cuda()`` instead).
        input_vec: tensor to optimise. It is cloned and detached first, so the
            caller's tensor is not modified.
        output: target passed as the second argument of ``loss_fn``.
        loss_fn: callable returning the loss tensor.
        lr: step size applied to the gradient.

    Returns:
        The value returned by ``ot.utils.proj_simplex`` for the updated vector.
    """
    # input_vec = input_vec.cuda()
    # NOTE(review): `.cuda()` is applied *after* `requires_grad_(True)`, so the
    # name `input_vec` ends up bound to the output of a differentiable op, not
    # to a leaf tensor. `.grad` is not populated for non-leaf tensors during
    # backward(), which is why `dx` below is None.
    input_vec = input_vec.clone().detach().requires_grad_(True).cuda()
    # print(input_vec)
    decoder.train()
    # Decode data
    decoded_data = decoder(input_vec)
    # print("Decoded data = ",decoded_data)
    # Evaluate loss
    loss = loss_fn(decoded_data,output)
    # print("Loss = ",loss)
    # Backward pass
    loss.backward()
    # print("Shape of input vector = ",input_vec.shape)
    with torch.no_grad():
        dx = input_vec.grad # NO GRADIENTS HERE (None, because input_vec is not a leaf)
        print("DX = ",dx)
    # NOTE(review): this in-place update sits outside the `no_grad` block above;
    # confirm the intended indentation.
    input_vec -= lr*dx
    input_vec.grad.zero_()
    input_vec = ot.utils.proj_simplex(input_vec)
    # Print batch loss
    print('\t partial train loss (single batch): %f' % (loss.data))
    # NOTE(review): `train_loss` is not defined in this snippet; presumably a
    # module-level list created elsewhere - verify.
    train_loss.append(loss.detach().cpu().numpy())

    return input_vec

We were unable to find the gradient of the variable dx here.

ptrblck · October 29, 2022, 5:03pm

Your code raises a warning which you shouldn’t ignore:

UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations.

The .cuda() operation is differentiable, so call .requires_grad_() on the fully initialized tensor:

input_vec = input_vec.clone().detach().cuda().requires_grad_(True)

and it should work.

Jimut_Bahan_Pal · October 29, 2022, 7:51pm

It is still giving the same error.
I have reproduced the error in a colab notebook. Please check this… link: Google Colab

The problem is that the gradients are just None for some reason. It is not even a tensor of zeros, just None.

# Download the zipped mask images and store the archive under ./data.
! wget https://github.com/Jimut123/simply_junk/blob/main/masks.zip?raw=true -O masks.zip
! mkdir data
! mv masks.zip data
# Extra packages imported by the notebook; POT provides the `ot` module.
! pip3 install SimpleITK
! pip3 install nibabel
! pip3 install POT
# Unpack the archive into /content/data/images.
! unzip '/content/data/masks.zip' -d '/content/data/images'

import torch
import torch.nn as nn
import torch.nn.functional as F

import cv2
import pickle
import torch.nn.functional as F
import torch.utils
import torch.distributions
import torchvision
import numpy as np
#from torchsummary import summary
from torchvision import datasets, transforms

import matplotlib.pyplot as plt # plotting library
import numpy as np # this module is useful to work with numerical arrays
import pandas as pd 
import random 
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader,random_split
from torch import nn
import torch.optim as optim
import SimpleITK as sitk
import nibabel as nib

import os
# Work from inside the mask directory so the bare file names returned by
# os.listdir() can be passed straight to cv2.imread().
os.chdir('/content/data/images/masks')
train_dataset = []
# Number of label values used for the one-hot encoding of every mask.
n_classes=3
mask_files = os.listdir()
print(mask_files)
for file in mask_files:
  # Only PNG files are treated as masks; everything else is skipped.
  if not file.endswith(".png"):
    continue
  img = cv2.imread(file,cv2.IMREAD_UNCHANGED)
  # Nearest-neighbour interpolation keeps the pixel values discrete labels.
  img = cv2.resize(img,(128,128),interpolation=cv2.INTER_NEAREST)

  # (H, W) integer labels -> (H, W, n_classes) one-hot.
  mask_one_hot=F.one_hot(torch.tensor(img).long(), n_classes)
  # Channels-first float tensor, (n_classes, H, W). Tensor.permute is the
  # native way to reorder axes and does not rely on NumPy wrapping a tensor.
  mask_one_hot_transpose=mask_one_hot.permute(2, 0, 1).type(torch.float32)

  train_dataset.append(mask_one_hot_transpose)

# Sanity check: show the first mask as a label image.
plt.imshow(train_dataset[0].argmax(0))
plt.show()

m=len(train_dataset)
print(m)
# 80/20 train/validation split. The training size is derived from the
# validation size so the two always add up to m; truncating both halves
# independently can lose a sample (e.g. m=7 or m=15) and make random_split raise.
n_val = int(m*0.2)
n_train = m - n_val
train_data, val_data = random_split(train_dataset, [n_train, n_val])
batch_size=64

# Both loaders use the default settings (no shuffling).
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
valid_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)


class EncoderFinal(nn.Module):
    """Convolutional encoder built from three conv -> ReLU -> batch-norm -> pool stages.

    The channel count goes 3 -> 8 -> 16 -> 16 and every stage ends with the
    same 2x2 max-pool. The final feature map is passed through a softmax over
    the channel axis.
    """

    def __init__(self, latent_dims):
        # `latent_dims` is part of the constructor signature but is not read.
        super().__init__()
        # Stage 1: 3 -> 8 channels.
        self.conv1 = nn.Conv2d(3, 8, kernel_size=3, padding=1)
        self.batch_norm1 = nn.BatchNorm2d(8)
        # A single parameter-free pooling layer is reused by all stages.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Stage 2: 8 -> 16 channels.
        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.batch_norm2 = nn.BatchNorm2d(16)
        # Stage 3: 16 -> 16 channels.
        self.conv3 = nn.Conv2d(16, 16, kernel_size=3, padding=1)
        self.batch_norm3 = nn.BatchNorm2d(16)

    def forward(self, x):
        """Encode ``x`` and return channel-wise softmax probabilities."""
        stages = (
            (self.conv1, self.batch_norm1),
            (self.conv2, self.batch_norm2),
            (self.conv3, self.batch_norm3),
        )
        for conv, norm in stages:
            # Activation comes before normalisation; pooling closes the stage.
            x = self.pool(norm(F.relu(conv(x))))
        return F.softmax(x, dim=1)


class DecoderFinal(nn.Module):
    """Transposed-convolution decoder mirroring the encoder's channel layout.

    Three stride-2 transposed convolutions take the channels 16 -> 16 -> 8 -> 3;
    the first two are followed by ReLU and batch-norm, the last by a softmax
    over the channel axis.
    """

    def __init__(self, latent_dims):
        # `latent_dims` is part of the constructor signature but is not read.
        super().__init__()
        self.t_conv1 = nn.ConvTranspose2d(16, 16, kernel_size=2, stride=2)
        self.batch_norm1 = nn.BatchNorm2d(16)
        self.t_conv2 = nn.ConvTranspose2d(16, 8, kernel_size=2, stride=2)
        self.batch_norm2 = nn.BatchNorm2d(8)
        self.t_conv3 = nn.ConvTranspose2d(8, 3, kernel_size=2, stride=2)

    def forward(self, x):
        """Decode ``x`` and return per-pixel class probabilities."""
        hidden = self.batch_norm1(F.relu(self.t_conv1(x)))
        hidden = self.batch_norm2(F.relu(self.t_conv2(hidden)))
        scores = self.t_conv3(hidden)
        # NOTE(review): the output is already softmax-normalised. If it is fed
        # to nn.CrossEntropyLoss, which applies log-softmax itself, the
        # normalisation happens twice - confirm this is intended.
        return F.softmax(scores, dim=1)


class AutoencoderFinal(nn.Module):
    """Encoder and decoder chained into a single module."""

    def __init__(self, latent_dims):
        super().__init__()
        # Both halves receive the same `latent_dims` argument.
        self.encoder = EncoderFinal(latent_dims)
        self.decoder = DecoderFinal(latent_dims)

    def forward(self, x):
        """Return the decoder's output for the encoding of ``x``."""
        return self.decoder(self.encoder(x))


from torchsummary import summary

### Define the loss function
loss_fn = torch.nn.CrossEntropyLoss()

### Learning rate for the optimizer defined below
lr= 0.01

### Set the random seed for reproducible results
torch.manual_seed(0)

# Not referenced anywhere in this section; kept in case later cells use it.
d = 4

# Pick the device first so that nothing below hard-codes "cuda" and the
# script still runs on a CPU-only machine.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Selected device: {device}')

### Initialize the two networks and move them to the selected device
#model = Autoencoder(encoded_space_dim=encoded_space_dim)
encoder = EncoderFinal(128)
decoder = DecoderFinal(128)
encoder.to(device)
decoder.to(device)

# torchsummary builds its dummy input on the device named here, so it has to
# match where the model lives.
summary(encoder,input_size=(3,128,128),device=device.type)
# (16,16,16) is the latent shape the encoder produces for a 128x128 input and
# the shape of the vectors optimised later on.
summary(decoder,input_size=(16,16,16),device=device.type)

### Define an optimizer (both for the encoder and the decoder!)
params_to_optimize = [
    {'params': encoder.parameters()},
    {'params': decoder.parameters()}
]

# NOTE: this name shadows the `torch.optim as optim` module alias imported at
# the top of the file; the name is kept so later cells that use it still work.
optim = torch.optim.Adam(params_to_optimize, lr=lr, weight_decay=1e-05)

import ot.utils

### Training function
def projected_grad(decoder, device, input_vec, output , loss_fn,lr=0.01):
    """Attempt one gradient step on ``input_vec`` followed by a simplex projection.

    The vector is decoded with ``decoder``, the result is scored against
    ``output`` with ``loss_fn``, the loss is back-propagated, and a scaled
    gradient value is subtracted from the vector before it is passed through
    ``ot.utils.proj_simplex``.

    Args:
        decoder: module called on the input vector; switched to train mode here.
        device: accepted but not read in this body (the tensor is moved with
            ``.cuda()`` instead).
        input_vec: tensor to optimise. It is cloned and detached first, so the
            caller's tensor is not modified.
        output: target passed as the second argument of ``loss_fn``.
        loss_fn: callable returning the loss tensor.
        lr: step size applied to the gradient.

    Returns:
        The value returned by ``ot.utils.proj_simplex`` for the updated vector.
    """
    # input_vec = input_vec.cuda()
    # `requires_grad_` is called last, on the tensor that already lives on the
    # GPU, so `input_vec` is the tensor that has gradient tracking enabled.
    input_vec = input_vec.clone().detach().cuda().requires_grad_(True)
    # print(input_vec)
    decoder.train()
    # Decode data
    decoded_data = decoder(input_vec)
    # print("Decoded data = ",decoded_data)
    # Evaluate loss
    loss = loss_fn(decoded_data,output)
    # print("Loss = ",loss)
    # Backward pass
    loss.backward()
    # print("Shape of input vector = ",input_vec.shape)
    with torch.no_grad():
        # NOTE(review): indexing happens *before* `.grad` here, so `.grad` is
        # read from the indexed element rather than from `input_vec`, and it
        # comes back as None. Reading `input_vec.grad` first and indexing the
        # result (`input_vec.grad[0,1,1,1]`) yields the gradient value.
        dx = input_vec[0,1,1,1].grad
        print("DX = ",dx)
    # NOTE(review): this in-place update sits outside the `no_grad` block above;
    # confirm the intended indentation.
    input_vec -= lr*dx
    input_vec.grad.zero_()
    input_vec = ot.utils.proj_simplex(input_vec)
    # Print batch loss
    print('\t partial train loss (single batch): %f' % (loss.data))
    # NOTE(review): `train_loss` is not defined in the visible notebook code;
    # presumably a module-level list - verify.
    train_loss.append(loss.detach().cpu().numpy())
    return input_vec


from torch.autograd import Variable
# For up to 20 training batches: take the first mask of the batch, start from
# an all-zero latent vector, refine it with projected_grad() for at most
# `num_epochs` steps, then save the target and the decoded result as images.
num_epochs = 30
dataiter = iter(train_loader)
# plt.savefig() does not create missing directories.
os.makedirs("./inputs", exist_ok=True)
os.makedirs("./outputs", exist_ok=True)
# zip() stops at whichever runs out first, so a loader with fewer than 20
# batches ends the loop cleanly instead of raising StopIteration.
for i, batch in zip(range(20), dataiter):
  # Keep the target on the same device as the decoder for every loss call.
  img = batch[0].reshape((1,3,128,128)).to(device)
  # projected_grad() clones the vector and enables gradients itself, so a
  # plain tensor is enough here.
  input_vec = torch.zeros((1,16,16,16), device=device)
  for epoch in range(num_epochs):
    input_vec = projected_grad(decoder, device, input_vec, img, loss_fn, lr=0.01)
    # The forward pass below is only used for the stopping test and the
    # plot, so no graph is needed.
    with torch.no_grad():
      decoded_data = decoder(input_vec)
      loss = loss_fn(decoded_data, img)
    if loss<0.0001:
      break
  # Target mask, shown as a label image.
  imgplot = plt.imshow(img[0].cpu().numpy().argmax(0))
  plt.savefig("./inputs/input_"+str(i)+".png")
  plt.show()
  # Reconstruction from the optimised latent vector.
  imgplot2 = plt.imshow(decoded_data[0].cpu().detach().numpy().argmax(0))
  plt.savefig("./outputs/output_"+str(i)+".png")
  plt.show()

Any suggestions/ help would be appreciated!

ptrblck · October 29, 2022, 9:11pm

In your latest notebook you’ve changed the logic and are now indexing the input_vec.
Access the .grad attribute first, then index it:

with torch.no_grad():
    dx = input_vec.grad[0,1,1,1]
    print("DX = ",dx)

DX =  tensor(-0.0004, device='cuda:0')

Jimut_Bahan_Pal · October 30, 2022, 7:57am

You are the best !!! Thanks a lot for looking at the code and finding a solution to it!!