is_leaf = True and requires_grad = True, but .grad is giving me None

class CNN_Model():
  def __init__(self, epochs, learning_rate):
    self.epochs = epochs
    self.learning_rate = learning_rate

    # conv and dense layers should be defined here
    # conv layer 1
    self.conv_1 = Conv2D(3,16,3,1,1)
     
    self.wbc_1_params = self.conv_1.weight_initialization()
     
    self.WCONV_1 = self.wbc_1_params[0]
    self.BCONV_1 = self.wbc_1_params[1]

    # Just for demonstration
    self.v = torch.empty(4,5, requires_grad=True)
    
    # conv layer 2
    self.conv_2 = Conv2D(16,32,3,1,1)
    self.wbc_2_params = self.conv_2.weight_initialization()

    #test
    self.WCONV_2 = self.wbc_2_params[0]
    self.BCONV_2 = self.wbc_2_params[1]
    
    # fully connected Layer
    self.fc_layer = FCLayer(sizes=[2048, 512, 128, 10], learning_rate= 0.1)

  def display_information(self):
    print("Number of epochs for training our CNN Model: ", self.epochs)
    print("Learning of our CNN Model: ", self.learning_rate)
  
  def forward(self, A_prev):                                          # A_prev= X_train

    # first convolution: input channels = 3, output channels = 16, kernel_size = 3*3, stride = 1, padding = 1
    # (the layer and its weights are created in __init__)
    W1 = self.WCONV_1
    b1 = self.BCONV_1

    # getting stride and padding 
    hparams = self.conv_1.get_hparameter()

    # performing convolution
    output, mem = self.conv_1.conv_forward(A_prev, self.WCONV_1, self.BCONV_1, hparams)    # output shape: 50000 * 32 * 32 * 16
    
    self.WCONV_1.retain_grad()
    self.BCONV_1.retain_grad()
    
    # passing the output through ReLU
    relu_output = self.conv_1.relu(output)                           # relu_output shape: 50000 * 32 * 32 * 16

    # performing MaxPool operation
    hparameters = {"filter" : 2,
               "stride": 2}
    pool_output, pool_mem = self.conv_1.pool_forward(relu_output, hparameters)    # pool_output shape: 50000 * 16 * 16 * 16
    

    # storing output and pool layer output to visualize feature
    conv_1_features = {
        'output': output,
        'mem': mem,
        'pool_output': pool_output,
        'pool_mem': pool_mem    
    }

    # second convolution: input channels = 16, output channels = 32, kernel_size = 3*3, stride = 1, padding = 1

    # getting stride and padding 
    hparams_2 = self.conv_2.get_hparameter()

    # performing convolution, passing the 1st layer's output as input
    output_2, mem_2 = self.conv_2.conv_forward(pool_output, self.WCONV_2, self.BCONV_2, hparams_2)         # output_2 shape: 50000 * 16 * 16 * 32

 
    self.WCONV_2.retain_grad()
    self.BCONV_2.retain_grad()

    # passing the output through ReLU
    relu_output_2 = self.conv_2.relu(output_2)                                # relu_output_2 shape: 50000 * 16 * 16 * 32

    # performing MaxPool operation
    
    hparameters_2 = {"filter" : 2,
               "stride": 2}
    
    pool_output_2, pool_mem_2 = self.conv_2.pool_forward(relu_output_2, hparameters_2)    # pool_output_2 shape: 50000 * 8 * 8 * 32

    
    # storing output and pool layer output to visualize feature
    conv_2_features = {
        'output': output_2,
        'mem': mem_2,
        'pool_output': pool_output_2,
        'pool_mem': pool_mem_2
    }

    ###             Conv -> ReLU -> Pool -> Flatten       ###################
    # flatten the pooled output: 50000 * 8 * 8 * 32 -> 50000 * 2048
    f_output = conv_2_features['pool_output']
    output_rows = torch.flatten(f_output, start_dim= 1)

    # pass through the fully connected layer (created in __init__)
    fc_output = self.fc_layer.forward_pass(output_rows)
    
    # returning feature maps from the two conv layers plus the FC output
    return conv_1_features, conv_2_features, fc_output  
    


  def train(self, X_train, Y_train):
    #print("This is train function ")
    start_time = time.time()

    #fc_layer = FCLayer(sizes=[2048, 512, 128, 10], learning_rate= self.learning_rate)

    # getting one-hot-encoded labels
    one_hot_labels = self.fc_layer.one_hot_encoding(Y_train)

    for iteration in range(self.epochs):
      conv1_features, conv2_features, output  = self.forward(X_train)
      loss = torch.sum(-one_hot_labels * torch.log(output))
      output.retain_grad()
      a_prev, wc1, bc1, hp = conv1_features['mem']
      # bc1.retain_grad()
      # loss.retain_grad()
      loss.backward()
      
      print(type(conv1_features['mem']))
      print(loss)
      print(loss.is_leaf)
      print(output.is_leaf)
      print(bc1.is_leaf)
      print(bc1.grad)
      
        
      acc = self.accuracy(X_train, Y_train)
        
      print('Epoch: {0}, Time spent: {1:.2f}s, Loss: {2:.2f}, Accuracy: {3:.2f}'.format(
          iteration+1, time.time() - start_time, loss, acc*100
      ))

    
    
      # backward pass to update weights and biases


  def predict(self, X_test):

    # forward pass
    conv1_features, conv2_features, output = self.forward(X_test)

    predicted_class = torch.argmax(output, dim=1)

    return predicted_class

  def accuracy(self, X_train, Y_train):
    
    predictions = []
    pred = self.predict(X_train)
    
    for i in range(len(Y_train)):
        if pred[i] == Y_train[i]:
            predictions.append(pred[i])
            
    acc = len(predictions) / len(Y_train) 
    
    return acc
        
           
    
   
  def grad_test(self):
    # p = self.v # this gives grad value
    p = self.BCONV_1
    q = p
    p1 = q+2
    p2 = p1*q
    p2 = p2.mean()
    return p2, q


I am trying to update my gradients after calculating the loss, but I am getting a None value every time, even though requires_grad = True and is_leaf = True. Below there is a grad_test() function; from that function I do get a value for .grad, but in my train() function I get None.
This code gives output like this:

tensor(3.2962, grad_fn=<SumBackward0>)
False
False
True
None

How can I solve this problem? Can anyone help me figure this out?
Thank you.

Hi,

This might not be the problem, but your class is not inheriting from nn.Module. Is this on purpose?

class CNN_Model(torch.nn.Module):
  def __init__(self, epochs, learning_rate):
      super().__init__()
      ...

I forgot to inherit from torch.nn.Module. I ran the code again with the class inheriting from torch.nn.Module, but it gives me the same output.

It seems you are using a lot of custom modules, which are not defined in the torch.nn namespace, so could you post a minimal, executable code snippet to reproduce the issue, please?


Hi,
Thanks for your reply. I was actually trying to build a CNN from scratch. I have rewritten the code and run it again, and now it gives False for is_leaf. An executable code snippet is below:

import torch
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.metrics import accuracy_score

transform = transforms.Compose([transforms.Resize((32,32)),
                                transforms.ToTensor(),
                                ])

train_set = datasets.CIFAR10(root='./data', train = True, download = True, transform = transform)
test_set = datasets.CIFAR10(root='./data', train = False, download = True, transform = transform)

# Getting numpy data
X_train  = train_set.data

#Converting to tensor for autograd
X_train = torch.from_numpy(X_train) 

# Getting numpy data
X_test  = test_set.data

#Converting to tensor for autograd
X_test = torch.from_numpy(X_test)

# Labels
Y_train= train_set.targets
Y_test = test_set.targets

# Normalizing our dataset
X_train = X_train.float()

# getting mean 
X_train_mean = torch.mean(X_train, dim= 0)

#getting variance
X_train_var = torch.var(X_train, dim=0)

#performing normalization-> normalize = (data - mean) / std
X_train = (X_train - X_train_mean) / torch.sqrt(X_train_var)

class FCLayer():
    def __init__(self, sizes, learning_rate):
        self.sizes = sizes
        self.learning_rate = learning_rate

        # we will save all parameters of our fc layer in this dictionary
        self.params = self.initialization()

  
    def sigmoid(self, x):
        return 1/(1+torch.exp(-x))

    def sigmoid_der(self, x):
        return self.sigmoid(x) *(1- self.sigmoid(x))

    def softmax(self, x):
        # Numerically stable with large exponentials
        exps = torch.exp(x - x.max())
        return exps / torch.sum(exps, axis=0)
 
    def initialization(self):
        #number of neurons in each layer 
        input_layer = self.sizes[0]
        hidden_layer_1 = self.sizes[1]
        hidden_layer_2 = self.sizes[2]
        output_layer = self.sizes[3]

        params = {
            'WH1': torch.rand(input_layer, hidden_layer_1, requires_grad= True) * torch.sqrt(torch.tensor(2) / (input_layer + hidden_layer_1)),
            'WH2': torch.rand(hidden_layer_1, hidden_layer_2, requires_grad= True) * torch.sqrt(torch.tensor(2) / (hidden_layer_1 + hidden_layer_2)),
            'WO': torch.rand(hidden_layer_2, output_layer, requires_grad= True) * torch.sqrt(torch.tensor(2) / (hidden_layer_2 + output_layer)),
            'BH1': torch.rand(hidden_layer_1, requires_grad= True) * torch.sqrt(torch.tensor(2)/ (input_layer + hidden_layer_2)),
            'BH2': torch.rand(hidden_layer_2, requires_grad= True) * torch.sqrt(torch.tensor(2) / (hidden_layer_1 + hidden_layer_2)),
            'BO' : torch.rand(output_layer, requires_grad= True) * np.sqrt(torch.tensor(2) / (hidden_layer_2 + output_layer))
            }
        return params

    def one_hot_encoding(self, Y_train):
        h = Y_train.shape[0]
        w = 10        # As CIFAR-10 has 10 classes
        one_hot_labels = torch.zeros((h,w))

        for i in range(h):
            one_hot_labels[i, Y_train[i]] = 1

        return one_hot_labels
    
    def forward_pass(self, x_train):
        params = self.params

        # input layer  (training samples)
        params['A'] = x_train

        # input layer to hidden layer 1
        params['ZH1'] = torch.matmul(params['A'], params['WH1']) + params['BH1']
        params['AH1'] = self.sigmoid(params['ZH1'])

        # hidden layer 1 to hidden layer 2
        params['ZH2'] = torch.matmul(params['AH1'], params['WH2']) + params['BH2']
        params['AH2'] = self.sigmoid(params['ZH2'])

        # hidden layer 2 to output layer
        params['ZO'] = torch.matmul(params['AH2'], params['WO']) + params['BO']
        params['AO'] = self.softmax(params['ZO'])

        return params['AO']

    def backward_pass(self, one_hot_labels, output):
        params = self.params

        # Calculate WO update
        dcost_dzo = output - one_hot_labels 
        dzo_dwo = params['AH2']
        dcost_wo = torch.matmul(dzo_dwo.T, dcost_dzo)
        dcost_bo = dcost_dzo

        # Calculate WH2 update
        dzo_dah2 = params['WO']
        dcost_dah2 = torch.matmul(dcost_dzo, dzo_dah2.T)
        dah2_dzh2 = self.sigmoid_der(params['ZH2'])
        dzh2_dwh2 = params['AH1']
        dcost_w2 = torch.matmul(dzh2_dwh2.T, dah2_dzh2 * dcost_dah2)
        dcost_bh2 = dcost_dah2 * dah2_dzh2

        # Calculate W1 update
        dzh2_dah1 = params['WH2']
        dcost_dzh2 = dcost_dah2 * dah2_dzh2 
        dcost_dah1 = torch.matmul(dcost_dzh2, dzh2_dah1.T)
        dah1_dzh1 = self.sigmoid_der(params['ZH1'])
        dzh1_dw1 = params['A']
        dcost_w1 = torch.matmul(dzh1_dw1.T, dcost_dah1 * dah1_dzh1)
        dcost_bh1 = dcost_dah1 * dah1_dzh1

        # updating weights and biases
        params['WH1'] -= self.learning_rate * dcost_w1
        params['BH1'] -= self.learning_rate * dcost_bh1.sum(axis=0)
    
        params['WH2'] -= self.learning_rate * dcost_w2
        params['BH2'] -= self.learning_rate * dcost_bh2.sum(axis=0)
    
        params['WO'] -= self.learning_rate * dcost_wo
        params['BO'] -= self.learning_rate * dcost_bo.sum(axis=0)

# taking some training examples to see our training
demo_train = []
for i in range(3):
    demo_train.append(X_train[i])

demo_train = torch.stack(demo_train, dim=0)  


# Now let's take corresponding labels for those training examples 
demo_label = []
for i in range(3):
    demo_label.append(Y_train[i])

demo_label = np.asarray(demo_label)

class Conv2D():
    def __init__(self, input_chanel, output_chanel, kernel_size, stride_size, padding_size):
        self.input_chanel = input_chanel
        self.output_chanel = output_chanel
        self.kernel_size = kernel_size
        self.stride_size = stride_size
        self.padding_size = padding_size
        
    def show_details(self):
        print('Input Size: ', self.input_chanel)
        print('Output Size: ', self.output_chanel)
        print('Kernel Size: ', self.kernel_size)
        print('Stride Size: ', self.stride_size)
        print('Padding Size: ', self.padding_size)
  
    def weight_initialization(self):
        # initialize weights (Xavier-style initialization)
        w = torch.randn(self.kernel_size, self.kernel_size, self.input_chanel, self.output_chanel, requires_grad= True) /9.0     # an RGB image has 3 channels
        b = torch.randn(1,1,1, self.output_chanel, requires_grad= True) /9.0

        params = {
            'weights': w,
            'bias': b
        }

        return params

    
    def zero_pad(self, X):
        """
        Argument:
        X-- python numpy array of shape (n_H, n_W, n_C) -> n_H: height, n_W: width, n_C: chanel
        pad-- integer, amount of padding around each image in vertical and horizontal

        Return:
        X_pad-- padded image of shape(m, n_H, n_W, n_C) -> m is the total number of images
        """
        # we will apply padding only n_H and n_W not in m and n_C
        X = X.detach().numpy()
        
        X_pad = np.pad(X, ((0,0), (self.padding_size,self.padding_size), (self.padding_size, self.padding_size), (0,0)), 'constant', constant_values = (0,0))
        X_pad = torch.from_numpy(X_pad)

        return X_pad

    #function to perform a single convolution step
    def conv_single_step(self, a_slice_prev, W, b):
        """
        Applies one filter defined by W to a single slice (a_slice_prev) of the output activation
        of the previous layer.

        Arguments:
        a_slice_prev -- slice of input data of shape (f, f, n_C_prev) -> f: height/width, n_C_prev: channels of the previous layer
        W -- weight parameters contained in a window - matrix of shape (f, f, n_C_prev)
        b -- bias parameter contained in a window - matrix of shape (1, 1, 1)

        Returns:
        Z -- a scalar value, the result of convolving the sliding window (W, b) with a slice x of the input data
        """
        # element-wise multiplication between a_slice_prev and W
        matrix = torch.mul(a_slice_prev, W)
        
        # sum up all the elements of the matrix
        total = torch.sum(matrix)
        
        # adding the bias to get the final output Z
        Z = total + b.float()

        return Z
  

    def get_hparameter(self):
        hparam = {
            'stride': self.stride_size,
            'pad': self.padding_size
        }
        return hparam

    def conv_forward(self, A_prev, W, b, hparameters):
        """
        Implementing the forward propagation for convolution

        Arguments:

        A_prev -- output activations of the previous layer, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)
        m-> total number inputs
        n_H_prev-> height of previous layers input
        n_W_prev-> width of previous layers input
        n-C_prev-> chanel of previous layers input 

        W -- Weights, numpy array of shape (f, f, n_C_prev, n_C)
        f-> spatial extend of kernels/filter
        n_C_prec-> chanel of previous layers input
        n_C-> dept (how many filters we want to use for next layer)

        b -- Biases, numpy array of shape (1, 1, 1, n_C)
        n_C-> dept (how many filters we want to use for next layer)

        hparameters -- python dictionary containing "stride" and "pad" 

        Returns:

        Z -- conv output, numpy array of shape (m, n_H, n_W, n_C)
        m-> total number outputs will be generated in next layer(output layer) 
        n_H -> height of next layers output
        n_W-> width of next layers output
        n-C -> dept (how many filters we want to use for next layer)

        mem -- cache of values needed for the conv_backward() function 

        """

        # getting dimensions from A_prev's shape (shape of input data)
        (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape

        # getting dimensions of W's shape
        (f, f, n_C_prev, n_C) = W.shape

        # getting parameters value from hparameter dictionary
        stride = hparameters['stride']
        pad = hparameters['pad']

        # computing dimensions of the conv2D output volume using a formula (Ref: CS231n)[lecture slide 4]
        n_H = int((n_H_prev + 2*pad - f)/stride) + 1
        n_W = int((n_W_prev + 2*pad - f)/stride) + 1
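        # e.g. for a 32x32 CIFAR-10 input with f = 3, pad = 1, stride = 1:
        # n_H = (32 + 2*1 - 3)//1 + 1 = 32, so the spatial size is preserved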

        # Initialize output volume with zeros
        Z = torch.zeros([m, n_H, n_W, n_C])

        # Applying zero-padding to our input data A_prev
        A_prev_pad = self.zero_pad(A_prev)

        # now we will use for loop to perform conv2D

        #loop over the total training examples
        for i in range(m):
            # taking i'th training example from zero_padded data
            a_prev_pad = A_prev_pad[i,:,:,:]
            for height in range(n_H):
                for width in range(n_W):
                    for chanel in range(n_C):
                        # finding the four corners of the current slice
                        vert_start = height * stride
                        vert_end = height* stride+ f
                        horiz_start = width * stride 
                        horiz_end = width* stride+ f

                        # with these four corners we define the current slice of a_prev_pad 
                        a_slice_prev = a_prev_pad[vert_start:vert_end,horiz_start:horiz_end,:]      # taking all input channels

                        # performing single convolution to each slice with W and b for getting our output 
                        Z[i, height, width, chanel] = self.conv_single_step(a_slice_prev, W[:, :, :, chanel], b[:,:,:,chanel])

        # To make sure our output shape is correct
        assert(Z.shape == (m, n_H, n_W, n_C))

        # Save information in "mem" as cache for the backprop
        mem = (A_prev, W, b, hparameters)

        return Z, mem

    def relu(self, Z):
        relu_Z = torch.maximum(torch.tensor(0), Z)     # element-wise
        return relu_Z

    # Max pooling 
    def pool_forward(self, A_prev, hparameters):
        """
        Implementing the forward pass of pooling layer

        Arguments:
        A_prev -- Input data, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)
        hparameters -- python dictionary containing "filter" and "stride"
        f -> spatial extent of the filter

        Returns:
        A -- output of the pool layer, a numpy array of shape (m, n_H, n_W, n_C)
        mem -- cache used in the backward pass of the pooling layer, contains the input and hparameters 

        """
        # getting input dimensions
        (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape

        # retrieving hyper parameters from hparameters
        f = hparameters['filter']
        stride = hparameters['stride']

        # Computing dimensions for output volume
        n_H = int(1 + (n_H_prev - f) / stride)
        n_W = int(1 + (n_W_prev - f) / stride)
        n_C = n_C_prev
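        # e.g. a 32x32 input with f = 2, stride = 2 gives n_H = 1 + (32 - 2)//2 = 16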

        # initialize output volume with zeros as A
        A = torch.zeros((m, n_H, n_W, n_C))
        
        for i in range(m):
            for height in range(n_H):
                for width in range(n_W):
                    for chanel in range(n_C):
                        # finding the four corners
                        vert_start = height * stride
                        vert_end = height* stride+ f
                        horiz_start = width * stride 
                        horiz_end = width* stride+ f

                        # with these four corners we define the current slice of A_prev for the ith training example, channel c
                        a_prev_slice = A_prev[i, vert_start:vert_end,horiz_start:horiz_end, chanel]
                        
                        # getting the maximum value from the slice
                        A[i, height, width, chanel] = torch.max(a_prev_slice)  

        # Making sure our output shape is correct
        assert(A.shape == (m, n_H, n_W, n_C))

        # Store the input and hparameters in "cache" for pool_backward()
        mem = (A_prev, hparameters)

        return A, mem

class CNN_Model():
    def __init__(self, epochs, learning_rate):
        
        self.epochs = epochs
        self.learning_rate = learning_rate
    
        # conv layer 1
        self.conv_1 = Conv2D(3,16,3,1,1)
        
        # weight initialization
        self.wbc_1_params = self.conv_1.weight_initialization()
        self.WCONV_1 = self.wbc_1_params['weights']
        self.BCONV_1 = self.wbc_1_params['bias']

        # conv layer 2
        self.conv_2 = Conv2D(16,32,3,1,1)
        
        # weight initialization
        self.wbc_2_params = self.conv_2.weight_initialization()
        self.WCONV_2 = self.wbc_2_params['weights']
        self.BCONV_2 = self.wbc_2_params['bias']

        # fully connected Layer
        self.fc_layer = FCLayer(sizes=[2048, 512, 128, 10], learning_rate= 0.1)

  
    def forward(self, A_prev):                                          # A_prev= X_train
        W1 = self.WCONV_1
        b1 = self.BCONV_1

        # getting stride and padding 
        hparams = self.conv_1.get_hparameter()

        # performing convolution
        output, mem = self.conv_1.conv_forward(A_prev, W1, b1, hparams)    # output shape: 50000 * 32 * 32 * 16

    
        # passing the output through ReLU
        relu_output = self.conv_1.relu(output)                           # relu_output shape: 50000 * 32 * 32 * 16

        # performing MaxPool operation
        hparameters = {"filter" : 2,
                       "stride": 2}
        
        pool_output, pool_mem = self.conv_1.pool_forward(relu_output, hparameters)    # pool_output shape: 50000 * 16 * 16 * 16
    

        # storing output and pool layer
        conv_1_features = {
            'output': output,
            'mem': mem,
            'pool_output': pool_output,
            'pool_mem': pool_mem    
            }

        W2 = self.WCONV_2
        b2 = self.BCONV_2

        # getting stride and padding 
        hparams_2 = self.conv_2.get_hparameter()

        # performing convolution, passing the 1st layer's output as input
        output_2, mem_2 = self.conv_2.conv_forward(pool_output, W2, b2, hparams_2)         # output_2 shape: 50000 * 16 * 16 * 32

        # passing the output through ReLU
        relu_output_2 = self.conv_2.relu(output_2)                                # relu_output_2 shape: 50000 * 16 * 16 * 32

        # performing MaxPool operation
    
        hparameters_2 = {"filter" : 2,
                        "stride": 2}
    
        pool_output_2, pool_mem_2 = self.conv_2.pool_forward(relu_output_2, hparameters_2)    # pool_output_2 shape: 50000 * 8 * 8 * 32

    
        # storing output and pool layer output to visualize feature
        conv_2_features = {
            'output': output_2,
            'mem': mem_2,
            'pool_output': pool_output_2,
            'pool_mem': pool_mem_2
            }

 
        # flatten the pooled output for the FC layer: 8 * 8 * 32 = 2048 features
        f_output = conv_2_features['pool_output']

        output_rows = torch.flatten(f_output, start_dim= 1)

        fc_output = self.fc_layer.forward_pass(output_rows)
    
        # returning feature maps from the two conv layers plus the FC output
        return conv_1_features, conv_2_features, fc_output

    def train(self, X_train, Y_train):
        start_time = time.time()

        # getting one-hot-encoded labels
        one_hot_labels = self.fc_layer.one_hot_encoding(Y_train)

        for iteration in range(self.epochs):
            conv1_features, conv2_features, output = self.forward(X_train)
            
            loss = torch.sum(-one_hot_labels * torch.log(output))
                             
            a_prev, wc1, bc1, hp1 = conv1_features['mem']
            aa_prev, wc2, bc2, hp2 = conv2_features['mem']
            wc1.retain_grad()
            bc1.retain_grad()
            loss.backward()
            
            print(loss)
            print(loss.is_leaf)
            print(output.is_leaf)
            print(bc1.is_leaf)
            print(self.wbc_1_params['bias'])
            print(self.wbc_1_params['bias'].is_leaf)

        acc = self.accuracy(X_train, Y_train)
        
        print('Epoch: {0}, Time spent: {1:.2f}s, Loss: {2:.2f}, Accuracy: {3:.2f}'.format(
          iteration+1, time.time() - start_time, loss, acc*100
            ))

    def predict(self, X_test):
        # forward pass
        conv1_features, conv2_features, output = self.forward(X_test)

        predicted_class= torch.argmax(output, dim= 1)
        return predicted_class
    
    def accuracy(self, X_train, Y_train):
        
        predictions = []
        pred = self.predict(X_train)

        for i in range(len(Y_train)):
            if pred[i] == Y_train[i]:
                predictions.append(pred[i])

        acc = len(predictions) / len(Y_train) 

        return acc
           

Now, if I run this, I get the following output:

tensor(3.2958, grad_fn=<SumBackward0>)
False
False
False
tensor([[[[ 0.0255,  0.1655, -0.1032,  0.0785, -0.0081,  0.0402,  0.1369,
           -0.1066, -0.0874,  0.1110,  0.0014, -0.1300,  0.0657, -0.0220,
            0.0562,  0.1101]]]], grad_fn=<DivBackward0>)
False

Your params dict contains tensors which are not leaf tensors anymore, since you are executing an operation on the original leaf:

        params = {
            'WH1': torch.rand(input_layer, hidden_layer_1, requires_grad= True) * torch.sqrt(torch.tensor(2) / (input_layer + hidden_layer_1)),
            'WH2': torch.rand(hidden_layer_1, hidden_layer_2, requires_grad= True) * torch.sqrt(torch.tensor(2) / (hidden_layer_1 + hidden_layer_2)),
            'WO': torch.rand(hidden_layer_2, output_layer, requires_grad= True) * torch.sqrt(torch.tensor(2) / (hidden_layer_2 + output_layer)),
            'BH1': torch.rand(hidden_layer_1, requires_grad= True) * torch.sqrt(torch.tensor(2)/ (input_layer + hidden_layer_2)),
            'BH2': torch.rand(hidden_layer_2, requires_grad= True) * torch.sqrt(torch.tensor(2) / (hidden_layer_1 + hidden_layer_2)),
            'BO' : torch.rand(output_layer, requires_grad= True) * np.sqrt(torch.tensor(2) / (hidden_layer_2 + output_layer))
            }

Create the actual tensor before calling .requires_grad_() on it.
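
For example, a minimal sketch of the difference, using the WH1 shape as an illustration (the same applies to every entry in params):

import torch

input_layer, hidden_layer_1 = 2048, 512
scale = torch.sqrt(torch.tensor(2.0) / (input_layer + hidden_layer_1))

# non-leaf: the multiplication creates a new tensor with a grad_fn,
# so autograd will not populate its .grad attribute by default
w_bad = torch.rand(input_layer, hidden_layer_1, requires_grad=True) * scale
print(w_bad.is_leaf)      # False

# leaf: create the final tensor first, then mark it as requiring gradients
w_good = (torch.rand(input_layer, hidden_layer_1) * scale).requires_grad_()
print(w_good.is_leaf)     # True

loss = w_good.sum()
loss.backward()
print(w_good.grad.shape)  # torch.Size([2048, 512]) -- no longer None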

Thank you for your feedback. I will try implementing it that way.