Gradient computation in custom backward

Hi @albanD,

When I tried to include autograd.grad in backward as above, autograd.grad wasn’t returning anything, even though it was getting executed. I don’t know why; could you please have a look?
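In case it helps, this is roughly the pattern I was attempting (a simplified sketch with a made-up auxiliary loss, not my actual code; ConvWithAuxLoss and aux_loss are just placeholder names):

import torch
import torch.nn.functional as F

class ConvWithAuxLoss(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input, weight, bias):
        # build a graph inside forward so it can be differentiated later
        with torch.enable_grad():
            output = F.conv2d(input, weight, bias)
            aux_loss = output.pow(2).mean()   # stand-in for my contrastive loss
        ctx.save_for_backward(input, weight, bias, aux_loss)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, weight, bias, aux_loss = ctx.saved_tensors
        grad_input = torch.nn.grad.conv2d_input(input.shape, weight, grad_output)
        grad_weight = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output)
        # this is the kind of call that executed but did not give me anything useful
        aux_grad_w, = torch.autograd.grad(aux_loss, weight, retain_graph=True)
        grad_weight = grad_weight + aux_grad_w
        grad_bias = grad_output.sum((0, 2, 3))
        return grad_input, grad_weight, grad_bias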

Then I tried a different approach:

import torch

class Custom_Convolution(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input, weight, bias, stride, padding):  # input (from the previous layer) has shape ([batch_size=100, 96, 8, 8])
        with torch.enable_grad():
            output = torch.nn.functional.conv2d(input, weight, bias, stride, padding)
            h = output.shape[2]
            w = output.shape[3]
            # output from forward has size ([batch_size, 128, 4, 4])

            output = output.view(output.shape[0], output.shape[1], -1)  # output size = ([batch_size, 128, 16])

            cont_loss = torch.tensor([0.]).to(dev).requires_grad_(True)
            for i in range(0, output.shape[0]):
                for f in range(len(output[i])):
                    Zi_unnormalized = output[i][f]
                    Zi = torch.nn.functional.normalize(Zi_unnormalized, dim=0)
                    # Zj and Zk are tensors made from output[i][*] and output[other than i][*]. Zj and Zk vary for each Zi (or f).

                    Zi_Zk = torch.tensor([0.]).to(dev).requires_grad_(True)
                    for k in Zk:
                        k = torch.nn.functional.normalize(k, dim=0)
                        zi_zk = ...
                        Zi_Zk = Zi_Zk.add(zi_zk)

                    # Similarly computing Zi_Zj
                    # Li = some algebra of Zi_Zj and Zi_Zk
                    # number of 'Li' values = output.shape[0] * output.shape[1]
                    cont_loss = cont_loss.add(Li)   # 1 value
        print("\n Loss: ", cont_loss, cont_loss.requires_grad)

        # weight1 = weight.clone().requires_grad_(True)
        # bias1 = bias.clone().requires_grad_(True)

        # weight.shape = ([128, 96, 5, 5])
        cont_loss_weight = torch.autograd.grad(outputs=cont_loss, inputs=weight, retain_graph=True)

        # bias.shape = ([128])
        cont_loss_bias = torch.autograd.grad(outputs=cont_loss, inputs=bias, retain_graph=True)

        output = output.view(output.shape[0], output.shape[1], h, w)
        ctx.save_for_backward(input, weight, bias, output, cont_loss, cont_loss_weight, cont_loss_bias)

        return output    # output's shape = ([batch_size=100, 128, 4, 4])

    @staticmethod
    def backward(ctx, grad_output):    # grad_output size = ([batch_size, 128, 4, 4])

        input, weight, bias, output, cont_loss, cont_loss_weight, cont_loss_bias = ctx.saved_tensors    # input size = ([batch_size, 96, 8, 8])
        grad_input = grad_weight = grad_bias = None

        if ctx.needs_input_grad[0]:
            grad_input = torch.nn.grad.conv2d_input(input.shape, weight, grad_output) #shape = ([batch_size,96,8,8])
              
        if ctx.needs_input_grad[1]:
            grad_weight = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output)  #shape = ([128,96,5,5])
            grad_weight += cont_loss_weight 
                        
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum((0,2,3))        #shape = ([128])         
            grad_bias += cont_loss_bias

        if bias is not None:
            return grad_input, grad_weight, grad_bias, None, None
        else:
            return grad_input, grad_weight, None, None, None

Then I observed that cont_loss_weight is a tuple containing two tensors, each of shape ([96, 5, 5]). I expected a tensor of shape ([128, 96, 5, 5]) instead of a tuple, and similarly for cont_loss_bias, a tensor of shape ([128]).
I don’t know why!
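Here is a toy check of what torch.autograd.grad gives me for a standalone tensor of the same shape as weight (this is not my actual layer, just a sanity check):

import torch

w = torch.randn(128, 96, 5, 5, requires_grad=True)
loss = (w ** 2).sum()
g = torch.autograd.grad(outputs=loss, inputs=w, retain_graph=True)
print(type(g), len(g), g[0].shape)
# <class 'tuple'> 1 torch.Size([128, 96, 5, 5])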

Moreover, when I do `cont_loss_weight = torch.autograd.grad(outputs=cont_loss, inputs=weight, retain_graph=True)`, I am guessing the grad_weight computed in backward will get affected. I also have to keep retain_graph=True.
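My understanding of why retain_graph=True is needed: the graph of cont_loss is differentiated twice, once for weight and once for bias, and the second call fails if the first one freed the graph. A minimal illustration with toy tensors (not my layer):

import torch

w = torch.randn(3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
loss = (w * b).sum()
gw = torch.autograd.grad(loss, w, retain_graph=True)   # keeps the graph alive
gb = torch.autograd.grad(loss, b)                      # second call on the same graph; fails without retain_graph above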

So, to avoid that, when I used a copy of the parameters, i.e. `cont_loss_weight = torch.autograd.grad(outputs=cont_loss, inputs=weight1, retain_graph=True)`, I got this error:

RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.
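A toy snippet seems to reproduce the same error, presumably because the clone is created after the graph and so never appears in the graph of cont_loss (again, toy tensors, not my layer):

import torch

w = torch.randn(3, requires_grad=True)
loss = (w * 2).sum()
w1 = w.clone().requires_grad_(True)   # copy made after the graph was built, so it is not part of it
torch.autograd.grad(loss, w1, retain_graph=True)
# RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.
# Set allow_unused=True if this is the desired behavior.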