Gradient computation in custom backward

How can I get the gradient of each ‘Li’ w.r.t. its corresponding ‘Zi_unnormalized’, so that these gradients form a tensor with the same shape as feat?

    ....
    @staticmethod
    def backward(ctx, grad_output):    # grad_output size = ([batch_size, 128,4,4])
        
        input, weight, bias, output = ctx.saved_tensors    #input size = ([batch_size, 96,8,8])
      
        feat = output.clone().detach()   # output from forward with size = ([batch_size, 128,4,4])

        features = feat.view(feat.shape[0], feat.shape[1], -1) # features size = ([batch_size, 128,16])

        cont = torch.tensor([0.]).to(dev)
        for i in range(features.shape[0]):
              for f in range(len(features[i])):
                  Zi_unnormalized = features[i][f].requires_grad_(requires_grad=True)
                  Zi = torch.nn.functional.normalize(Zi_unnormalized, dim = 0)
                  # Zj and Zk are tensors made from features[i][*] and features[other than i][*]. Zj and Zk varies for each Zi (or f)

                  Zi_Zk = torch.Tensor([0]).to(dev)
                  for k in Zk:
                      k= torch.nn.functional.normalize(k, dim = 0)
                      zi_zk = torch.exp(torch.div(torch.dot(Zi,k.T), 0.07))
                      Zi_Zk = Zi_Zk.add(zi_zk)

                  # Similarly computing Zi_Zj
                  # Li = some algebra of Zi_Zj and Zi_Zk
                  # number of 'Li' values =  features.shape[0] * features.shape[1]
                  cont = cont.add(Li)   # 1 value

I tried autograd backward, but I get a RuntimeError every time. Then I saw that the grad_fn of Zi always remains None!!

Hi,

It is hard to say what the backward should be without knowing what the forward is.
Also could you be more precise on what kind of error you’re seeing when trying to use your custom Function?

Hi @albanD , thank you for reaching out.

I want to get these:
grad_weight += cont_loss_weight and grad_bias += cont_loss_bias.

For this I have two ideas in mind, however I don’t know which one is correct (please suggest!).
Idea-1:

# keeping remaining code unchanged
if ctx.needs_input_grad[1]:
            grad_weight = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output)  #shape = ([128,96,5,5])
            grad_feat = torch.autograd.grad(cont_loss, feat)         # has to be of shape = ([batch_size=100, 128, 4, 4])
            cont_loss_weight = torch.nn.grad.conv2d_weight(input, weight.shape, grad_feat)
            grad_weight += cont_loss_weight

# But not sure how to get `cont_loss_bias` in same manner.
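(One hedged guess for the bias counterpart, using the fact that a conv2d's bias gradient is just the output gradient summed over the batch and spatial dimensions; grad_feat is assumed here to already be unpacked to a plain tensor, since torch.autograd.grad actually returns a tuple:)

            cont_loss_bias = grad_feat.sum((0, 2, 3))   # shape = ([128]), mirrors grad_output.sum((0,2,3))
            grad_bias += cont_loss_bias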

Idea-2:

# keeping remaining code unchanged
if ctx.needs_input_grad[1]:
            grad_weight = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output)  #shape = ([128,96,5,5])
            cont_loss_weight = torch.autograd.grad(outputs= cont_loss, inputs= weight, retain_graph=(True))
            grad_weight += cont_loss_weight
            
if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum((0,2,3))        #shape = ([128])
            cont_loss_bias = torch.autograd.grad(outputs= cont_loss, inputs= bias, retain_graph=(True))
            grad_bias += cont_loss_bias          
         

Here’s the code :

class Custom_Convolution(torch.autograd.Function):    
    
    @staticmethod
    def forward(ctx, input, weight, bias, stride, padding):  #input(from previous layer)'s shape = ([batch_size=100, 96, 8, 8])
        output = torch.nn.functional.conv2d(input, weight, bias, stride, padding)  
        ctx.save_for_backward(input, weight, bias, output)
        return output    #output's shape = ([batch_size=100, 128, 4, 4])

    @staticmethod
    def backward(ctx, grad_output):    # grad_output size = ([batch_size, 128,4,4])
        
        input, weight, bias, output = ctx.saved_tensors    #input size = ([batch_size, 96,8,8])
      
        feat = output.clone()#.requires_grad_(True)   # output from forward with size = ([batch_size, 128,4,4])

        feat = feat.view(feat.shape[0], feat.shape[1], -1) # features size = ([batch_size, 128,16])

        cont = torch.tensor([0.]).to(dev)
        for i in range(feat.shape[0]):
              for f in range(len(feat[i])):
                  Zi_unnormalized = feat[i][f]
                  Zi = torch.nn.functional.normalize(Zi_unnormalized, dim = 0)
                  # Zj and Zk are tensors made from feat[i][*] and feat[other than i][*]. Zj and Zk varies for each Zi (or f)

                  Zi_Zk = torch.Tensor([0]).to(dev)
                  for k in Zk:
                      k= torch.nn.functional.normalize(k, dim = 0)
                      zi_zk = ...
                      Zi_Zk = Zi_Zk.add(zi_zk)

                  # Similarly computing Zi_Zj
                  # Li = some algebra of Zi_Zj and Zi_Zk
                  # number of 'Li' values =  feat.shape[0] * feat.shape[1]
                  cont = cont.add(Li)   # 1 value

        grad_input = grad_weight = grad_bias = None

        if ctx.needs_input_grad[0]:
            grad_input = torch.nn.grad.conv2d_input(input.shape, weight, grad_output) #shape = ([batch_size,96,8,8])
        
        # If I go with Idea-2        
        if ctx.needs_input_grad[1]:
            grad_weight = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output)  #shape = ([128,96,5,5])
            cont_loss_weight = torch.autograd.grad(outputs= cont_loss, 
                                            inputs= weight, retain_graph=(True))
            grad_weight += cont_loss_weight
            
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum((0,2,3))        #shape = ([128])
            cont_loss_bias = torch.autograd.grad(outputs= cont_loss, inputs= bias, retain_graph=(True))
            grad_bias += cont_loss_bias
            
        if bias is not None:
            return grad_input, grad_weight, grad_bias, None, None
        else:
            return grad_input, grad_weight, None, None, None   # one gradient per forward input
 

It gives:
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

If I put feat = output.clone().requires_grad_(True), it gives :

RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.

Hi,

Thanks for the code sample but you still didn’t say what you’re trying to accomplish here.

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

This means that the output of your function does not require gradients. You need to make sure that at least one of the input Tensors requires gradients.

feat = output.clone().requires_grad_(True)

This would just make the output require gradients; it won’t make autograd track the operations that happened before it.
You should have your input requiring gradients so that you can compute gradients for it.
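Here is a minimal sketch (not your exact code) of the usual pattern: detach the Tensors you want gradients for so they become leaves, mark them as requiring grad, rebuild the computation under torch.enable_grad(), and then call autograd.grad on that fresh graph. The fixed stride/padding and the stand-in loss below are placeholders:

import torch

def params_grad_via_recompute(input, weight, bias, stride=1, padding=0):
    # Detached copies are leaves of a new graph, so autograd.grad can reach them.
    weight_ = weight.detach().requires_grad_(True)
    bias_ = bias.detach().requires_grad_(True)
    with torch.enable_grad():   # backward() normally runs with grad mode disabled
        out = torch.nn.functional.conv2d(input, weight_, bias_, stride, padding)
        loss = out.pow(2).mean()   # stand-in for the contrastive loss built from `out`
    # Returns a tuple: (d loss / d weight_, d loss / d bias_)
    return torch.autograd.grad(loss, (weight_, bias_))

# Usage with the shapes from this thread
x = torch.randn(2, 96, 8, 8)
w = torch.randn(128, 96, 5, 5)
b = torch.randn(128)
gw, gb = params_grad_via_recompute(x, w, b)
print(gw.shape, gb.shape)   # torch.Size([128, 96, 5, 5]) torch.Size([128])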

Let me break down the problem.

I have a network of, say, 5 convolutional layers, wherein only the 4th layer is a custom convolution layer, as you saw in the code above.

The output shape from the forward of this 4th, i.e. custom, layer is ([batch_size, 128, 4, 4]).
Now, I have to include an additional loss (apart from the cross-entropy at the end of the network, which is usual in an image classification task) in the backward computation of the layer itself. This loss takes in features (Zi, Zj, Zk) from that output, so that the network learns them.

Sorry, there is one typo. Correction: cont_loss = torch.tensor([0.]).to(dev) and cont_loss = cont_loss.add(Li)

cont_loss is the additional loss that you have noticed in the code. It is only to include this loss that I have to manipulate the backward, so that everything works as usual when I call loss.backward() in the training loop.

That’s why I have these two additional terms which need to be taken care of:
grad_weight += cont_loss_weight and grad_bias += cont_loss_bias.

But, I don’t know which of my idea is correct that I wrote before.

I hope I could elaborate.

Could you explain why you have to include this in the backward of the layer itself?
Why can’t it be an additional term of your loss that you backprop as usual (ce_loss + additional_loss).backward() ?
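Something along these lines, as a rough sketch; `model` returning the 4th layer's features and the reduction inside extra_term are placeholders for your own network and your Li computation:

import torch
import torch.nn.functional as F

def extra_term(feat, tau=0.07):
    # Placeholder for the Li-based loss: normalize the per-channel features and
    # build a scalar from temperature-scaled dot products (not the paper's exact formula).
    z = F.normalize(feat.view(feat.shape[0], feat.shape[1], -1), dim=-1)
    sim = torch.exp(torch.matmul(z, z.transpose(1, 2)) / tau)   # exp(z_i . z_k / tau)
    return sim.mean()

def training_step(model, images, targets, optimizer):
    logits, layer4_out = model(images)   # `model` is assumed to return (logits, layer4_features)
    loss = F.cross_entropy(logits, targets) + extra_term(layer4_out)
    optimizer.zero_grad()
    loss.backward()      # one backward pass handles both terms
    optimizer.step()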

Well, that loss is specific to that layer only. I don’t have a concrete argument for that, but I was advised to implement the loss within the layer’s backward only. So, I have to do it that way (no option! :zipper_mouth_face:)

But if that loss term is only influenced by that layer’s weights, the gradient corresponding to that part of the loss will only influence that layer’s weights.

Also you still haven’t shared any formula of what you’re trying to compute so it is hard to say :confused:

Yes, you are right. It will only influence that layer’s weights.

Sorry. You can see the loss in equations (2) & (3), and the gradient computation on page 13:
https://arxiv.org/pdf/2004.11362.pdf
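(Writing equation (2) from memory, so please double-check against the paper; as far as I recall it is

$$\mathcal{L}^{sup}_{out} = \sum_{i \in I} \frac{-1}{|P(i)|} \sum_{p \in P(i)} \log \frac{\exp(z_i \cdot z_p / \tau)}{\sum_{a \in A(i)} \exp(z_i \cdot z_a / \tau)}$$

where $P(i)$ are the positives of anchor $i$, $A(i)$ are all the other samples in the batch, and $\tau$ is the temperature, 0.07 in my code.)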

For such a problem, it will definitely be much easier to let autograd figure out the gradients. In particular because you might not have all the information you need during the backward of that layer, as the loss will depend on the layers that appear after this one.

Note as well that a good resource is other implementations of contrastive losses for PyTorch: https://github.com/topics/contrastive-loss

As far as I have seen other implementations, a separate loss function is created, as you mentioned. But those implementations use two separate networks: one trained with the contrastive loss, and then fine-tuning with the other one for the classification task with the CE loss.

But in my case I need to proceed with only one network, with the contrastive loss included in the backward of one layer, so that the network learns the features of that particular layer and updates its weights accordingly. That’s why there are these terms in the backward:

grad_weight += cont_loss_weight and grad_bias += cont_loss_bias

So, I have to stick to this approach :hugs: but I haven’t seen any such implementation or thread regarding this. :man_shrugging:

In doing so, what @ptrblck mentioned will happen: the layers before the custom layer will also have gradients of the contrastive loss accumulated, in addition to those of the CE loss. But I want to restrict this gradient accumulation (of contrastive + CE loss) to the custom layer.

Probably, this would justify why I am doing it this way. :smile:

In that case, you can use the nightly version of PyTorch and use the new inputs argument to the .backward() function:

net.zero_grad()
ce_loss.backward()
additional_loss.backward(inputs=net.your_contrastive_layer.parameters())
opt.step()

So for doing this you mean I should create a separate contrastive loss function instead of doing it in the layer backward, and then use the nightly-version code that you mentioned. Right?

And can you please explain what this code will do. additional_loss.backward(inputs=net.your_contrastive_layer.parameters())

I tried to install the nightly version from here.
But I encountered this error:

EnvironmentNotWritableError: The current user does not have write permissions to the target environment.
  environment location: C:\ProgramData\Anaconda3

Looks like I don’t have permission from the admin of the PC I’m using.

So for doing this you mean I should create a separate contrastive loss function instead of doing it in the layer backward.

I think it is going to be simpler than modifying the backward and writing the gradients yourself yes.

And can you please explain what this code will do.

It will run the backward as usual but will only update the .grad fields of the inputs that were given. So in your case, you only want to update the parameters of that one layer.
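A tiny example of that behaviour (run on a version that has the inputs argument):

import torch

a = torch.randn(3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
loss = (a * b).sum()
loss.backward(inputs=[b])   # only `b` is listed
print(a.grad)               # None: `a` was not in `inputs`, so its .grad is untouched
print(b.grad)               # equals `a`, i.e. d(loss)/d(b)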

Looks like I don’t have permission from the admin of the PC I’m using.

You might want to create a new environment in your conda so that you can install things.

Thanks for the explanation. :hugs:

I will have to look into this.

But if I stick to the code above, what do you think is the correct way to calculate cont_loss_weight: Idea-1 or Idea-2?

And what changes should I make in the code of backward to run it successfully, any suggestions?

I tried this as well without the nightly version, but the GPU went out of memory. :persevere:

RuntimeError: CUDA out of memory.

But if I stick to the code above, what do you think is the correct way to calculate cont_loss_weight: Idea-1 or Idea-2?

I honestly don’t know. You will need to derive the mathematical formula for what the gradient should be with pen and paper first. Then implement the final formula you get in there.

I tried this as well without the nightly version, but the GPU went out of memory. :persevere:

Does it run out at the first iteration? Or after a while?
Can you try reducing the batch size to reduce memory pressure?

In the research paper that I showed you, there is a gradient equation (i.e. the differentiation of Li w.r.t. Zi for every i-th feature, if you compare with my code), but I don’t know why it is taken w.r.t. every feature instead of w.r.t. the parameters (weight and bias). Contradictorily, the loss formula doesn’t use the parameters directly, yet the features that go into the loss formula are the result of the mathematics of the input and the parameters!
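(I guess the two views are connected by the chain rule, since each $Z_i$ is itself a function of the weight and bias:

$$\frac{\partial \mathcal{L}}{\partial W} = \sum_{i} \frac{\partial L_i}{\partial Z_i}\,\frac{\partial Z_i}{\partial W}, \qquad \frac{\partial \mathcal{L}}{\partial b} = \sum_{i} \frac{\partial L_i}{\partial Z_i}\,\frac{\partial Z_i}{\partial b},$$

so the paper only needs to give $\partial L_i / \partial Z_i$; autograd, or conv2d_weight applied to the feature gradient as in Idea-1, would supply the $\partial Z_i / \partial W$ part.)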

If I use a smaller batch size then I can see results for the first few batches, but I suspect it would still go out of memory after some more iterations.
However, this loss function yields good results for bigger batch sizes. So I tried with a bigger batch size, but it went out of memory after some time without showing the first result.

I’m seeing some unusual output for the code below; can you have a look please!

class Custom_Convolution(torch.autograd.Function):    
    
    @staticmethod
    def forward(ctx, input, weight, bias, stride, padding):  #input(from previous layer)'s shape = ([batch_size=100, 96, 8, 8])
        output = torch.nn.functional.conv2d(input, weight, bias, stride, padding)  
        ctx.save_for_backward(input, weight, bias, output)
        return output    #output's shape = ([batch_size=100, 128, 4, 4])

    @staticmethod
    def backward(ctx, grad_output):    # grad_output size = ([batch_size, 128,4,4])
        
        input, weight, bias, output = ctx.saved_tensors    #input size = ([batch_size, 96,8,8])  
        grad_input = grad_weight = grad_bias = None

        if ctx.needs_input_grad[0]:
            grad_input = torch.nn.grad.conv2d_input(input.shape, weight, grad_output) #shape = ([batch_size,96,8,8])
              
        if ctx.needs_input_grad[1]:
            grad_weight = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output)  #shape = ([128,96,5,5])
                        
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum((0,2,3))        #shape = ([128])          
           
        with torch.enable_grad():
               feat = output.clone()   # output from forward with size = ([batch_size, 128,4,4])

               feat = feat.view(feat.shape[0], feat.shape[1], -1) # features size = ([batch_size, 128,16])
       
               cont_loss = torch.tensor([0.]).to(dev)
               for i in range(0, feat.shape[0]):
                   for f in range(len(feat[i])):
                       Zi_unnormalized = feat[i][f]
                       Zi = torch.nn.functional.normalize(Zi_unnormalized, dim = 0)
                       # Zj and Zk are tensors made from feat[i][*] and feat[other than i][*]. Zj and Zk varies for each Zi (or f)

                       Zi_Zk = torch.Tensor([0]).to(dev)
                       for k in Zk:
                           k= torch.nn.functional.normalize(k, dim = 0)
                           zi_zk = ...
                           Zi_Zk = Zi_Zk.add(zi_zk)

                       # Similarly computing Zi_Zj
                       # Li = some algebra of Zi_Zj and Zi_Zk
                       # number of 'Li' values =  feat.shape[0] * feat.shape[1]
                       cont_loss = cont_loss.add(Li)   # 1 value
               print("\n Loss: ", cont_loss, cont_loss.requires_grad)

### This line of printing loss keeps on repeating with the same value of Loss!!!! 


        cont_loss_weight = torch.autograd.grad(outputs= cont_loss, inputs= weight, retain_graph=(True))
        print("Shape:", cont_loss_weight.shape)
        grad_weight += cont_loss_weight

        cont_loss_bias = torch.autograd.grad(outputs= cont_loss, inputs= bias, retain_graph=(True))
        grad_bias += cont_loss_bias

        if bias is not None:
            return grad_input, grad_weight, grad_bias, None, None
        else:
            return grad_input, grad_weight, None, None, None

Output :

Loss:  tensor([37.218], device='cuda:0', grad_fn=<AddBackward0>) True
Loss:  tensor([37.218], device='cuda:0', grad_fn=<AddBackward0>) True
Loss:  tensor([37.218], device='cuda:0', grad_fn=<AddBackward0>) True
Loss:  tensor([37.218], device='cuda:0', grad_fn=<AddBackward0>) True
.
.
.
RuntimeError: CUDA out of memory.

I don’t know why this line keeps repeating with no end, until finally CUDA goes out of memory. It should print only once per batch; moreover, I haven’t made any indentation mistake!

This line
cont_loss_weight = torch.autograd.grad(outputs= cont_loss, inputs= weight, retain_graph=(True))
is getting executed, but it neither shows any error nor returns anything, because the following line, print("Shape:", cont_loss_weight.shape), never gets printed.

Hi @albanD,

When I tried to include autograd.grad in backward as above, autograd.grad wasn’t returning anything even though it was getting executed. I don’t know why; can you please have a look!

Then I tried a different approach:

class Custom_Convolution(torch.autograd.Function):    
    
    @staticmethod
    def forward(ctx, input, weight, bias, stride, padding):  #input(from previous layer)'s shape = ([batch_size=100, 96, 8, 8])
        with torch.enable_grad():
               output = torch.nn.functional.conv2d(input, weight, bias, stride, padding)
               h = output.shape[2]
               w = output.shape[3]  
               # output from forward with size = ([batch_size, 128,4,4])

               output= output.view(output.shape[0], output.shape[1], -1) # output size = ([batch_size, 128,16])
       
               cont_loss = torch.tensor([0.]).to(dev).requires_grad_(True)
               for i in range(0, output.shape[0]):
                   for f in range(len(output[i])):
                       Zi_unnormalized = output[i][f]
                       Zi = torch.nn.functional.normalize(Zi_unnormalized, dim = 0)
                       # Zj and Zk are tensors made from output[i][*] and output[other than i][*]. Zj and Zk varies for each Zi (or f)

                       Zi_Zk = torch.Tensor([0]).to(dev).requires_grad_(True)
                       for k in Zk:
                           k= torch.nn.functional.normalize(k, dim = 0)
                           zi_zk = ...
                           Zi_Zk = Zi_Zk.add(zi_zk)

                       # Similarly computing Zi_Zj
                       # Li = some algebra of Zi_Zj and Zi_Zk
                       # number of 'Li' values =  output.shape[0] * output.shape[1]
                        cont_loss = cont_loss.add(Li)   # 1 value
        print("\n Loss: ", cont_loss, cont_loss.requires_grad)
           
        # weight1 = weight.clone().requires_grad_(True)
        # bias1 = bias.clone().requires_grad_(True)

        # weight.shape = ([128, 96, 5, 5])
        cont_loss_weight  = torch.autograd.grad(outputs= cont_loss,inputs= weight, retain_graph=True)
    
        #bias.shape = ([128])
        cont_loss_bias = torch.autograd.grad(outputs= cont_loss, inputs= bias, retain_graph=True)
        
        output = output.view(output.shape[0], output.shape[1], h,w)
        ctx.save_for_backward(input, weight, bias, output, cont_loss, cont_loss_weight,cont_loss_bias)            

        return output    #output's shape = ([batch_size=100, 128, 4, 4])

    @staticmethod
    def backward(ctx, grad_output):    # grad_output size = ([batch_size, 128,4,4])
        
        input, weight, bias, output,  cont_loss,cont_loss_weight,cont_loss_bias = ctx.saved_tensors    #input size = ([batch_size, 96,8,8])  
        grad_input = grad_weight = grad_bias = None

        if ctx.needs_input_grad[0]:
            grad_input = torch.nn.grad.conv2d_input(input.shape, weight, grad_output) #shape = ([batch_size,96,8,8])
              
        if ctx.needs_input_grad[1]:
            grad_weight = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output)  #shape = ([128,96,5,5])
            grad_weight += cont_loss_weight 
                        
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum((0,2,3))        #shape = ([128])         
            grad_bias += cont_loss_bias

        if bias is not None:
            return grad_input, grad_weight, grad_bias, None, None
        else:
            return grad_input, grad_weight, None, None, None

Then I observed that cont_loss_weight is a tuple containing two tensors, each of shape ([96, 5, 5]). It should have returned a tensor of shape ([128, 96, 5, 5]) instead of a tuple, and similarly for cont_loss_bias a tensor of shape ([128]).
I don’t know why!
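(Though on a toy check I do see that torch.autograd.grad always returns a tuple with one entry per input, so at least the need to unpack seems expected:)

import torch

w = torch.randn(128, 96, 5, 5, requires_grad=True)
loss = (w ** 2).sum()
g = torch.autograd.grad(loss, w)     # a tuple with one element
print(type(g), len(g), g[0].shape)   # <class 'tuple'> 1 torch.Size([128, 96, 5, 5])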

Moreover, when I do cont_loss_weight = torch.autograd.grad(outputs= cont_loss, inputs= weight, retain_graph=True), I am guessing grad_weight in backward will get affected. I also have to keep retain_graph=True.

So, to avoid that, when I used a copy of the parameters, i.e. cont_loss_weight = torch.autograd.grad(outputs= cont_loss, inputs= weight1, retain_graph=True), I got this error:

RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.