How is conv2d calculating the gradient with respect to the input?

Given the following code, I want to understand how the gradient printed by the "conv1. x.register_hook: " hook is calculated.

# Stand-alone 1x1 convolution (1 input channel, 4 output channels).
# Unlike conv1 in the model, bias is not disabled here.
try_grad = nn.Conv2d(1,4,kernel_size=1)

def try_grad_assign(x):
  """Tensor hook that stores the incoming gradient ``x`` on ``try_grad``.

  Nothing is returned.
  """
  # NOTE(review): this assigns to an attribute named `data` on the Conv2d
  # module itself, not to `try_grad.weight.data`. It looks like the layer's
  # weight and bias stay at their random initial values, so `try_grad(img)`
  # would not be a convolution with the gradient -- confirm.
  try_grad.data = x

class try_conv_model(nn.Module):
  """Toy network: a bias-free 1x1 convolution (1 -> 4 channels) followed by
  adaptive average pooling to a 1x1 map.

  The forward pass prints every intermediate tensor and attaches gradient
  hooks to the convolution output and to the pooled output.
  """

  def __init__(self):
    super().__init__()
    self.conv1 = nn.Conv2d(1, 4, kernel_size=1, bias=False)
    self.gap = nn.AdaptiveAvgPool2d((1,1))

  def forward(self, x):
    """Run conv1 then gap, printing each stage, and return the flattened result."""
    print("-" * 150)
    print("before conv1: ", x)
    conv_out = self.conv1(x)
    # Prints the gradient that reaches the output of conv1 during backward.
    conv_out.register_hook(lambda grad: print("conv1. x.register_hook: ", grad))
    print("before AdaptiveAvgPool2D: ", conv_out)
    pooled = self.gap(conv_out)
    # Hands the gradient that reaches the pooled output to try_grad_assign.
    pooled.register_hook(try_grad_assign)
    print("after AdaptiveAvgPool2D: ", pooled)
    return pooled.view(-1, self.num_flat_features(pooled))

  def num_flat_features(self, x):
    """Return the product of all dimensions of ``x`` except the first (batch) one."""
    count = 1
    for dim in x.size()[1:]:
      count *= dim
    return count
  
# Single model instance shared by the hook registrations, optimizer and training loop.
conv_model = try_conv_model()

def my_func(z):
  """Gradient hook: print the first row of ``z`` and return a modified gradient.

  The first row has 1 added at index 2 twice (once by adding ``[[0,0,1,0]]``
  and once by subtracting ``[[0,0,-1,0]]``). Broadcasting against those 1x4
  constants gives the returned tensor its shape.
  """
  first_row = z[0]
  print("z[0]: ",first_row)
  bumped = first_row + torch.tensor([[0,0,1,0]])
  new_grad = bumped[0] - torch.tensor([[0,0,-1,0]])
  print("new_grad: ",new_grad)
  return new_grad

# Module-level hooks on conv1: each one prints the three arguments it is called with.
conv_model.conv1.register_forward_hook(lambda x,y,z: print(" \n conv_model.conv1.register_forward_hook: ",x,y,z))
# NOTE(review): the reply at the end of this post points to a documentation warning that
# register_backward_hook can report wrong results -- do not rely on what it prints.
conv_model.conv1.register_backward_hook(lambda x,y,z: print(" \n conv_model.conv1.register_backward_hook: ",x,y,z))
#conv_model.conv1.register_hook(lambda x: print("conv_model.conv1.register_hook: ",conv_model.conv1.register_hook))
#conv_model.conv2.register_forward_hook(lambda x,y,z: print(" \n conv_model.conv2.register_forward_hook: ",x,y,z))
#conv_model.conv2.register_backward_hook(lambda x,y,z: print(" \n conv_model.conv2.register_backward_hook: ",x,y,z))

# Same pair of printing hooks on the average-pooling layer.
conv_model.gap.register_forward_hook(lambda x,y,z: print(" \n conv_model.gap.register_forward_hook: ",x,y,z))
conv_model.gap.register_backward_hook(lambda x,y,z: print(" \n conv_model.gap.register_backward_hook: ",x,y,z))
#conv_model.gap.register_hook(lambda x: print("conv_model.gap.register_hook: ",x))

# Overwrite the four 1x1 kernels of conv1 with 1.0.
conv_model.conv1.weight.data = torch. tensor([[[[1]]], [[[1]]], [[[1]]], [[[1]]]], dtype = torch.float)
# Input: one single-channel 5x5 image filled with ones.
img = torch.ones(1,1,5,5, dtype = torch.float)
print(img)

# Target class index for the single sample.
labels = torch.tensor([2], dtype = torch.long)
print(labels)

# Loss function applied to (outputs, labels) in train().
criterion = F.cross_entropy
#criterion.register_hook(lambda x: print("babasjbdkajskjs"))
# SGD with momentum over the model parameters.
optimizer = optim.SGD(conv_model.parameters(), lr=0.01, momentum=0.9)
# Number of iterations passed to train().
epoch = 1

def train(epoch):
  """Run ``epoch`` optimisation steps on the fixed ``img``/``labels`` pair, printing diagnostics.

  Each iteration prints the model parameters, runs the forward pass, installs
  gradient hooks on the output and the loss, back-propagates, prints the
  resulting gradients and applies one optimizer step.

  Args:
    epoch: number of iterations to run.
  """
  conv_model.train()
  for _ in range(epoch):
    print("&"*300)
    print("*"*25)
    for param in conv_model.parameters():
      print(param)
    print("^"*25)

    # Clear gradients from the previous iteration; without this, .grad
    # accumulates across iterations and the printed values mix several passes.
    optimizer.zero_grad()

    outputs = conv_model(img)
    # my_func prints the gradient reaching `outputs` and returns a modified one.
    outputs.register_hook(my_func)
    print("outputs: ",outputs)
    loss = criterion(outputs, labels)
    loss.register_hook(lambda x: print(" \n before backward loss hook: ",x))

    print(" \n before backward () conv_model.conv1.weight.grad: ",conv_model.conv1.weight.grad)
    loss.backward()
    # Registered after backward() has already run, so it is not called for the pass above.
    loss.register_hook(lambda x: print(" \n after backward loss hook: ",x))

    # `loss` and `outputs` are results of operations (non-leaf tensors); their
    # .grad is None unless retain_grad() was requested before backward().
    print("loss.grad: ",loss.grad)

    print(" \n after backward () conv_model.conv1.weight.grad: ",conv_model.conv1.weight.grad)

    print("outputs.grad: ", outputs.grad)

    optimizer.step()
    print("*"*25)
    for param in conv_model.parameters():
      print(param)
    print("^"*25)

# Run the training loop for `epoch` iterations.
train(epoch)


# Forward the same image once more and print the index of the largest output along dim 1.
print("model prediction ",conv_model(img).max(1, keepdim=True)[1])

result:

tensor([[[[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]]]])
tensor([2])
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
*************************
Parameter containing:
tensor([[[[1.]]],


        [[[1.]]],


        [[[1.]]],


        [[[1.]]]], requires_grad=True)
^^^^^^^^^^^^^^^^^^^^^^^^^
------------------------------------------------------------------------------------------------------------------------------------------------------
before conv1:  tensor([[[[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]]]])
 
 conv_model.conv1.register_forward_hook:  Conv2d(1, 4, kernel_size=(1, 1), stride=(1, 1), bias=False) (tensor([[[[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]]]]),) tensor([[[[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]],

         [[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]],

         [[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]],

         [[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]]]], grad_fn=<MkldnnConvolutionBackward>)
before AdaptiveAvgPool2D:  tensor([[[[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]],

         [[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]],

         [[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]],

         [[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]]]], grad_fn=<MkldnnConvolutionBackward>)
 
 conv_model.gap.register_forward_hook:  AdaptiveAvgPool2d(output_size=(1, 1)) (tensor([[[[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]],

         [[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]],

         [[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]],

         [[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]]]], grad_fn=<MkldnnConvolutionBackward>),) tensor([[[[1.]],

         [[1.]],

         [[1.]],

         [[1.]]]], grad_fn=<ViewBackward>)
after AdaptiveAvgPool2D:  tensor([[[[1.]],

         [[1.]],

         [[1.]],

         [[1.]]]], grad_fn=<ViewBackward>)
outputs:  tensor([[1., 1., 1., 1.]], grad_fn=<ViewBackward>)
 
 before backward () conv_model.conv1.weight.grad:  None
 
 before backward loss hook:  tensor(1.)
z[0]:  tensor([ 0.2500,  0.2500, -0.7500,  0.2500])
new_grad:  tensor([[0.2500, 0.2500, 1.2500, 0.2500]])
 
 conv_model.gap.register_backward_hook:  AdaptiveAvgPool2d(output_size=(1, 1)) (tensor([0.2500, 0.2500, 1.2500, 0.2500]),) (tensor([[[[0.2500]],

         [[0.2500]],

         [[1.2500]],

         [[0.2500]]]]),)
conv1. x.register_hook:  tensor([[[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],

         [[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],

         [[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500]],

         [[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100]]]])
 
 conv_model.conv1.register_backward_hook:  Conv2d(1, 4, kernel_size=(1, 1), stride=(1, 1), bias=False) (None, tensor([[[[0.2500]]],


        [[[0.2500]]],


        [[[1.2500]]],


        [[[0.2500]]]]), None) (tensor([[[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],

         [[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],

         [[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500]],

         [[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100]]]]),)
loss.grad:  None
 
 after backward () conv_model.conv1.weight.grad:  tensor([[[[0.2500]]],


        [[[0.2500]]],


        [[[1.2500]]],


        [[[0.2500]]]])
outputs.grad:  None
*************************
Parameter containing:
tensor([[[[0.9975]]],


        [[[0.9975]]],


        [[[0.9875]]],


        [[[0.9975]]]], requires_grad=True)
^^^^^^^^^^^^^^^^^^^^^^^^^
------------------------------------------------------------------------------------------------------------------------------------------------------
before conv1:  tensor([[[[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]]]])
 
 conv_model.conv1.register_forward_hook:  Conv2d(1, 4, kernel_size=(1, 1), stride=(1, 1), bias=False) (tensor([[[[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]]]]),) tensor([[[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],

         [[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],

         [[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875]],

         [[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975]]]],
       grad_fn=<MkldnnConvolutionBackward>)
before AdaptiveAvgPool2D:  tensor([[[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],

         [[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],

         [[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875]],

         [[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975]]]],
       grad_fn=<MkldnnConvolutionBackward>)
 
 conv_model.gap.register_forward_hook:  AdaptiveAvgPool2d(output_size=(1, 1)) (tensor([[[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],

         [[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],

         [[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
          [0.9875, 0.9875, 0.9875, 0.9875, 0.9875]],

         [[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
          [0.9975, 0.9975, 0.9975, 0.9975, 0.9975]]]],
       grad_fn=<MkldnnConvolutionBackward>),) tensor([[[[0.9975]],

         [[0.9975]],

         [[0.9875]],

         [[0.9975]]]], grad_fn=<ViewBackward>)
after AdaptiveAvgPool2D:  tensor([[[[0.9975]],

         [[0.9975]],

         [[0.9875]],

         [[0.9975]]]], grad_fn=<ViewBackward>)
model prediction  tensor([[3]])

If the gradient being back-propagated through the AdaptiveAvgPool2d layer is

(tensor([[[[0.2500]],

         [[0.2500]],

         [[1.2500]],

         [[0.2500]]]]),) 

what is it being convolved with to give:

tensor([[[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],

         [[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],

         [[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
          [0.0500, 0.0500, 0.0500, 0.0500, 0.0500]],

         [[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
          [0.0100, 0.0100, 0.0100, 0.0100, 0.0100]]]])

If I convolve it with img using print(try_grad(img)), I get the following result:

tensor([[[[-0.1372, -0.1372, -0.1372, -0.1372, -0.1372],
          [-0.1372, -0.1372, -0.1372, -0.1372, -0.1372],
          [-0.1372, -0.1372, -0.1372, -0.1372, -0.1372],
          [-0.1372, -0.1372, -0.1372, -0.1372, -0.1372],
          [-0.1372, -0.1372, -0.1372, -0.1372, -0.1372]],

         [[ 0.3811,  0.3811,  0.3811,  0.3811,  0.3811],
          [ 0.3811,  0.3811,  0.3811,  0.3811,  0.3811],
          [ 0.3811,  0.3811,  0.3811,  0.3811,  0.3811],
          [ 0.3811,  0.3811,  0.3811,  0.3811,  0.3811],
          [ 0.3811,  0.3811,  0.3811,  0.3811,  0.3811]],

         [[-1.4801, -1.4801, -1.4801, -1.4801, -1.4801],
          [-1.4801, -1.4801, -1.4801, -1.4801, -1.4801],
          [-1.4801, -1.4801, -1.4801, -1.4801, -1.4801],
          [-1.4801, -1.4801, -1.4801, -1.4801, -1.4801],
          [-1.4801, -1.4801, -1.4801, -1.4801, -1.4801]],

         [[-0.2196, -0.2196, -0.2196, -0.2196, -0.2196],
          [-0.2196, -0.2196, -0.2196, -0.2196, -0.2196],
          [-0.2196, -0.2196, -0.2196, -0.2196, -0.2196],
          [-0.2196, -0.2196, -0.2196, -0.2196, -0.2196],
          [-0.2196, -0.2196, -0.2196, -0.2196, -0.2196]]]],
       grad_fn=<MkldnnConvolutionBackward>)

So my question is: how does conv2d calculate the gradient with respect to its input,
and why does it differ from my result? Is the way I am calculating the gradient manually wrong, or am I missing something?

Hi,

First, you should be careful not to rely on the result of register_backward_hook: the documentation carries a warning stating that it can return wrong results at the moment.
I would first remove these hooks from your code.

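Otherwise, the gradient of a convolution with respect to its input is given by a transposed convolution (conv_transpose) of the gradient of the output with the weights of the original convolution.

Note that the tensor printed by the "conv1. x.register_hook: " hook is the gradient with respect to the output of conv1, not its input. It is produced by the backward pass of the average pooling, which spreads each channel's gradient evenly over the 5x5 = 25 positions (0.25 / 25 = 0.01 and 1.25 / 25 = 0.05).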