Bug in masked_fill_()

I have registered a hook on the last module to modify the gradient during backpropagation. However, masked_fill_ does not return a tensor with the modified values; instead it returns the same tensor, unchanged.

def custom(grad, model_output):
  print("before custom grad: ",grad)
  i = 0
  for col in labels_:
    label = col.data.cpu().numpy()
    print("label: ", label)
    print("before custom  model_output[i,label,:,:]: ", model_output[i,label,:,:])
    max_ouput = torch.max(model_output[i,label,:,:])
    print("max_ouput: ",max_ouput)
    norm_output = model_output[i,label,:,:]/max_ouput
    print("norm_output: ",norm_output)
    mask = norm_output > 0.50
    print("mask: ", mask)
    grad[i,label,:,:].masked_fill_(mask, 333)
    print("grad[i,label,:,:]: ",grad[i,label,:,:])
    i += 1
    
  print("after custom grad: ",grad)

  print(kjsdnjasn)  # undefined name: deliberately raises NameError to stop after the first batch (debugging)
  #thresh = grad * threshold_mat
  return None


def func(x):
  output_model_inter = model_inter(inputs_)
  cus1 = custom(x,output_model_inter)
  return cus1

class Last_layers_mod2(nn.Module):
  def __init__(self):
    super(Last_layers_mod2, self).__init__()
    self.gap = nn.AdaptiveAvgPool2d((1,1))

  def forward(self, x):
    if self.training and x.requires_grad:
      x.register_hook(func)
    x = self.gap(x)
    x = x.view(-1, self.num_flat_features(x))
    return x

  def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

vgg16_feature = models.vgg16(pretrained=True)
middle_layer2 = Middle_layers_mod()
last_layer2 = Last_layers_mod2()
model_second = nn.Sequential(vgg16_feature.features, middle_layer2, last_layer2)
#print(model_second)

model_second = model_second.to(device)
set_parameter_requires_grad(vgg16_feature)
# Gather the parameters to be optimized/updated in this run. If we are
#  finetuning we will be updating all parameters. However, if we are
#  doing feature extraction, we will only update the parameters
#  that we have just initialized, i.e. the parameters whose requires_grad
#  is True.
params_to_update2 = model_second.parameters()
print("Params to learn:")

for name,param in model_second.named_parameters():
        if param.requires_grad:
            print("\t",name)
# Observe that all parameters are being optimized

optimizer_ft2 = optim.SGD(params_to_update2, lr=0.001, momentum=0.9)
criterion2 = nn.CrossEntropyLoss()

model_ft, hist = train_model(model_second, dataloaders_dict, criterion2, optimizer_ft2, num_epochs=15)

result:


Epoch 0/14
----------
before custom grad:  tensor([[[[ 0.0100,  0.0100,  0.0100,  0.0100,  0.0100],
          [ 0.0100,  0.0100,  0.0100,  0.0100,  0.0100],
          [ 0.0100,  0.0100,  0.0100,  0.0100,  0.0100],
          [ 0.0100,  0.0100,  0.0100,  0.0100,  0.0100],
          [ 0.0100,  0.0100,  0.0100,  0.0100,  0.0100]],

         [[ 0.0111,  0.0111,  0.0111,  0.0111,  0.0111],
          [ 0.0111,  0.0111,  0.0111,  0.0111,  0.0111],
          [ 0.0111,  0.0111,  0.0111,  0.0111,  0.0111],
          [ 0.0111,  0.0111,  0.0111,  0.0111,  0.0111],
          [ 0.0111,  0.0111,  0.0111,  0.0111,  0.0111]],

         [[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
          [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
          [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
          [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
          [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306]],

         [[ 0.0095,  0.0095,  0.0095,  0.0095,  0.0095],
          [ 0.0095,  0.0095,  0.0095,  0.0095,  0.0095],
          [ 0.0095,  0.0095,  0.0095,  0.0095,  0.0095],
          [ 0.0095,  0.0095,  0.0095,  0.0095,  0.0095],
          [ 0.0095,  0.0095,  0.0095,  0.0095,  0.0095]]]], device='cuda:0')
label:  2
before custom  model_output[i,label,:,:]:  tensor([[13.0475, 14.9160, 16.1668, 24.7181, 33.7023],
        [ 9.5000, 10.8908, 12.3670, 21.3958, 32.8964],
        [ 6.0762,  6.8957, 10.9900, 22.4678, 36.6358],
        [ 4.2738,  5.6732, 11.7115, 23.4452, 38.2012],
        [ 5.3080,  7.4024, 10.6921, 15.0981, 23.5845]], device='cuda:0')
max_ouput:  tensor(38.2012, device='cuda:0')
norm_output:  tensor([[0.3415, 0.3905, 0.4232, 0.6471, 0.8822],
        [0.2487, 0.2851, 0.3237, 0.5601, 0.8611],
        [0.1591, 0.1805, 0.2877, 0.5881, 0.9590],
        [0.1119, 0.1485, 0.3066, 0.6137, 1.0000],
        [0.1389, 0.1938, 0.2799, 0.3952, 0.6174]], device='cuda:0')
mask:  tensor([[False, False, False,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False, False,  True]], device='cuda:0')
grad[i,label,:,:]:  tensor([[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
        [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
        [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
        [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
        [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306]], device='cuda:0')
after custom grad:  tensor([[[[ 0.0100,  0.0100,  0.0100,  0.0100,  0.0100],
          [ 0.0100,  0.0100,  0.0100,  0.0100,  0.0100],
          [ 0.0100,  0.0100,  0.0100,  0.0100,  0.0100],
          [ 0.0100,  0.0100,  0.0100,  0.0100,  0.0100],
          [ 0.0100,  0.0100,  0.0100,  0.0100,  0.0100]],

         [[ 0.0111,  0.0111,  0.0111,  0.0111,  0.0111],
          [ 0.0111,  0.0111,  0.0111,  0.0111,  0.0111],
          [ 0.0111,  0.0111,  0.0111,  0.0111,  0.0111],
          [ 0.0111,  0.0111,  0.0111,  0.0111,  0.0111],
          [ 0.0111,  0.0111,  0.0111,  0.0111,  0.0111]],

         [[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
          [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
          [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
          [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
          [-0.0306, -0.0306, -0.0306, -0.0306, -0.0306]],

         [[ 0.0095,  0.0095,  0.0095,  0.0095,  0.0095],
          [ 0.0095,  0.0095,  0.0095,  0.0095,  0.0095],
          [ 0.0095,  0.0095,  0.0095,  0.0095,  0.0095],
          [ 0.0095,  0.0095,  0.0095,  0.0095,  0.0095],
          [ 0.0095,  0.0095,  0.0095,  0.0095,  0.0095]]]], device='cuda:0')

Am I doing something wrong or is it a bug?

Hi,

The problem is that you call it on grad[i,label,:,:], which is not a view of the original grad, so the masked_fill_ does not modify what you want.
You either want to do it with a single indexing op, as in grad[foo] = 333, or use only view operations, something like (I did not test the code, but you want something similar): grad.select(0, 1).index_copy_(0, label, grad[i,label,:,:].masked_fill_(mask, 333))
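For example, a minimal sketch of the hook rewritten along those lines (untested; it assumes labels_ holds one scalar class index per sample, as in your code). It uses the out-of-place masked_fill and writes the result back with a single indexing assignment, and it returns the new gradient, which is the documented way for a hook to replace the gradient:

def custom(grad, model_output):
  # hooks should not modify their argument in place; work on a copy instead
  new_grad = grad.clone()
  for i, col in enumerate(labels_):
    label = int(col)
    norm_output = model_output[i, label] / model_output[i, label].max()
    mask = norm_output > 0.5
    # single indexing assignment writes straight into new_grad
    new_grad[i, label] = new_grad[i, label].masked_fill(mask, 333)
  return new_grad

Since register_hook uses the returned tensor as the new gradient, the 333 entries then actually reach the layers below.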
