I have a hook registered with register_hook on the last module to modify the gradient during backpropagation. The problem is that masked_fill_ does not change the values in grad: after the call, grad comes back unchanged.
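To be clear about what I expect: calling masked_fill_ on a slice of a tensor should change the underlying tensor in place. A toy sketch of that expectation (made-up sizes and values, not my real data):

import torch

t = torch.full((1, 4, 5, 5), 0.5)          # toy stand-in for the gradient
mask = torch.rand(5, 5) > 0.5
t[0, 2, :, :].masked_fill_(mask, 333.0)    # I expect the masked entries of t to change
print(t[0, 2, :, :])

Here is my actual code: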
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models

def custom(grad, model_output):
    print("before custom grad: ", grad)
    i = 0
    for col in labels_:                                # labels_ comes from the training loop
        label = col.data.cpu().numpy()
        print("label: ", label)
        print("before custom model_output[i,label,:,:]: ", model_output[i, label, :, :])
        max_ouput = torch.max(model_output[i, label, :, :])
        print("max_ouput: ", max_ouput)
        norm_output = model_output[i, label, :, :] / max_ouput
        print("norm_output: ", norm_output)
        mask = norm_output > 0.50
        print("mask: ", mask)
        grad[i, label, :, :].masked_fill_(mask, 333)   # this fill never shows up in grad
        print("grad[i,label,:,:]: ", grad[i, label, :, :])
        i += 1
    print("after custom grad: ", grad)
    print(kjsdnjasn)                                   # intentional NameError to stop the run here while debugging
    # thresh = grad * threshold_mat
    return None

def func(x):
    output_model_inter = model_inter(inputs_)          # model_inter and inputs_ are globals from the training loop
    cus1 = custom(x, output_model_inter)
    return cus1
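For reference, my understanding of the register_hook contract: the hook is called with the gradient, and it can either return a new tensor that replaces the gradient or return None to leave it as it is. A toy check of the replacement path:

t = torch.ones(3, requires_grad=True)
t.register_hook(lambda g: g * 2)   # returning a tensor replaces the gradient
t.sum().backward()
print(t.grad)                      # tensor([2., 2., 2.])

In my hook I return None because I want to edit grad in place instead.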
class Last_layers_mod2(nn.Module):
    def __init__(self):
        super(Last_layers_mod2, self).__init__()
        self.gap = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        if self.training and x.requires_grad:
            x.register_hook(func)   # hook fires when the gradient w.r.t. x is computed
        x = self.gap(x)
        x = x.view(-1, self.num_flat_features(x))
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
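As a quick sanity check of this head on a dummy feature map (sizes assumed from the gradient printout below: batch 1, 4 channels, 5x5 spatial):

dummy = torch.randn(1, 4, 5, 5)
head = Last_layers_mod2()
print(head(dummy).shape)   # torch.Size([1, 4])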
vgg16_feature = models.vgg16(pretrained=True)
middle_layer2 = Middle_layers_mod()
last_layer2 = Last_layers_mod2()
model_second = nn.Sequential(vgg16_feature.features, middle_layer2, last_layer2)
#print(model_second)
model_second = model_second.to(device)
set_parameter_requires_grad(vgg16_feature)
# Gather the parameters to be optimized/updated in this run. If we are
# finetuning, we update all parameters; if we are doing feature extraction,
# we only update the newly initialized parameters, i.e. those with
# requires_grad == True.
params_to_update2 = model_second.parameters()
print("Params to learn:")
for name, param in model_second.named_parameters():
    if param.requires_grad == True:
        print("\t", name)
# Observe that all parameters are being optimized
optimizer_ft2 = optim.SGD(params_to_update2, lr=0.001, momentum=0.9)
criterion2 = nn.CrossEntropyLoss()
model_ft, hist = train_model(model_second, dataloaders_dict, criterion2, optimizer_ft2, num_epochs=15)
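For completeness, set_parameter_requires_grad is not shown above; it is the usual helper from the fine-tuning tutorial that freezes the pretrained weights, something like:

def set_parameter_requires_grad(model):
    # freeze the pretrained backbone so only the remaining parameters get updated
    for param in model.parameters():
        param.requires_grad = False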
result:
Epoch 0/14
----------
before custom grad: tensor([[[[ 0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[ 0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[ 0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[ 0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[ 0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],
[[ 0.0111, 0.0111, 0.0111, 0.0111, 0.0111],
[ 0.0111, 0.0111, 0.0111, 0.0111, 0.0111],
[ 0.0111, 0.0111, 0.0111, 0.0111, 0.0111],
[ 0.0111, 0.0111, 0.0111, 0.0111, 0.0111],
[ 0.0111, 0.0111, 0.0111, 0.0111, 0.0111]],
[[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306]],
[[ 0.0095, 0.0095, 0.0095, 0.0095, 0.0095],
[ 0.0095, 0.0095, 0.0095, 0.0095, 0.0095],
[ 0.0095, 0.0095, 0.0095, 0.0095, 0.0095],
[ 0.0095, 0.0095, 0.0095, 0.0095, 0.0095],
[ 0.0095, 0.0095, 0.0095, 0.0095, 0.0095]]]], device='cuda:0')
label: 2
before custom model_output[i,label,:,:]: tensor([[13.0475, 14.9160, 16.1668, 24.7181, 33.7023],
[ 9.5000, 10.8908, 12.3670, 21.3958, 32.8964],
[ 6.0762, 6.8957, 10.9900, 22.4678, 36.6358],
[ 4.2738, 5.6732, 11.7115, 23.4452, 38.2012],
[ 5.3080, 7.4024, 10.6921, 15.0981, 23.5845]], device='cuda:0')
max_ouput: tensor(38.2012, device='cuda:0')
norm_output: tensor([[0.3415, 0.3905, 0.4232, 0.6471, 0.8822],
[0.2487, 0.2851, 0.3237, 0.5601, 0.8611],
[0.1591, 0.1805, 0.2877, 0.5881, 0.9590],
[0.1119, 0.1485, 0.3066, 0.6137, 1.0000],
[0.1389, 0.1938, 0.2799, 0.3952, 0.6174]], device='cuda:0')
mask: tensor([[False, False, False, True, True],
[False, False, False, True, True],
[False, False, False, True, True],
[False, False, False, True, True],
[False, False, False, False, True]], device='cuda:0')
grad[i,label,:,:]: tensor([[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306]], device='cuda:0')
after custom grad: tensor([[[[ 0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[ 0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[ 0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[ 0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[ 0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],
[[ 0.0111, 0.0111, 0.0111, 0.0111, 0.0111],
[ 0.0111, 0.0111, 0.0111, 0.0111, 0.0111],
[ 0.0111, 0.0111, 0.0111, 0.0111, 0.0111],
[ 0.0111, 0.0111, 0.0111, 0.0111, 0.0111],
[ 0.0111, 0.0111, 0.0111, 0.0111, 0.0111]],
[[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306],
[-0.0306, -0.0306, -0.0306, -0.0306, -0.0306]],
[[ 0.0095, 0.0095, 0.0095, 0.0095, 0.0095],
[ 0.0095, 0.0095, 0.0095, 0.0095, 0.0095],
[ 0.0095, 0.0095, 0.0095, 0.0095, 0.0095],
[ 0.0095, 0.0095, 0.0095, 0.0095, 0.0095],
[ 0.0095, 0.0095, 0.0095, 0.0095, 0.0095]]]], device='cuda:0')
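One thing I have not verified yet is whether grad[i, label, :, :] actually gives me a view of grad or a copy (note that label here is a numpy array rather than a plain int). Inside custom I was going to check it with something like this (untested sketch):

sub = grad[i, label, :, :]
print(sub.storage().data_ptr() == grad.storage().data_ptr())   # True would mean sub shares storage with grad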
Am I doing something wrong here, or is this a bug?