I am trying to get the gradients of the loss with respect to the input in my RNN model. It uses a VGG16 as a feature extractor and an LSTM for sequence modelling, and I registered a backward hook on the first layer of the VGG16. According to "Exact meaning of grad_input and grad_output", grad_in is supposed to be a 3-tuple containing the derivatives of the loss with respect to the layer input, the filter weights, and the biases.
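For reference, here is a minimal standalone sketch of the behaviour I expected from register_backward_hook based on that post. The shapes are made up to match my case, and the input here explicitly requires a gradient:

import torch
import torch.nn as nn

conv = nn.Conv2d(3, 64, kernel_size=3, padding=1)

def hook(module, grad_in, grad_out):
    # my understanding: grad_in = (dloss/dinput, dloss/dweight, dloss/dbias)
    print([None if g is None else g.size() for g in grad_in])
    print(grad_out[0].size())

conv.register_backward_hook(hook)

x = torch.randn(14, 3, 100, 40, requires_grad=True)  # 14 timesteps of 3x100x40 crops
conv(x).sum().backward()
# I would expect grad_in[0] to come out with size (14, 3, 100, 40) here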
In my case, however, grad_in[0] returns None, grad_in[1].size() returns 64x3x3x3, i.e. the gradients for the 64 filters of the first convolutional layer of VGG16, and grad_in[2].size() returns 64, i.e. the gradients for the 64 biases. grad_out[0].size() returns 14x64x100x40 (timesteps, channels, height, width), i.e. the gradient with respect to the output of the first convolutional layer.
So how do I get the gradient of the loss with respect to the input? grad_in[0].size() should be 14x3x100x40 (timesteps, channels, height, width). Below is the class containing my architecture.
import torch
import torch.nn as nn
from torchvision import models

class CNNLSTM(nn.Module):
    def __init__(self, embedding_dim=64, h_dim=32):
        super(CNNLSTM, self).__init__()
        self.h_dim = h_dim  # needed by init_hidden
        # gradients captured by the backward hook
        self.gradients = None
        # CNN feature extractor: the convolutional part of VGG16
        self.model = models.vgg16(pretrained=True)
        self.model = nn.Sequential(*list(self.model.children())[0])  # 5, 10, 18, 25, 31
        for param in self.model.parameters():
            param.requires_grad = True
        self.hook_layers()
        # CNN feature embedder
        self.feature_embedder = nn.Linear(1536, embedding_dim)
        # LSTM
        self.lstm = nn.LSTM(embedding_dim, h_dim, 1, dropout=0.0, batch_first=False)
        # MLP classifier (make_mlp is a helper defined elsewhere in my code)
        self.classifier = make_mlp(
            [h_dim, 128, 2],
            ['relu', 'sigmoid'],
            batch_norm=False,
            dropout=0.0
        )

    def hook_layers(self):
        def hook_function(module, grad_in, grad_out):
            self.gradients = grad_in
            print(grad_in[0])
            print(grad_in[1].size())
            print(grad_in[2].size())
            print(grad_out[0].size())

        # hook the first convolutional layer of VGG16
        self.model[0].register_backward_hook(hook_function)

    def init_hidden(self, batch):
        return (
            torch.zeros(1, batch, self.h_dim).cuda(),
            torch.zeros(1, batch, self.h_dim).cuda()
        )

    def forward(self, images_pedestrian_all):
        """
        Inputs:
        - images_pedestrian_all: list of tensors, one per pedestrian, each of
          shape (seq_len, 3, 100, 40), where seq_len varies per pedestrian
        Output:
        - y_pred: tensor of shape (batch, 2)
        """
        # batch = number of pedestrians
        batch = len(images_pedestrian_all)
        state_all = []
        for images_pedestrian_i in images_pedestrian_all:
            seq_len = images_pedestrian_i.size(0)
            # send all images of the current pedestrian through the CNN feature extractor
            images_pedestrian_i = images_pedestrian_i.cuda()
            features_pedestrian_i = self.model(images_pedestrian_i)
            features_pedestrian_i = features_pedestrian_i.view(seq_len, -1)
            # embed the features
            features_pedestrian_i = self.feature_embedder(features_pedestrian_i)
            features_pedestrian_i = torch.unsqueeze(features_pedestrian_i, 1)
            # send through the LSTM
            state_tuple = self.init_hidden(1)
            output, state = self.lstm(features_pedestrian_i, state_tuple)
            state_all.append(state[0].squeeze())
        state_all = torch.stack(state_all, dim=0)
        y_pred = self.classifier(state_all)
        return y_pred
Here is a snippet of the forward and backward pass:
# predict decision
decision_pred = classifier(pedestrian_crops)
onehot_pred = torch.round(decision_pred.cpu())
# backprop
classifier.zero_grad()
decision_pred.backward(gradient=onehot_pred.cuda())
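One thing I am unsure about: my input crops are plain tensors that do not require gradients, so maybe there is simply no input gradient for the hook to report. Would something along the lines of this sketch (reusing classifier and pedestrian_crops from the snippet above) be the right way to force it?

# mark each input tensor as requiring a gradient before the forward pass,
# so that a gradient with respect to the input is actually computed
for crops_i in pedestrian_crops:
    crops_i.requires_grad_()

decision_pred = classifier(pedestrian_crops)
onehot_pred = torch.round(decision_pred.detach().cpu())

classifier.zero_grad()
decision_pred.backward(gradient=onehot_pred.cuda())

# if this is right, grad_in[0] in the hook should now be non-None, and each
# crops_i.grad should hold the gradient of the loss wrt that input sequence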