I’m trying to understand the interpretation of gradInput tensors for simple criterions using backward hooks on the modules. Here are three modules (two criterions and a model):
import torch
import torch.nn as nn
import torch.optim as onn
import torch.autograd as ann
class L1Loss(nn.Module):
    def __init__(self):
        super(L1Loss, self).__init__()

    def forward(self, input_var, target_var):
        '''
        L1 loss:
        |y - x|
        '''
        return (target_var - input_var).norm()


class CosineLoss(nn.Module):
    def __init__(self):
        super(CosineLoss, self).__init__()

    def forward(self, input_var, target_var):
        '''
        Cosine loss:
        1.0 - (y.x / |y|*|x|)
        '''
        return 1.0 - input_var.dot(target_var) / (input_var.norm() * target_var.norm())


class Model(nn.Module):
    def __init__(self, mode=None):
        super(Model, self).__init__()

        def hook_func(module, grad_i, grad_o):
            print 'Grad input:', grad_i

        self.input_encoder = nn.Linear(20, 10)
        self.target_encoder = nn.Linear(20, 10)
        if mode == 'cos':
            self.criterion = CosineLoss()
        elif mode == 'l1':
            self.criterion = L1Loss()
        self.criterion.register_backward_hook(hook_func)
        self.optimizer = onn.Adam(self.parameters(), lr=1e-5)

    def forward(self, input_var_1, input_var_2):
        return self.input_encoder(input_var_1), self.target_encoder(input_var_2)

    def train(self, input_np, target_np):
        input_var = ann.Variable(input_np)
        target_var = ann.Variable(target_np)
        input_encode, target_encode = self.forward(input_var, target_var)
        loss = self.criterion(input_encode, target_encode)
        loss.backward()
        self.optimizer.step()
        return loss.data[0]
If I run a few iterations using L1Loss:
mod = Model(mode='l1')
for i in range(5):
    inp = torch.rand(1, 20)
    tar = torch.rand(1, 20)
    loss_val = mod.train(inp, tar)
    print 'Iteration\t{0}\tLoss\t{1}'.format(i, loss_val)
I see that the grad input printed by the hook is a single tensor of shape (1, 10):
Grad input: (Variable containing:
-0.2466 -0.0966 0.0659 -0.1954 0.3573 -0.5367 0.5818 0.0758 0.2598 -0.2447
[torch.FloatTensor of size 1x10]
,)
I was expecting two tensors of that shape, one for each input. On the other hand, if I run with the cosine loss:
mod = Model(mode='cos')
for i in range(5):
    inp = torch.rand(1, 20)
    tar = torch.rand(1, 20)
    loss_val = mod.train(inp, tar)
    print 'Iteration\t{0}\tLoss\t{1}'.format(i, loss_val)
I find grad input is a single scalar value:
Grad input: (Variable containing:
-1
[torch.FloatTensor of size 1]
,)
In both cases I was expecting two gradInput tensors, one corresponding to each input to the criterion’s forward function (the sketch below shows what I had in mind).
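To make that expectation concrete, here is a hypothetical variant of train that hooks the two encodings directly with Variable.register_hook instead of hooking the criterion module (the print_grad helper is just for illustration):

def train(self, input_np, target_np):
    input_var = ann.Variable(input_np)
    target_var = ann.Variable(target_np)
    input_encode, target_encode = self.forward(input_var, target_var)

    # hook each input of the criterion; each hook fires with d(loss)/d(that Variable)
    def print_grad(name):
        def hook(grad):
            print name, grad
        return hook

    input_encode.register_hook(print_grad('Grad wrt input_encode:'))
    target_encode.register_hook(print_grad('Grad wrt target_encode:'))

    loss = self.criterion(input_encode, target_encode)
    loss.backward()
    self.optimizer.step()
    return loss.data[0]

My understanding is that this would print two gradients of size 1x10, one per encoding, for either criterion, and that is what I assumed the grad_input passed to register_backward_hook would contain as well.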
Where is my interpretation amiss? Is there something wrong with the implementations of the criterions? I’m particularly surprised by the cosine loss: the L1 loss seems to treat the second input (target_var) as a ground truth that is not being optimized, but I’m not clear on what the cosine loss is doing. Nor am I clear on what has changed between the two: both compute on the outputs of the two encoders and return a scalar loss from forward, yet the shape of gradInput differs.