I am trying to reproduce this recent paper:
GradNorm: Gradient Normalization for Adaptive Loss Balancing in Deep Multitask Networks
The idea is to normalize gradients across different tasks, and authors used this idea to learn weights for the corresponding losses for each task adaptively.
I have my main model class for two arbitrary regression tasks (one shared layer and two task specific towers) as follows:
class MTLnet(nn.Module): def __init__(self): super(MTLnet, self).__init__() self.sharedlayer = nn.Sequential( nn.Linear(feature_size, shared_layer_size), nn.ReLU(), nn.Dropout() ) self.tower1 = nn.Sequential( nn.Linear(shared_layer_size, tower_h1), nn.ReLU(), nn.Dropout(), nn.Linear(tower_h1, tower_h2), nn.ReLU(), nn.Dropout(), nn.Linear(tower_h2, output_size) ) self.tower2 = nn.Sequential( nn.Linear(shared_layer_size, tower_h1), nn.ReLU(), nn.Dropout(), nn.Linear(tower_h1, tower_h2), nn.ReLU(), nn.Dropout(), nn.Linear(tower_h2, output_size) ) def forward(self, x): h_shared = self.sharedlayer(x) out1 = self.tower1(h_shared) out2 = self.tower2(h_shared) return out1, out2 MTL = MTLnet() opt1 = torch.optim.Adam(MTL.parameters(), lr=LR) loss_func = nn.MSELoss()
And two weights for two losses as follows:
Weightloss1 = torch.tensor(torch.FloatTensor(), requires_grad=True) Weightloss2 = torch.tensor(torch.FloatTensor(), requires_grad=True) params = [Weightloss1, Weightloss2] opt2 = torch.optim.Adam(params, lr=LR) Gradloss = nn.L1Loss()
And the code to run goes like this for mini-batches:
alph = 0.16 for minibatch in minibatches: XE, YE1, YE2 = minibatch # Getting weighted losses for two tasks Yhat1, Yhat2 = MTL(XE) l1 = params.data*loss_func(Yhat1, YE1.view(-1,1)) l2 = params.data*loss_func(Yhat2, YE2.view(-1,1)) loss = torch.add(l1,l2)/2 # for the first epoch with no l0 if epoch == 0: l0 = loss.data opt1.zero_grad() opt2.zero_grad() loss.backward(retain_graph=True) # Getting gradients of the first layers in each tower and calculate their l2-norms par = list(MTL.parameters()) G1 = torch.tensor(par.grad.norm(2), requires_grad=True) G2 = torch.tensor(par.grad.norm(2), requires_grad=True) G_avg = torch.add(G1,G2)/2 # Calculating relative losses lhat1 = l1/l0 lhat2 = l2/l0 lhat_avg = torch.add(lhat1,lhat2)/2 # Calculating relative inverse training rates for tasks inv_rate1 = lhat1/lhat_avg inv_rate2 = lhat2/lhat_avg # Calculating the gradient loss according to Eq. 2 in the GradNorm paper Lgrad = torch.add(Gradloss(G1, G_avg*(inv_rate1)**alph),Gradloss(G2, G_avg*(inv_rate2)**alph)) Lgrad.backward() # Updating loss weights opt2.step() # Updating the model weights opt1.step() # Renormalizing the losses weights to make their summation equals to 2 params = [2*params.data/(params.data+params.data), 2*params.data/(params.data+params.data)]
The problem is although Weightloss1 and 2 are required_grad=True and G1 and G2 also have gradients (checked using .grad), G_avg and Lgrad have None for their gradients, and as a result, loss weights in the “params” remain unchanged.