```python
import torch
from torch import nn
from torchvision.models import vgg16 as vgg


def main():
    best_prec1 = 0
    lr = 0.0001
    epochs = 20
    model = vgg(num_classes=1000, pretrained=False)
    model.cuda()
    criterion_A = nn.MSELoss().cuda()  # loss of task A
    criterion_B = nn.CrossEntropyLoss().cuda()  # loss of task B
    # learnable loss-weight parameters; keep references to these leaf tensors
    log_sigma_A = torch.tensor([1.]).requires_grad_()
    log_sigma_B = torch.tensor([1.]).requires_grad_()
    weight_list = []
    bias_list = []
    last_weight_list = []
    last_bias_list = []
    loss_weight_list = [log_sigma_A, log_sigma_B]
    for name, value in model.named_parameters():
        if 'classifier' in name:
            if 'weight' in name:
                last_weight_list.append(value)
            elif 'bias' in name:
                last_bias_list.append(value)
        else:
            if 'weight' in name:
                weight_list.append(value)
            elif 'bias' in name:
                bias_list.append(value)
    optimizer = torch.optim.SGD([{'params': weight_list, 'lr': lr},
                                 {'params': bias_list, 'lr': lr * 2},
                                 {'params': last_weight_list, 'lr': lr * 10},
                                 {'params': last_bias_list, 'lr': lr * 20},
                                 {'params': loss_weight_list, 'lr': lr}],
                                momentum=0.9, weight_decay=0.0005, nesterov=True)
    for epoch in range(0, epochs):
        # train for one epoch
        train(model, criterion_A, optimizer, epoch, log_sigma_A, log_sigma_B)


def train(model, criterion_A, optimizer, epoch, log_sigma_AA, log_sigma_BB):
    # .cuda() returns non-leaf copies; the original leaves keep the gradients
    log_sigma_A = log_sigma_AA.cuda()
    log_sigma_B = log_sigma_BB.cuda()
    sigma_A = torch.exp(log_sigma_A)
    sigma_B = torch.exp(log_sigma_B)
    predA = model(torch.rand((1, 3, 224, 224), device='cuda:0', requires_grad=True))
    loss_A = criterion_A(predA, torch.rand((1, 1000), device='cuda:0'))
    # uncertainty-weighted loss; task B's data term is omitted in this minimal repro
    loss = (1 / (2 * sigma_A)) * loss_A + (1 / (2 * sigma_B)) + log_sigma_A + log_sigma_B
    optimizer.zero_grad()
    loss.backward()
    print('Grad A: %f' % log_sigma_AA.grad)  # read .grad from the leaf argument
    print('Loss: %f' % loss.item())
    optimizer.step()


main()
```
Still works with your code:

```text
Grad A: 0.940426
Loss: 2.243513
Grad A: 0.939637
Loss: 2.243998
Grad A: 0.939842
Loss: 2.243357
Grad A: 0.938279
Loss: 2.244367
Grad A: 0.935748
Loss: 2.246241
Grad A: 0.939467
Loss: 2.241770
Grad A: 0.939254
Loss: 2.241145
Grad A: 0.937665
Loss: 2.241820
Grad A: 0.942895
Loss: 2.235607
Grad A: 0.934549
Loss: 2.242907
Grad A: 0.937016
Loss: 2.239339
Grad A: 0.937262
Loss: 2.237943
Grad A: 0.932476
Loss: 2.241532
Grad A: 0.934655
Loss: 2.238118
Grad A: 0.933905
Loss: 2.237595
Grad A: 0.935085
Loss: 2.235111
Grad A: 0.936593
Loss: 2.232268
Grad A: 0.939101
Loss: 2.228399
Grad A: 0.938238
Loss: 2.227876
Grad A: 0.939020
Loss: 2.225688
```
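For reference, the loss line looks like the learnable log-sigma weighting from uncertainty-based multi-task training (à la Kendall et al.), with sigma = exp(log_sigma). Reconstructed from that line, the full two-task objective would presumably be:

```latex
L_{\text{total}} = \frac{1}{2\sigma_A} L_A + \frac{1}{2\sigma_B} L_B
                 + \log\sigma_A + \log\sigma_B,
\qquad \sigma_i = \exp(\log\sigma_i)
```

The repro above keeps only task A's data term, since only task A is evaluated.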
Are you trying to print the values after loss.backward()? Before that call there are no gradients — .grad is still None.
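A minimal sketch of that ordering (illustrative tensor, CPU only):

```python
import torch

x = torch.tensor([1.], requires_grad=True)  # leaf tensor
loss = (x * 3).sum()

print(x.grad)    # None — backward() has not run yet
loss.backward()  # gradients are populated here
print(x.grad)    # tensor([3.])
```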
Your code fails because you are reassigning log_sigma_A, shadowing the variable from the outer function scope. Inside your function the name points to a non-leaf tensor (the CUDA copy of log_sigma) rather than the original leaf tensor defined outside the function, and autograd only populates .grad on leaf tensors.
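A minimal sketch of the leaf vs. non-leaf behaviour, assuming a CUDA device is available (names are illustrative):

```python
import torch

log_sigma = torch.tensor([1.]).requires_grad_()  # leaf tensor, created by the user
log_sigma_gpu = log_sigma.cuda()                 # non-leaf: output of a copy op

print(log_sigma.is_leaf, log_sigma_gpu.is_leaf)  # True False

loss = (log_sigma_gpu * 2).sum()
loss.backward()

print(log_sigma.grad)      # tensor([2.]) — autograd accumulates on the original leaf
print(log_sigma_gpu.grad)  # None (PyTorch warns when reading .grad of a non-leaf)
```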