When trying to train a network with Meta Pseudo Labels, the following code:
x_i, x_j, x = train_iter
x_i_t, x_j_t, x_t, target_t = test_iter
"""teacher forward pass"""
_, _, _, _, _, _, _, t_out = t_model(x_i_t.to(device1), x_j_t.to(device1), x_t.to(device1))
# labeled teacher loss
t_loss_l = xent_criterion(t_out, target_t.to(device1))
# soft pseudo-labels
_, _, _, _, _, out_i, out_j, _ = t_model(x_i.to(device1), x_j.to(device1), x.to(device1))
spl = torch.softmax(out_i.detach(), dim=-1)
# hard pseudo-labels
max_probs, hpl = torch.max(spl, dim=-1)
# calculate mask
mask = max_probs.ge(0.5).float()
t_loss_u = torch.mean(-(spl * torch.log_softmax(out_j, dim=-1)).sum(dim=-1) * mask)
t_loss_uda = t_loss_l + t_loss_u
"""student optimizer step"""
# these values are re-used by the t-network downstream (fix; compare to 't_logits_us')
_, _, _, _, _, _, _, t_out_s = s_model(x_i_t.to(device0), x_j_t.to(device0), x_t.to(device0))
_, _, _, _, _, out_i_s, out_j_s, _ = s_model(x_i.to(device0), x_j.to(device0), x.to(device0))
s_loss = xent_criterion(out_i_s, hpl.to(device0))
s_loss_l_old = xent_criterion(t_out_s.detach(), target_t.to(device0))
s_optimizer.zero_grad()
s_loss.backward()  # retain_graph=True, inputs=list(s_model.parameters())
s_optimizer.step()
"""student forward pass"""
# TODO: determine relation between t_logits and t_loss_mpl
_, _, _, _, _, _, _, t_out_s = s_model(x_i_t.to(device0), x_j_t.to(device0), x_t.to(device0))
# _, _, _, _, _, out_i, out_j, _ = t_model(x_i.to(device1), x_j.to(device1), x.to(device1))
s_loss_l_new = xent_criterion(t_out_s.detach(), target_t.to(device0))
# dot_prod = s_loss_l_new.detach() - s_loss_l_old.detach() # .detach()
_, hpl = torch.max(out_j.detach(), dim=-1)
t_loss_mpl = xent_criterion(out_j, hpl.to(device1)) # dot_prod.to(device1) *
# t_loss_mpl_t = t_loss_mpl.clone() # .to(device1)
# t_loss_uda_t = t_loss_uda.clone() # .to(device1)
t_loss = t_loss_uda.to(device1) + t_loss_mpl.to(device1)
"""teacher optimizer step"""
t_optimizer.zero_grad()
t_loss.backward(retain_graph=True)  # , inputs=list(t_model.parameters())
t_optimizer.step()
return t_loss.item(), s_loss.item()
results in the following error (with set_detect_anomaly(True) enabled):
Gathering Pseudo Labels: 466/466Step [0/466] Loss: 4.100398063659668
/home/adam/.local/lib/python3.10/site-packages/torch/autograd/__init__.py:173: UserWarning: Error detected in AddmmBackward0. Traceback of forward call that caused the error:
File "/home/adam/contrastive_learner/SimCLR/main.py", line 656, in <module>
main(0, args)
File "/home/adam/contrastive_learner/SimCLR/main.py", line 603, in main
loss_epoch = train(args, train_loader, test_loader, t_model, model, xent_criterion, criterion, optimizer, t_optimizer, save_path) # pseudo_loader
File "/home/adam/contrastive_learner/SimCLR/main.py", line 183, in train
t_loss_mpl, s_loss_mpl = MPL((x_i, x_j, x), (x_i_t, x_j_t, x_t, target_t), t_model, model, t_optimizer, optimizer, xent_criterion)
File "/home/adam/contrastive_learner/SimCLR/main.py", line 93, in MPL
_, _, _, _, _, out_i, out_j, _ = t_model(x_i.to(device1), x_j.to(device1), x.to(device1))
File "/home/adam/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/adam/contrastive_learner/SimCLR/vgg19_64.py", line 129, in forward
out_x_j = self.classifier(z_j)
File "/home/adam/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/adam/.local/lib/python3.10/site-packages/torch/nn/modules/container.py", line 141, in forward
input = module(input)
File "/home/adam/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/adam/.local/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 103, in forward
return F.linear(input, self.weight, self.bias)
(Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:104.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/home/adam/contrastive_learner/SimCLR/main.py", line 656, in <module>
main(0, args)
File "/home/adam/contrastive_learner/SimCLR/main.py", line 603, in main
loss_epoch = train(args, train_loader, test_loader, t_model, model, xent_criterion, criterion, optimizer, t_optimizer, save_path) # pseudo_loader
File "/home/adam/contrastive_learner/SimCLR/main.py", line 183, in train
t_loss_mpl, s_loss_mpl = MPL((x_i, x_j, x), (x_i_t, x_j_t, x_t, target_t), t_model, model, t_optimizer, optimizer, xent_criterion)
File "/home/adam/contrastive_learner/SimCLR/main.py", line 128, in MPL
t_loss.backward(retain_graph=True) # , inputs=list(t_model.parameters()))
File "/home/adam/.local/lib/python3.10/site-packages/torch/_tensor.py", line 363, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/adam/.local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [128, 4]], which is output 0 of AsStridedBackward0, is at version 3; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I believe this error is a result of s_model being updated with respect to the t_model variables; however, I am unsure exactly which part of the code is causing it. Any help is greatly appreciated.
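
For context, here is a hypothetical minimal sketch (a stand-in two-layer net and SGD optimizer, not my actual models) that I believe reproduces the same failure mode: an optimizer.step() runs between a forward pass and a later backward pass, so the in-place parameter update invalidates weights that the retained graph still needs.

import torch
import torch.nn as nn

torch.autograd.set_detect_anomaly(True)

net = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 4))  # stand-in for s_model / t_model
opt = torch.optim.SGD(net.parameters(), lr=0.1)

x = torch.randn(16, 4)
out = net(x)                        # graph saves net[1].weight for the backward pass
loss_a = out.sum()
loss_b = out.pow(2).sum()           # second loss sharing the same graph

loss_a.backward(retain_graph=True)  # populates .grad so step() actually updates
opt.step()                          # in-place update bumps the weights' version counters
loss_b.backward()                   # RuntimeError: one of the variables needed for
                                    # gradient computation has been modified by an inplace operation

I am not certain this is the exact mechanism in my code, but the error message and the AddmmBackward0 entry in the anomaly trace look identical.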