My code keeps crashing after a couple of thousand iterations (suddenly all the weights go to nan), but nothing obvious seemed to trigger it, so now I turned on anomaly detection and I get the following error already in the first iteration, but I can’t really see the problem.
[W ..\torch\csrc\autograd\python_anomaly_mode.cpp:60] Warning: Error detected in SqrtBackward. Traceback of forward call that caused the error:
File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.1\plugins\python-ce\helpers\pydev\pydevd.py", line 2131, in <module>
main()
File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.1\plugins\python-ce\helpers\pydev\pydevd.py", line 2122, in main
globals = debugger.run(setup['file'], None, None, is_module)
File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.1\plugins\python-ce\helpers\pydev\pydevd.py", line 1431, in run
return self._exec(is_module, entry_point_fn, module_name, file, globals, locals)
File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.1\plugins\python-ce\helpers\pydev\pydevd.py", line 1438, in _exec
pydev_imports.execfile(file, globals, locals) # execute the script
File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.1\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/Users/Tue/PycharmProjects/Pfold/run_1d_supervised.py", line 109, in <module>
losses = main()
File "C:\Users\Tue\PycharmProjects\Pfold\supervised\main.py", line 71, in main
net = train(net, optimizer, dl_train, loss_fnc, dl_test=dl_test, scheduler=lr_scheduler,ite=ite_start, loss_reg_fnc=loss_reg_fnc)
File "C:\Users\Tue\PycharmProjects\Pfold\supervised\optimization.py", line 72, in train
dists_pred, coords_pred = net(features,mask)
File "C:\Users\Tue\PycharmProjects\Pfold\venv\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\Tue\PycharmProjects\Pfold\supervised\network_vnet.py", line 162, in forward
dists += (tr2DistSmall(x[:,i*3:(i+1)*3,:]),)
File "C:\Users\Tue\PycharmProjects\Pfold\supervised\network_transformer.py", line 159, in tr2DistSmall
D = torch.sqrt(D)
(function print_stack)
Traceback (most recent call last):
File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.1\plugins\python-ce\helpers\pydev\pydevd.py", line 1438, in _exec
pydev_imports.execfile(file, globals, locals) # execute the script
File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.1\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/Users/Tue/PycharmProjects/Pfold/run_1d_supervised.py", line 109, in <module>
losses = main()
File "C:\Users\Tue\PycharmProjects\Pfold\supervised\main.py", line 71, in main
net = train(net, optimizer, dl_train, loss_fnc, dl_test=dl_test, scheduler=lr_scheduler,ite=ite_start, loss_reg_fnc=loss_reg_fnc)
File "C:\Users\Tue\PycharmProjects\Pfold\supervised\optimization.py", line 91, in train
loss.backward()
File "C:\Users\Tue\PycharmProjects\Pfold\venv\lib\site-packages\torch\tensor.py", line 185, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "C:\Users\Tue\PycharmProjects\Pfold\venv\lib\site-packages\torch\autograd\__init__.py", line 125, in backward
Variable._execution_engine.run_backward(
RuntimeError: Function 'SqrtBackward' returned nan values in its 0th output.
The function that it crashes in, is the following:
def tr2DistSmall(Y):
k = Y.shape[1]
Z = Y - torch.mean(Y, dim=2, keepdim=True)
D = torch.sum(Z**2, dim=1).unsqueeze(1) + torch.sum(Z**2, dim=1).unsqueeze(2) - 2*Z.transpose(1,2) @ Z
D = 3*D/k
D[...,torch.arange(D.shape[-1]),torch.arange(D.shape[-1])] = 0
D = torch.relu(D)
D = torch.sqrt(D)
return D
I have checked it on the forward pass, and everything looks fine, there are no nan values at any point, and all values seems reasonable. So can anyone illuminate what I’m doing wrong?
Edit:
I tried making the function it crashes in simpler, but even with the following I still get the problem:
def tr2Dist_new(r):
d = torch.sum(r ** 2, dim=1).unsqueeze(1) + torch.sum(r ** 2, dim=1).unsqueeze(2) - 2 * r.transpose(1,2) @ r
d[..., torch.arange(d.shape[-1]), torch.arange(d.shape[-1])] = 0
d = torch.sqrt(d)
return d