I am trying to make a simple Taylor series layer for my neural network but am unable to test it out because the weights become NaNs on the first backward pass.
Here is the code:
```python
import math
import torch
import torch.nn as nn


class Maclaurin(nn.Module):
    """Maclaurin Series Layer, first draft."""

    def __init__(self):
        super().__init__()
        weights = torch.Tensor(1, 30)
        bias = torch.Tensor(1)
        self.bias = nn.Parameter(bias)
        scal = torch.arange(0, 30)   # powers of the Maclaurin series
        self.scal = scal.to(device)  # `device` is defined elsewhere

        # initialize weights & biases
        nn.init.kaiming_uniform_(weights, a=math.sqrt(5))  # weight init
        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weights)
        bound = 1 / math.sqrt(fan_in)
        nn.init.uniform_(self.bias, -bound, bound)  # bias init
        weights = torch.transpose(weights, 0, 1)
        self.weights = nn.Parameter(weights)

    def forward(self, x):
        xr = x.repeat(1, 30)             # extend values to create the series for each point
        xr = torch.pow(xr, self.scal)    # raise each term in the series to its power
        wx = torch.mm(xr, self.weights)  # multiply columns by weights
        return torch.add(wx, self.bias)  # w times x + b


class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_out):
        super(NeuralNet, self).__init__()
        self.taylor = Maclaurin()
        self.taylor.cuda()

    def forward(self, x):
        out = self.taylor(x)
        return out
```
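For reference, the three tensor operations in `forward` reduce to the following self-contained sketch (run on CPU with random stand-in coefficients; the batch size of 64 is an assumption taken from the shapes I checked):

```python
import torch

torch.manual_seed(0)
x = torch.rand(64, 1)               # batch of scalar inputs
scal = torch.arange(0, 30)          # exponents 0..29
weights = torch.randn(30, 1) * 0.1  # stand-in column vector of coefficients
bias = torch.zeros(1)

xr = x.repeat(1, 30)                # [64, 30]: one column per series term
xr = torch.pow(xr, scal)            # x**0, x**1, ..., x**29 per row
wx = torch.mm(xr, weights)          # [64, 1]: weighted sum of the terms
out = torch.add(wx, bias)           # [64, 1]
```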
I checked all of the sizes of the tensors for consistency here:
```
step         x          weights
initial      [64, 1]    [30, 1]
x.repeat     [64, 30]   [30, 1]
powers       [64, 30]   [30, 1]
x*weights    [64, 1]    [30, 1]
sum rows                [30, 1]
```
Here is the output when I use `torch.autograd.set_detect_anomaly(True)` (I removed the file info).
```
wx = torch.mm(xr, self.weights)
 (function _print_stack)
Traceback (most recent call last):
  File "", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "", line 1, in <module>
    runfile("')
  File "", line 197, in runfile
    pydev_imports.execfile(filename, global_vars, local_vars)  # execute the script
  File "", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "", line 113, in <module>
    loss.backward()  # backpropagation, compute gradients
  File "", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: Function 'MmBackward' returned nan values in its 1th output.
```
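As a standalone illustration of what anomaly mode reports (not my network, just a minimal reproduction with a different op): the gradient of `sqrt` at 0 is infinite, and scaling it by zero turns it into `nan`, which anomaly mode catches and attributes to the offending backward function:

```python
import torch

torch.autograd.set_detect_anomaly(True)

# gradient of sqrt at 0 is inf; multiplying by 0 downstream yields 0 * inf = nan
x = torch.tensor([0.0], requires_grad=True)
y = torch.sqrt(x)
try:
    (y * 0).sum().backward()
except RuntimeError as e:
    msg = str(e)  # names the backward op that produced the nan

torch.autograd.set_detect_anomaly(False)
```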
Checking for infinite values in the gradient gave me this
```
taylor.bias     tensor(True, device='cuda:0')
taylor.weights  tensor(False, device='cuda:0')
```
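The check that produced this output was along these lines (a sketch; the `nn.Linear` model here is a hypothetical stand-in just to make it runnable):

```python
import torch
import torch.nn as nn


def report_inf_grads(model):
    # print, for each parameter, whether its gradient contains any inf values
    for name, param in model.named_parameters():
        if param.grad is not None:
            print(name, torch.isinf(param.grad).any())


# tiny stand-in model just to show the output format
m = nn.Linear(2, 1)
m(torch.ones(1, 2)).sum().backward()
report_inf_grads(m)
```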
So the check registers the bias going to infinity on the first iteration, but the anomaly reported during backpropagation points at the `torch.mm` call.
In some previous iterations, I got rid of all of these problems by leaving the weights as a row vector and multiplying them by a diagonal matrix before the matrix multiplication, but then the network stopped learning the weights (it still learned the bias normally).
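That earlier variant looked roughly like this (a reconstruction under assumptions, since I no longer have that code; the shapes match the table above):

```python
import torch

torch.manual_seed(0)
x = torch.rand(64, 1)
xr = torch.pow(x.repeat(1, 30), torch.arange(0, 30))  # [64, 30]

w_row = (torch.randn(1, 30) * 0.1).requires_grad_()   # weights kept as a row vector

wd = torch.mm(xr, torch.diag(w_row.squeeze(0)))       # scale column j by weight j
out = wd.sum(dim=1, keepdim=True)                     # [64, 1]: sum the scaled terms
```

Mathematically this is the same weighted sum as `torch.mm(xr, w_row.t())`, just routed through a diagonal matrix.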
- I already tried a very small learning rate, so I don't think that is the problem.
- The bias and half of the weights become NaNs by the second iteration; all of the weights are NaNs by the third.
- Most loss functions seem to have this problem, but some, such as `torch.nn.SmoothL1Loss()`, do not (as long as the number of terms in the series is less than 40), so it would be interesting to know whether this has something to do with the loss function.
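One guess about why the term count matters (an assumption on my part, not something the traces above confirm): the high-order terms can overflow float32, since `x**n` for `n` around 40 exceeds the float32 maximum (~3.4e38) for moderately large `|x|`, and an `inf` in the forward activations becomes `nan` in the matmul gradient:

```python
import torch

x = torch.tensor([12.0])  # moderately large float32 input
t35 = torch.pow(x, 35)    # ~5.9e37: still finite in float32
t40 = torch.pow(x, 40)    # overflows float32 -> inf
print(t35, t40)
```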