I have a few questions about implementing custom functions. We need to do something different from gradient descent, so we are trying to implement it within PyTorch's forward/backward machinery, but computing more than just gradients. However, on a very simple example, PyTorch crashes with:
Traceback (most recent call last):
File "/home/alex/work/python/nn-second-order/bin/test.py", line 94, in <module>
loss.backward()
File "/usr/local/lib/python3.4/dist-packages/torch/autograd/variable.py", line 145, in backward
self._execution_engine.run_backward((self,), (gradient,), retain_variables)
RuntimeError: could not compute gradients for some functions
The code we are using looks like this:
import torch
from torch.autograd import Variable
class Gn_SquareLoss(torch.autograd.Function):
    def forward(self, x, y):
        self.save_for_backward(x, y)
        diff = (x - y)
        return diff.pow(2)

    def backward(self, grad_output):
        # N x D
        x, y = self.saved_tensors
        diff = (x - y) * 2
        # N x D
        grad_input = grad_output.clone()
        # N x D
        g = [grad_input * diff, grad_input * diff]
        d = g[0].size()[1]
        # D x D
        m = [torch.eye(d, d) * 2, torch.eye(d, d) * 3]
        # (N+D) x D
        s1 = torch.cat((m[0], g[0]), 0)
        # (N+D) x D
        s2 = torch.cat((m[0], g[0]), 0)
        return s1, s2

class Gn_Dot(torch.autograd.Function):
    def forward(self, x, w):
        self.save_for_backward(x, w)
        return torch.mm(x, w)

    def backward(self, grad_output):
        # N x H, H x D
        x, w = self.saved_tensors
        # (N + D) x D
        grad_input = grad_output.clone()
        d = grad_input.size()[1]
        # D x D
        m = grad_input[:d]
        # N x D
        g = grad_input[d:]
        # N x H
        dx = torch.mm(g, torch.transpose(w, 0, 1))
        # H x D
        dw = torch.mm(torch.transpose(x, 0, 1), g)
        # D x H
        m = torch.mm(m, torch.transpose(w, 0, 1))
        # (D + N) x H
        dx = torch.cat((m, dx), 0)
        return dx, dw

class Gn_Tanh(torch.autograd.Function):
    def forward(self, x):
        self.save_for_backward(x)
        return torch.tanh(x)

    def backward(self, grad_output):
        x, = self.saved_tensors
        grad_input = grad_output.clone()
        d = grad_input.size()[1]
        m = grad_input[:d]
        g = grad_input[d:]
        dx = torch.cat((m, g * (1 - torch.tanh(x).pow(2))), 0)
        return dx

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)
# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)
affine = Gn_Dot()
tanh = Gn_Tanh()
square_loss = Gn_SquareLoss()
learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Variables; the
    # affine map, tanh, and squared loss use our custom autograd operations.
    y1 = affine(x, w1)
    h1 = tanh(y1)
    y2 = affine(h1, w2)
    loss1 = (y - y2).pow(2).sum()
    loss = square_loss(y2, y).sum()
    print(t, loss.data[0], loss1.data[0])

    # Manually zero the gradients before running the backward pass
    w1.grad.data.zero_()
    w2.grad.data.zero_()

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
Note that, in order to satisfy PyTorch's requirement that backward return as many gradients as the function has inputs, we concatenate the extra quantity we compute (for testing purposes this is just an identity matrix) onto the gradient, and then in each layer's backward we peel it off again.
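For illustration, here is a minimal sketch of the packing and unpacking we have in mind, using toy shapes and standalone tensors rather than the real backward pass (the names packed, m_part, and g_part exist only in this example):
import torch
N, D = 4, 3
g = torch.randn(N, D)          # ordinary gradient block, N x D
m = torch.eye(D)               # extra quantity we also want to propagate, D x D
packed = torch.cat((m, g), 0)  # (D + N) x D, the kind of tensor our backward returns
# The next backward in the chain peels the two blocks apart again:
m_part = packed[:D]            # D x D
g_part = packed[D:]            # N x D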
Now the main issue is that, as far as we can tell, this should work, yet the error PyTorch gives us is a mystery. My guess is that something goes wrong in the C API, but I do not know what.