I am writing a custom torch.autograd.Function with a custom backward pass.
import torch as th
from torch.autograd import Function, gradcheck

class Capsule(Function):
    '''
    Function for a single capsule projection layer.
    '''
    @staticmethod
    def forward(ctx, input, weight):
        # sigma = (W^T W)^{-1}, so Pl = W sigma W^T is the orthogonal
        # projector onto the column space of W
        sigma = th.matmul(weight.transpose(1, 0), weight).inverse()
        Pl = th.matmul(weight, th.matmul(sigma, weight.transpose(1, 0)))
        vl = th.matmul(Pl, input)
        output = vl.norm(dim=0)  # one norm per input column
        ctx.intermediate_results = Pl
        ctx.save_for_backward(input, weight, sigma, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, weight, sigma, output = ctx.saved_tensors
        Pl = ctx.intermediate_results
        grad_input = grad_weight = grad_sigma = None
        Wl_inv = th.matmul(sigma, weight.transpose(1, 0)).transpose(1, 0)
        grad = th.matmul(Pl, input)
        grad = grad / output  # d||Pl x|| / dx = Pl x / ||Pl x||
        grad_input = grad_output * grad
        print('b = {}'.format(grad_input))
        I_ = th.eye(Pl.shape[0], Pl.shape[1]).to(th.double)
        IPl = I_ - Pl
        xxt = th.matmul(input / output, input.transpose(1, 0))
        grad_weight = th.matmul(th.matmul(IPl, xxt), Wl_inv)
        # grad_sigma is computed here but never returned
        grad_sigma = -0.5 * th.matmul(
            th.matmul(th.matmul(sigma, weight.transpose(1, 0)), weight), sigma)
        return grad_input, grad_weight
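For context, the forward pass computes, for each column $x_j$ of the input, the norm of its orthogonal projection onto the column space of $W$. Since $P = W(W^\top W)^{-1}W^\top$ is symmetric and idempotent, the input gradient I derived is

$$u_j = \lVert P x_j \rVert, \qquad \frac{\partial u_j}{\partial x_j} = \frac{P^\top P x_j}{\lVert P x_j \rVert} = \frac{P x_j}{u_j},$$

which is what grad = th.matmul(Pl, input) / output implements in backward.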
cap = Capsule.apply
i = th.randn(100, 3, dtype=th.double, requires_grad=True)
w = th.randn(100, 8, dtype=th.double, requires_grad=True)
input = (i, w)
test = gradcheck(cap, input, eps=1e-4, atol=1e-4)
print(test)
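For comparison, a reference version of the same forward built only from differentiable ops (so that autograd derives the gradients itself) can be checked against the printed grad_input. The helper capsule_ref below is a hypothetical sketch, not part of my layer:

def capsule_ref(input, weight):
    # Same computation as Capsule.forward, but left to autograd
    sigma = th.inverse(th.matmul(weight.transpose(1, 0), weight))
    Pl = th.matmul(weight, th.matmul(sigma, weight.transpose(1, 0)))
    return th.matmul(Pl, input).norm(dim=0)

x = th.randn(100, 3, dtype=th.double, requires_grad=True)
w2 = th.randn(100, 8, dtype=th.double, requires_grad=True)
out = capsule_ref(x, w2)
out.sum().backward()  # equivalent to a grad_output of all ones
print(x.grad)         # reference gradient to compare with the printed b = ...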
When I run gradcheck on this implementation, it fails with:
RuntimeError: Jacobian mismatch for output 0 with respect to input 1,
numerical:tensor([[ 0.0168, -0.1056, -0.0221],
[-0.0212, 0.0279, -0.0228],
[-0.0045, 0.0354, 0.0501],
…,
[-0.0010, -0.0099, -0.0074],
[-0.0107, 0.1135, -0.0110],
[ 0.0030, 0.0543, -0.0073]])
analytical:tensor([[-0.1109, -0.1109, -0.1109],
[-0.0161, -0.0161, -0.0161],
[ 0.0809, 0.0809, 0.0809],
…,
[-0.0183, -0.0183, -0.0183],
[ 0.0918, 0.0918, 0.0918],
[ 0.0500, 0.0500, 0.0500]])
However, when I print grad_input manually in the backward function, the values differ from the analytical tensor above. The printed values look reasonable, whereas in the failure message the analytical tensor repeats the same value across all three columns of every row. So what exactly is the analytical tensor in the failure message, isn't it the grad_input computed in backward?
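For what it's worth, my understanding (and maybe this is where I am going wrong) is that gradcheck builds each analytical tensor column by column, calling backward once per output element with a one-hot grad_output, roughly like the sketch below (shown for the first input i; the same would apply to w):

out = cap(i, w)
cols = []
for k in range(out.numel()):
    go = th.zeros_like(out)
    go.view(-1)[k] = 1.0  # one-hot grad_output selecting output element k
    g, = th.autograd.grad(out, i, grad_outputs=go, retain_graph=True)
    cols.append(g.reshape(-1))
analytical = th.stack(cols, dim=1)  # i.numel() rows x out.numel() columns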