Self-defined C++/CUDA convolution extension does not backpropagate correctly through torch.autograd.Function

import torch
from torch.autograd.function import Function, once_differentiable
from torch.nn.modules.utils import _pair

import BaseConv

class Conv2dFunction(Function):

    @staticmethod
    def forward(ctx, input, weight, bias, stride=1, padding=0, dilation=1):
        # cache the conv hyper-parameters on ctx and save the tensors needed by backward
        ctx.kernel_size = _pair(weight.shape[2:4])
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.save_for_backward(input, weight, bias)
        
        # BaseConv.forward returns a one-element sequence holding the output tensor
        output, = BaseConv.forward(
            input, weight, bias,
            ctx.kernel_size[0], ctx.kernel_size[1],
            ctx.stride[0], ctx.stride[1],
            ctx.padding[0], ctx.padding[1],
            ctx.dilation[0], ctx.dilation[1],
        )
        return output
    
    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        input, weight, bias = ctx.saved_tensors

        grad_input, grad_weight, grad_bias = BaseConv.backward(
            input, weight, bias, grad_output,
            ctx.kernel_size[0], ctx.kernel_size[1],
            ctx.stride[0], ctx.stride[1],
            ctx.padding[0], ctx.padding[1],
            ctx.dilation[0], ctx.dilation[1],
        )

        # one gradient per forward() argument; None for stride, padding and dilation
        return grad_input, grad_weight, grad_bias, None, None, None

If I call BaseConv.forward() and BaseConv.backward() directly, there is no problem at all. However, when I call Conv2dFunction.apply(), take loss = torch.sum(output) and then call loss.backward(), the resulting gradient is highly unstable.

seed = 17

torch.manual_seed(seed)
x1 = torch.rand(1,1,1,3).float().cuda()
w1 = torch.ones(1,1,1,1).float().cuda()
b1 = torch.ones(1).float().cuda()
x1.requires_grad = True
w1.requires_grad = True
b1.requires_grad = True
o1 = Conv2dFunction.apply(x1, w1, b1, (1,1), (0,0), (1,1))
l1 = torch.sum(o1)
l1.backward()
print(o1)
print(x1.grad)

torch.manual_seed(seed)
x2 = torch.rand(1,1,1,3).float().cuda()
w2 = torch.ones(1,1,1,1).float().cuda()
b2 = torch.ones(1).float().cuda()
x2.requires_grad = True
w2.requires_grad = True
b2.requires_grad = True
o2 = BaseConv.forward(x2, w2, b2, 1, 1, 1, 1, 0, 0, 1, 1)
g2 = BaseConv.backward(x2, w2, b2, torch.ones(1,1,1,3).cuda(), 1, 1, 1, 1, 0, 0, 1, 1)
o1 = [[[[1.4342, 1.5351, 1.8302]]]]  # output of Conv2dFunction.apply
g1 = [[[[1.0000, 1.5351, 1.8302]]]]  # x1.grad after loss.backward() -- unstable, usually wrong
o2 = [[[[1.4342, 1.5351, 1.8302]]]]  # output of the direct BaseConv.forward call
g2 = [[[[1., 1., 1.]]]]              # grad_input from the direct BaseConv.backward call -- as expected

What's more, when I run it again and again with exactly the same input, the result of the first way keeps changing: sometimes it is correct, but in most cases it is wrong. Some typical results are [[[[1.0000, 0.5351, 0.8302]]]], [[[[1., 1., 1.]]]] and [[[[1.4342, 1.5351, 1.8302]]]]. (Since the kernel is 1x1 with weight 1 and the loss is a plain sum, x1.grad should simply be all ones, which is exactly what the direct call to BaseConv.backward returns.) So what is wrong here? I have been confused for a long time and I need your help.
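
In case it helps to reproduce the problem deterministically, this is how I would numerically check the backward pass with torch.autograd.gradcheck (just a sketch; it assumes the BaseConv extension also accepts double-precision tensors):

import torch
from torch.autograd import gradcheck

# gradcheck needs double precision for reliable finite differences;
# this assumes the BaseConv kernels can handle double tensors
x = torch.rand(1, 1, 1, 3, dtype=torch.double, device='cuda', requires_grad=True)
w = torch.rand(1, 1, 1, 1, dtype=torch.double, device='cuda', requires_grad=True)
b = torch.rand(1, dtype=torch.double, device='cuda', requires_grad=True)

# compares the analytic gradients from Conv2dFunction.backward against
# finite-difference gradients of the forward pass
conv = lambda x, w, b: Conv2dFunction.apply(x, w, b, (1, 1), (0, 0), (1, 1))
print(gradcheck(conv, (x, w, b), eps=1e-6, atol=1e-4))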