This seems to be the correct solution:

To implement my optimized module in Python:

```
class optimizedDepthwiseFunction(torch.autograd.Function):
    """Autograd wrapper for the optimizedDepthwise CUDA forward kernel.

    The forward pass delegates to ``optimizedDepthwise_cuda.forward``; the
    backward pass is delegated entirely to ATen's ``convolution_backward``,
    so no custom backward kernel is required.
    """

    @staticmethod
    def forward(ctx, input, filter, filterHeight, stride, padding, dilation, groups):
        # Stash tensors and hyper-parameters for the backward pass.
        ctx.save_for_backward(input, filter)
        ctx.conf = {
            "filterHeight": filterHeight,
            "stride": stride,
            "padding": padding,
            "dilation": dilation,
            "groups": groups
        }
        # NOTE(review): padding/dilation/groups are not forwarded to the CUDA
        # kernel — presumably it derives padding from filterHeight. Confirm the
        # kernel's implicit padding matches ctx.conf, otherwise the gradients
        # computed below will not correspond to the forward computation.
        return optimizedDepthwise_cuda.forward(input, filter, filterHeight, stride)

    @staticmethod
    def backward(ctx, grad_output):
        input, filter = ctx.saved_tensors
        conf = ctx.conf
        stride = (conf["stride"], conf["stride"])
        padding = (conf["padding"], conf["padding"])
        dilation = (conf["dilation"], conf["dilation"])
        groups = conf["groups"]

        grad_input = None
        grad_weight = None

        if ctx.needs_input_grad[0]:
            # convolution_backward only needs the *shape* of the input to
            # compute grad_input, so pass a cheap zero-stride dummy tensor.
            dummy_input = grad_output.new_empty(1).expand(input.shape)
            grad_input = torch.ops.aten.convolution_backward(
                grad_output, dummy_input, filter, None,
                stride, padding, dilation,
                False, [0], groups, (True, False, False))[0]

        if ctx.needs_input_grad[1]:
            # Likewise, only the filter's shape is needed for grad_weight.
            dummy_filter = grad_output.new_empty(1).expand(filter.shape)
            grad_weight = torch.ops.aten.convolution_backward(
                grad_output, input, dummy_filter, None,
                stride, padding, dilation,
                False, [0], groups, (False, True, False))[1]

        # One gradient slot per forward() argument; the non-tensor
        # hyper-parameters (filterHeight, stride, padding, dilation, groups)
        # get None.
        return grad_input, grad_weight, None, None, None, None, None
```

```
class optimizedDepthwiseLayer(torch.nn.Module):
    """Depthwise 2-D convolution layer backed by the optimized CUDA kernel.

    Roughly equivalent to ``torch.nn.Conv2d(inputChannel, outputChannel,
    filterHeight, stride, padding=(filterHeight-1)//2, groups=inputChannel,
    bias=False)``, but the forward pass runs through
    ``optimizedDepthwiseFunction``.

    Args:
        inputChannel: number of input channels (== groups for depthwise).
        outputChannel: number of output channels (stored; the filter bank is
            sized by inputChannel, as in the original implementation).
        filterHeight: square kernel size; must be a positive odd integer.
        stride: convolution stride (same in both spatial dimensions).

    Raises:
        ValueError: if filterHeight is not a positive odd integer.
    """

    def __init__(self, inputChannel, outputChannel, filterHeight, stride):
        super().__init__()
        self.inputChannel = inputChannel
        self.outputChannel = outputChannel
        self.filterHeight = filterHeight
        self.stride = stride
        # "Same"-style padding for odd kernels: 3 -> 1, 5 -> 2, 7 -> 3, ...
        # (The original only handled 3 and 5 and left self.padding undefined
        # for every other size, causing an AttributeError later in forward().)
        if filterHeight < 1 or filterHeight % 2 == 0:
            raise ValueError(
                "filterHeight must be a positive odd integer, got %r" % (filterHeight,))
        self.padding = (filterHeight - 1) // 2
        self.dilation = 1
        # One filter per input channel => depthwise convolution.
        self.groups = inputChannel
        # Depthwise filter bank: one (1, kH, kW) kernel per input channel.
        self.filter = torch.nn.Parameter(
            torch.empty((self.inputChannel, 1, self.filterHeight, self.filterHeight),
                        dtype=torch.float))
        self.reset_parameters()

    def reset_parameters(self):
        """Initialize every parameter uniformly in [-stdv, +stdv].

        NOTE(review): stdv divides by inputChannel as well, but for a depthwise
        conv the per-output fan-in is filterHeight**2 (groups == inputChannel),
        so PyTorch's Conv2d default would use 1/sqrt(filterHeight**2).
        Kept as-is to preserve behavior — confirm this is intended.
        """
        stdv = 1.0 / math.sqrt(self.inputChannel * self.filterHeight * self.filterHeight)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input):
        return optimizedDepthwiseFunction.apply(
            input,
            self.filter,
            self.filterHeight,
            self.stride,
            self.padding,
            self.dilation,
            self.groups)
```

And I actually do not need `at::convolution_backward` in the cpp file, because I can directly use `torch.ops.aten.convolution_backward` in my Python module.

```
#include <torch/extension.h>
#include <vector>
#include <array>

// Input-validation helpers. CHECK_INPUT is wrapped in do { } while (0) so it
// expands to a single statement — the original two-statement form
// (`CHECK_CUDA(x); CHECK_CONTIGUOUS(x)`) silently breaks inside a braceless
// `if`, leaving the second check outside the conditional.
#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) do { CHECK_CUDA(x); CHECK_CONTIGUOUS(x); } while (0)

// Implemented in the .cu translation unit.
torch::Tensor optimizedDepthwise_cuda_forward(
    torch::Tensor input,
    torch::Tensor filter,
    int filterHeight,
    int stride);

// Python-facing entry point: validates the tensors, then dispatches to the
// CUDA kernel. Only the forward pass is exposed; the backward pass is handled
// in Python via torch.ops.aten.convolution_backward.
torch::Tensor optimizedDepthwise_forward(
    torch::Tensor input,
    torch::Tensor filter,
    int filterHeight,
    int stride) {
  CHECK_INPUT(input);
  CHECK_INPUT(filter);
  return optimizedDepthwise_cuda_forward(
      input,
      filter,
      filterHeight,
      stride);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &optimizedDepthwise_forward, "Optimized Depthwise forward (CUDA)");
}
```

Could you please help me double-check whether my implementation logic contains any mistakes? Thank you!