How to speedup the custom convolution process?

I am trying to perform a custom convolution operation. I am using a c++ extension to perform the convolution operation using a lookup table. The trained model is an Encoder-Decoder architecture, with encoder as DenseNet-161.

Following are the input and weight dim for the 1st layer of decoder, over which I’m doing custom convolution op.

input dimension = [1,2208,15,20]
kernel dim = [1104,2208,1,1]

And this is the manner in which I reshape them and call the c++ extension for custom conv2d function:

img = F.pad(input= input, pad= (pad, pad, pad, pad), mode='constant', value= 0)

patches = img.unfold(2,kh,dh).unfold(3,kw,dw).reshape(batch_size, in_channels*kh*kw, -1)
result = torch.zeros(batch_size, out_channel, out_h, out_w)
kernel = kernel.reshape(out_channel, -1)

result = custom_conv.conv_two_d(patches, kernel, lookup_table, out_h, out_w)

The c++ implementation is given below:

#include <torch/extension.h>
#include <torch/types.h>
#include <vector>
#include <THC/THC.h>
#include <iostream>
#include <ATen/NativeFunctions.h>
#include <ATen/Config.h>
#include <cuda_runtime_api.h>


torch::Tensor conv_two_d(torch::Tensor input,
                           torch::Tensor weights,
                           torch::Tensor lookup_table,
                           int64_t outh, int64_t outw) {

    int64_t batch_size = input.size(0);
    int64_t out_channels = weights.size(0);
    torch::Tensor output = torch::zeros(torch::IntArrayRef({batch_size, out_channels, outh, outw})).cuda();
    torch::Tensor result = torch::zeros(torch::IntArrayRef({out_channels, outh*outw})).cuda();
    input = input.cuda();
    weights = weights.cuda();
    int sign=0;
    time_t start, end;
    std::cout<<"\n Starting..."<<std::endl;

    for (int elt=0; elt<batch_size;elt++){
        torch::Tensor input_n = input[elt].cuda();
        for(int i=0; i<weights.size(0); i++){
            start = time(NULL);

            for(int j=0; j<input_n.size(1); j++){
                int64_t r=0;

                for(int k=0; k<weights.size(1);k++){

                    float t1 = weights[i][k].item<float>();
                    float t2 = input_n[k][j].item<float>();
                    t1 = round(t1*1000); t2 = round(t2*1000);

                    if(t1>255){t1=255;} if(t1<-255){t1=-255;}                   
                    if(t2>255){t2=255;} if(t2<-255){t2=-255;}

                    if(t1>0 && t2>0 || t1<0 && t2<0){
                        t1=abs(t1); t2=abs(t2);

                    if(t1<0 && t2>0 || t1>0 && t2<0){
                        t1=abs(t1); t2=abs(t2);

                    r+= lookup_table[t1][t2].item<int>()*sign;

                result[i][j] = r/1000000;

            end = time(NULL);
            std::cout<<"Time taken for iteration:"<<i<<" in batch:"<<elt<<" is:"<<end-start<<std::endl;
        output[elt].add_(result.reshape(torch::IntArrayRef({out_channels, outh, outw})));
    return output;

    m.def("conv_two_d", &conv_two_d,"conv forward (CUDA)");

The time taken to complete each iteration is ~10-11 seconds… which in turn takes ~3.5 hrs to complete inference over a single image. (as shown below)

My doubts are:

1) What can I possibly do to increase the speed at which the inference is done?
I have looked into the possibility of FFT_conv but I got to know that it’s mostly for larger kernels and images and saw it mentioned that, performing FFT_conv on smaller kernels might actually slow down the convolution process further.

2) Is there a way in which I can reduce the inference time for smaller kernels?
The model on which I am doing inference has only 1x1 and 3x3 kernels.