Model trains very slowly after adding a custom LSTM layer

I have created a custom nn.Module that I inserted into a VGG network. However, training runs very slowly after adding this custom LSTM layer. Essentially, the layer builds an LSTM cell for each row of the input, and its weights are two learnable convolutions: an input-to-state convolution that convolves the input, and a state-to-state convolution that convolves the previous LSTM cell's hidden state. These two results are added, and the gates of the current LSTM cell are computed using the equations defined in DeepMind's Pixel RNN paper. I am wondering why the model learns so slowly. I have included the code below.
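
For reference, this is the Row LSTM recurrence from the Pixel RNN paper as I understand it, written in the same gate order the code uses (* is convolution, ⊙ is elementwise multiplication, and sigma becomes tanh for the candidate gate g_i):

    [i_i, o_i, f_i, g_i] = sigma(K_is * x_i + K_ss * h_{i-1})   # sigma -> tanh for g_i
    c_i = f_i ⊙ c_{i-1} + i_i ⊙ g_i
    h_i = o_i ⊙ tanh(c_i)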

Here is the module that creates the convolutions and LSTM cells; the definition of my custom LSTM cell follows after it.

import torch
import torch.nn as nn

class RLSTM(nn.Module):
    def __init__(self, ch):
        super(RLSTM, self).__init__()
        self.ch = ch
        # input-to-state (K_is) and state-to-state (K_ss) convolutions, both 1x3
        self.input_to_state = nn.Conv2d(self.ch, 4 * self.ch, kernel_size=(1, 3), padding=(0, 1)).cuda()
        self.state_to_state = nn.Conv2d(self.ch, 4 * self.ch, kernel_size=(1, 3), padding=(0, 1)).cuda()
        self.cell_list = []

    def forward(self, image):
        # run the Row LSTM over each image in the batch separately
        b = image.size(0)
        indvs = list(image.split(1, 0))
        tensor_array = []
        for i in range(b):
            tensor_array.append(self.RowLSTM(indvs[i]))
        return torch.cat(tuple(tensor_array), 0).cuda()

    def RowLSTM(self, image):
        # input-to-state (K_is * x_i): one 1x3 convolution over the whole image
        # produces a 4h x n x n tensor holding the input contribution for every row
        self.cell_list = []
        n = image.size(2)
        ch = image.size(1)
        for i in range(n):
            if i == 0:
                isgates = self.splitIS(self.input_to_state(image))  # convolve, then split into gates (4 per row)
                i0, o0, f0, g0 = isgates[0]  # first row has no previous state
                cell = RowLSTMCell(0, i0, o0, f0, g0,
                                   torch.zeros(ch, n, 1).cuda(),
                                   torch.zeros(ch, n, 1).cuda())
                # apply the gate nonlinearities here too, so the first row matches the later rows
                cell.c = torch.sigmoid(i0) * torch.tanh(g0)
                cell.h = torch.tanh(cell.c) * torch.sigmoid(o0)
                self.cell_list.append(cell)
            else:
                cell_prev = self.cell_list[i - 1]
                hid_prev = cell_prev.getHiddenState()
                # state-to-state (K_ss * h_{i-1}): convolve the previous row's hidden state
                ssgates = self.splitSS(self.state_to_state(hid_prev.unsqueeze(0)))
                gates = self.addGates(isgates, ssgates, i)
                ig, og, fg, gg = gates
                cell = RowLSTMCell(cell_prev, ig, og, fg, gg, 0, 0)  # c and h are filled in by compute()
                cell.compute()
                self.cell_list.append(cell)

        # now have a list of all cells; concatenate the hidden states into 1 x h x n x n
        hidden_layers = []
        for i in range(n):
            hidden_layers.append(torch.unsqueeze(self.cell_list[i].h, 0))
        return torch.cat(tuple(hidden_layers), 3)
    
    def splitIS(self, tensor):  # always splitting into 4 gates, so no extra parameters needed
        inputStateGates = {}
        size = tensor.size()  # 1 x 4h x n x n
        out_ft = size[1]  # 4h
        num = size[2]  # n
        hh = out_ft // 4  # integer division: one chunk of h channels per gate
        tensor = torch.squeeze(tensor).cuda()  # 4h x n x n

        # first, split by row: creates n tensors of 4h x n x 1
        rows = list(tensor.split(1, 2))

        for i in range(num):
            # each row is a 4h x n x 1 tensor; split it into 4 gates of h x n x 1
            inputStateGates[i] = list(rows[i].split(hh, 0))

        return inputStateGates


    def splitSS(self, tensor):  # 1 x 4h x n x 1 -> 4 tensors of h x n x 1
        size = tensor.size()
        out_ft = size[1]  # 4h
        hh = out_ft // 4  # integer division: one chunk of h channels per gate
        tensor = tensor.squeeze(0).cuda()  # 4h x n x 1
        return list(tensor.split(hh, 0))


    def addGates(self, i2s, s2s, key):
        """ i2s is a dict of form {row : [i, o, f, g]} where each gate is h x n x 1;
            s2s is a list [i, o, f, g] for the current row.
            Add the pairwise elements, then apply the gate nonlinearities. """
        gateSum = []
        for j in range(4):  # always of length 4, representing the gates
            gateSum.append(i2s[key][j] + s2s[j])
        # i, o, f pass through a sigmoid; the candidate g uses tanh, per the paper
        return [torch.sigmoid(gateSum[0]), torch.sigmoid(gateSum[1]),
                torch.sigmoid(gateSum[2]), torch.tanh(gateSum[3])]

Next, here is the definition of the LSTM cell.

class RowLSTMCell():  # could this inherit from torch.nn.Module instead?
    def __init__(self, prev_row, i, o, f, g, c, h):
        self.c = c
        self.h = h
        self.i = i.cuda()
        self.o = o.cuda()
        self.f = f.cuda()
        self.g = g.cuda()
        self.prev_row = prev_row

    def compute(self):
        # standard LSTM update: c_i = f * c_{i-1} + i * g, h_i = o * tanh(c_i)
        c_prev = self.prev_row.getCellState()
        self.c = self.f * c_prev + self.i * self.g
        self.h = torch.tanh(self.c) * self.o

    def getHiddenState(self):
        return self.h

    def getCellState(self):
        return self.c
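
For completeness, here is a minimal way I can exercise the layer on its own (the batch size, channel count, and spatial size are just illustrative):

    layer = RLSTM(128)
    x = torch.randn(2, 128, 16, 16).cuda()  # illustrative: batch of 2, 128 channels, 16x16 feature maps
    out = layer(x)
    print(out.size())  # expected: 2 x 128 x 16 x 16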

I add the RLSTM layer between two specific convolutions of the VGG-16 model, but the training process then becomes very slow. I am wondering why that is. Any help is much appreciated.
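
For context, this is roughly how the layer gets spliced in; the torchvision model, insertion index, and channel count here are illustrative rather than my exact code (index 17 sits just after the third max-pool of vgg16.features, where the feature maps have 256 channels):

    import torch.nn as nn
    from torchvision import models

    vgg = models.vgg16(pretrained=True)
    feats = list(vgg.features.children())
    feats.insert(17, RLSTM(256))  # illustrative position: after the third pooling stage
    vgg.features = nn.Sequential(*feats).cuda()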