My model is getting slower

Greetings,
I am building my own model with PyTorch.
It is based on an LSTM combined with convolutions.
It is still incomplete, but it works well.
However, the model gets slower with every forward pass.
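This is roughly how I observe it (a simplified sketch; `model` and `loader` stand in for my actual objects):

import time
import torch

# each forward pass takes longer than the previous one
for step, (x, energy, target) in enumerate(loader):
    start = time.time()
    output = model(x, energy)
    torch.cuda.synchronize()             # wait for the GPU so the timing is meaningful
    print(step, time.time() - start)     # this value keeps growing step after step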

Additionally, I call .backward(retain_graph=True) because of the following error message:
“Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.”
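My training loop looks roughly like this (a simplified sketch; the loss, optimizer and loader here are placeholders, not my real setup):

import torch
import torch.nn as nn

# simplified training loop; criterion, optimizer and loader stand in for my real setup
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for x, energy, target in loader:
    output = model(x, energy)
    loss = criterion(output, target)
    optimizer.zero_grad()
    loss.backward(retain_graph=True)     # added only because of the error quoted above
    optimizer.step()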

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class DLSTMCell(nn.Module):

    def __init__(self, input_size, hidden_size, x_kernel_size, h_kernel_size, residual_length_, stride=1):
        super(DLSTMCell, self).__init__()
        pad_x = math.floor(x_kernel_size / 2)
        pad_h = math.floor(h_kernel_size / 2)
        self.hidden_size = hidden_size
        self.stride = stride
        self.residual_length = residual_length_
        self.output_size = int(hidden_size * residual_length_)

        # input gate
        self.conv_i_x = nn.Parameter(torch.FloatTensor(input_size, self.hidden_size)).cuda()
        self.conv_i_x_bias = nn.Parameter(torch.FloatTensor(self.hidden_size)).cuda()
        self.batchnorm_i_x = nn.BatchNorm1d(hidden_size)
        self.conv_i_h = nn.Conv1d(self.output_size, hidden_size, h_kernel_size, stride=1, padding=pad_h)
        self.batchnorm_i_h = nn.BatchNorm1d(hidden_size)

        # forget gate
        self.conv_f_x = nn.Parameter(torch.FloatTensor(input_size, self.hidden_size)).cuda()
        self.conv_f_x_bias = nn.Parameter(torch.FloatTensor(self.hidden_size)).cuda()
        self.batchnorm_f_x = nn.BatchNorm1d(hidden_size)
        self.conv_f_h = nn.Conv1d(self.output_size, hidden_size, h_kernel_size, stride=1, padding=pad_h)
        self.batchnorm_f_h = nn.BatchNorm1d(hidden_size)

        # cell gate
        self.conv_c_x = nn.Parameter(torch.FloatTensor(input_size, self.hidden_size)).cuda()
        self.conv_c_x_bias = nn.Parameter(torch.FloatTensor(self.hidden_size)).cuda()
        self.batchnorm_c_x = nn.BatchNorm1d(hidden_size)
        self.conv_c_h = nn.Conv1d(self.output_size, hidden_size, h_kernel_size, stride=1, padding=pad_h)
        self.batchnorm_c_h = nn.BatchNorm1d(hidden_size)

        # output gate
        self.conv_o_x = nn.Parameter(torch.FloatTensor(input_size, self.hidden_size)).cuda()
        self.conv_o_x_bias = nn.Parameter(torch.FloatTensor(self.hidden_size)).cuda()
        self.batchnorm_o_x = nn.BatchNorm1d(hidden_size)
        self.conv_o_h = nn.Conv1d(self.output_size, hidden_size, h_kernel_size, stride=1, padding=pad_h)
        self.batchnorm_o_h = nn.BatchNorm1d(hidden_size)

        self.last_cell = None

    def reset_state(self):
        self.last_cell = None

    def reset_h_list(self):
        # history of hidden states; a new step is concatenated on every forward call
        self.register_buffer('h_list', torch.zeros(8, self.residual_length, self.hidden_size))
        self.h_list = self.h_list.cuda()

    def forward(self, x):
        batch_size = x.size(0)

        # first sequence
        if self.last_cell is None:
            self.last_cell = Variable(torch.zeros(batch_size, self.hidden_size))
            self.conv_i_x_bias = self.conv_i_x_bias.unsqueeze(0).expand(batch_size, *self.conv_i_x_bias.size())
            self.conv_f_x_bias = self.conv_f_x_bias.unsqueeze(0).expand(batch_size, *self.conv_f_x_bias.size())
            self.conv_c_x_bias = self.conv_c_x_bias.unsqueeze(0).expand(batch_size, *self.conv_c_x_bias.size())
            self.conv_o_x_bias = self.conv_o_x_bias.unsqueeze(0).expand(batch_size, *self.conv_o_x_bias.size())
            self.last_cell = self.last_cell.cuda()
            self.h_list = self.h_list.cuda()

        # use the last residual_length hidden states as the recurrent input
        h = self.h_list[:, -self.residual_length:, :].contiguous().view(batch_size, self.output_size, -1)

        # input gate
        input_h = self.batchnorm_i_h(self.conv_i_h(h))
        input_x = self.batchnorm_i_x(torch.mm(x, self.conv_i_x) + self.conv_i_x_bias)
        input_h = torch.squeeze(input_h)
        input_gate = F.sigmoid(input_x + input_h)

        # forget gate
        forget_x = self.batchnorm_f_x(torch.mm(x, self.conv_f_x) + self.conv_f_x_bias)
        forget_h = self.batchnorm_f_h(self.conv_f_h(h))
        forget_h = torch.squeeze(forget_h)
        forget_gate = F.sigmoid(forget_x + forget_h)

        # cell gate
        cell_x = self.batchnorm_c_x(torch.mm(x, self.conv_c_x) + self.conv_c_x_bias)
        cell_h = self.batchnorm_c_h(self.conv_c_h(h))
        cell_h = torch.squeeze(cell_h)
        cell_intermediate = F.tanh(cell_x + cell_h)  # g
        cell_gate = (forget_gate * self.last_cell) + (input_gate * cell_intermediate)

        # output gate
        output_x = self.batchnorm_o_x(torch.mm(x, self.conv_o_x) + self.conv_o_x_bias)
        output_h = self.batchnorm_o_h(self.conv_o_h(h))
        output_h = torch.squeeze(output_h)
        output_gate = F.sigmoid(output_x + output_h)

        next_h = output_gate * F.tanh(cell_gate)
        self.last_cell = cell_gate
        next_h = torch.unsqueeze(next_h, dim=1)
        self.h_list = torch.cat((self.h_list, next_h), dim=1)
        #print("self.h_list shape: {}".format(self.h_list.shape))

        return next_h

class LSTM(nn.Module):
    """A module that runs multiple steps of LSTM."""

    def __init__(self, cell_class, input_size, hidden_size, x_kernel_size, h_kernel_size, num_layers=1,
                 residual_length=5, use_bias=True, batch_first=True, dropout=0, **kwargs):
        super(LSTM, self).__init__()
        self.cell_class = cell_class
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.x_kernel_size = x_kernel_size
        self.h_kernel_size = h_kernel_size
        self.num_layers = num_layers
        self.use_bias = use_bias
        self.batch_first = batch_first
        self.dropout = dropout
        self.residual_length = residual_length
        self.output_size = hidden_size * residual_length

        for layer in range(num_layers):
            layer_input_size = input_size if layer == 0 else hidden_size
            cell = cell_class(input_size=layer_input_size, hidden_size=hidden_size,
                              x_kernel_size=x_kernel_size, h_kernel_size=h_kernel_size,
                              residual_length_=residual_length, **kwargs)
            setattr(self, 'cell_{}'.format(layer), cell)
        self.dropout_layer = nn.Dropout(dropout)
        self.reset_parameters()

    def get_cell(self, layer):
        return getattr(self, 'cell_{}'.format(layer))

    def reset_parameters(self):
        for layer in range(self.num_layers):
            cell = self.get_cell(layer)

    @staticmethod
    def _forward_rnn(cell, input_, energy_, length):  # energy_: the amount of information of each sequence (attention)
        assert input_.size(0) == energy_.size(0), "Sequence length of input and energy must be the same"
        max_time = input_.size(0)
        output = []
        for time in range(max_time):
            h_next = cell(x=input_[time])
            output.append(h_next)
        output = torch.stack(output, 0)
        #print("output shape: {}".format(output.shape))
        return output

    def forward(self, input_, energy_, length=None, hx=None):
        if self.batch_first:
            input_ = input_.transpose(0, 1)
            energy_ = energy_.transpose(0, 1)
        max_time, batch_size, _ = input_.size()
        if length is None:
            length = Variable(torch.LongTensor([max_time] * batch_size))
            if input_.is_cuda:
                device = input_.get_device()
                length = length.cuda(device)

        layer_output = None
        for layer in range(self.num_layers):
            cell = self.get_cell(layer)
            cell.reset_h_list()
            layer_output = LSTM._forward_rnn(cell=cell, input_=input_, energy_=energy_, length=length)

        layer_output = layer_output[-self.residual_length:, :, :].transpose(0, 1)
        layer_output = layer_output.contiguous().view(batch_size, -1)

        return layer_output
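
For completeness, this is roughly how I build and call the model (the sizes below are placeholders, not my real configuration):

# placeholder sizes, only to show how I call the module; batch size 8 matches h_list above
model = LSTM(DLSTMCell, input_size=40, hidden_size=64,
             x_kernel_size=3, h_kernel_size=3,
             num_layers=1, residual_length=5).cuda()

x = Variable(torch.randn(8, 100, 40).cuda())      # (batch, time, input_size)
energy = Variable(torch.randn(8, 100, 1).cuda())  # same sequence length as x
output = model(x, energy)                         # (batch, hidden_size * residual_length)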

I have changed my question:
Why is the model trying to backward through the graph more than two times?
Thank you.