LSTM and In-place operation error in loss.backward

I am trying to build a “LSTM” that has the following structure:

class DGM2(nn.Module): # this includes batch dimnesions [batch, dim1, dim2] of the data
    
    def __init__(self, num_inputs, L, M, phi): 
        super(DGM2, self).__init__()
        self.num_inputs = num_inputs
        self.L = L # How many layers
        self.M = M # neurons per hidden layer
        self.phi = phi # activation function

        init_weight = lambda *shape: nn.init.xavier_uniform_(nn.Parameter(torch.empty(*shape)), gain=nn.init.calculate_gain('tanh'))

        double_1 = lambda: ( init_weight(M,num_inputs),
                           init_weight(M,1))
        
        double_2 = lambda: ( init_weight(1,M),
                           init_weight(1,1))

        triple = lambda: ( init_weight(M,num_inputs),
                           init_weight(M,M),
                           init_weight(M,1))

        self.W_1, self.bias_1 = double_1()
        # self.U_z, self.W_z, self.b_z = triple()
        # self.U_g, self.W_g, self.b_g = triple()
        # self.U_r, self.W_r, self.b_r = triple()
        # self.U_h, self.W_h, self.b_h = triple()
        self.W, self.bias = double_2()

        # Because we have  more layers we need a Parameter Dictionary to keep track of all the learnable parameters. !!Find more efficient ways to implement this!!
        self.dict_Z = nn.ParameterDict()
        self.dict_G = nn.ParameterDict()
        self.dict_R = nn.ParameterDict()
        self.dict_H = nn.ParameterDict()


        for i in range(L):
            self.U_z, self.W_z, self.b_z = triple()
            self.dict_Z["U_z{k}".format(k=i)] = self.U_z
            self.dict_Z["W_z{k}".format(k=i)] = self.W_z
            self.dict_Z["b_z{k}".format(k=i)] = self.b_z

            self.U_g, self.W_g, self.b_g = triple()
            self.dict_G["U_g{k}".format(k=i)] = self.U_g
            self.dict_G["W_g{k}".format(k=i)] = self.W_g
            self.dict_G["b_g{k}".format(k=i)] = self.b_g

            self.U_r, self.W_r, self.b_r = triple()
            self.dict_R["U_r{k}".format(k=i)] = self.U_r
            self.dict_R["W_r{k}".format(k=i)] = self.W_r
            self.dict_R["b_r{k}".format(k=i)] = self.b_r

            self.U_h, self.W_h, self.b_h = triple()
            self.dict_H["U_h{k}".format(k=i)] = self.U_h
            self.dict_H["W_h{k}".format(k=i)] = self.W_h
            self.dict_H["b_h{k}".format(k=i)] = self.b_h


    def forward(self, x):
        S=torch.empty(self.L+1, x.shape[0] ,self.M, 1)
        S[0] = self.phi(torch.matmul(self.W_1, torch.transpose(x, 1, 2)) + self.bias_1)
        Z=torch.empty(self.L, x.shape[0],self.M, 1)
        G=torch.empty(self.L, x.shape[0],self.M, 1)
        R=torch.empty(self.L, x.shape[0],self.M, 1)
        H=torch.empty(self.L, x.shape[0],self.M, 1)

        for layer in range(self.L):
            Z[layer] = self.phi(torch.matmul(self.dict_Z.get("U_z{k}".format(k=layer)),torch.transpose(x, 1, 2)) + torch.matmul(self.dict_Z.get("W_z{k}".format(k=layer)),S[layer]) + self.dict_Z.get("b_z{k}".format(k=layer)))
            G[layer] = self.phi(torch.matmul(self.dict_G.get("U_g{k}".format(k=layer)),torch.transpose(x, 1, 2)) + torch.matmul(self.dict_G.get("W_g{k}".format(k=layer)),S[0]) + self.dict_G.get("b_g{k}".format(k=layer)))
            R[layer] = self.phi(torch.matmul(self.dict_R.get("U_r{k}".format(k=layer)),torch.transpose(x, 1, 2)) + torch.matmul(self.dict_R.get("W_r{k}".format(k=layer)),S[layer]) + self.dict_R.get("b_r{k}".format(k=layer)))
            H[layer] = self.phi(torch.matmul(self.dict_H.get("U_h{k}".format(k=layer)),torch.transpose(x, 1, 2)) + torch.matmul(self.dict_H.get("W_h{k}".format(k=layer)),torch.mul(S[layer], R[layer])) + self.dict_H.get("b_h{k}".format(k=layer)))
            S[layer+1] = torch.mul((1 - G[layer]), H[layer]) + torch.mul(Z[layer], S[layer])

        output = torch.matmul(self.W, S[self.L]) + self.bias #.squeeze(-1)
        return output

I have 2 questions regarding the initialisation of the parameters and the forward function.

  1. Is the ParameterDict a good and effiecient way to initialise and keep track of all learnable parameters?
  2. In the forward method I create the torch.empty tensors that I later change in the for loop using in-place operation. This raises a PyTorch error RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation. I figured out using torch.autograd.set_detect_anomaly(True) that the error is cause by the for loop and specifically the line S[layer+1] = torch.mul((1 - G[layer]), H[layer]) + torch.mul(Z[layer], S[layer]). My question is that if i change the torch.empty tensors with python lists the error goes away but is this an efficient way of writting PyTorch code and how it affects complexity of the code in training situation?

Thank you in advance!