How to stack Bidirectional GRU layers with different hidden sizes and Residual Connections?

Hi all!
Is it possible to stack bidirectional GRUs with different hidden sizes and also add a residual connection from the output of the layer two levels below (the "L-2" layer) without losing time coherence?

For example:

import torch
import torch.nn as nn

class Encoder(nn.Module):
  def __init__(self, input_size_encoder,hidden_size,bidirectional):
    super(Encoder, self).__init__()
    self.hidden_size = hidden_size
    self.input_size_encoder = input_size_encoder
    self.bidirectional = bidirectional

    # input Layer N (iLN) | output Layer N (oLN)
    self.iL1= self.input_size_encoder
    self.oL1= self.hidden_size
    self.iL2= self.iL1*2 + self.oL1*2 #(Bidirectional)
    self.oL2= self.iL2 
    self.iL3= self.oL2*2 + self.oL1*2 + self.iL1*2   #(Bidirectional)
    self.oL3= self.iL3

    self.gru1 = nn.GRU(self.iL1,self.oL1, batch_first=True, bidirectional= self.bidirectional)
    self.gru2 = nn.GRU(self.iL2,self.oL2, batch_first=True, bidirectional = self.bidirectional)
    self.gru3 = nn.GRU(self.iL3,self.oL3, batch_first=True, bidirectional = self.bidirectional)
    
  def forward(self, x):
    output1, h_n1 = self.gru1(x) 
    # concatenate x to the fw & bw halves of output1
    fw1_res = torch.cat((x, output1[:,:,self.oL1:]), dim=2)  # x, L1
    bw1_res = torch.cat((output1[:,:,:self.oL1], x), dim=2)  # L1, x
    output1_residual = torch.cat((fw1_res, bw1_res), dim=2)
    output2, h_n2 = self.gru2(output1_residual)
    # concatenate x & output1 to the fw & bw halves of output2
    fw2_res = torch.cat((x, output1[:,:,self.oL1:], output2[:,:,self.oL2:]), dim=2)  # x, L1, L2
    bw2_res = torch.cat((output2[:,:,:self.oL2], output1[:,:,:self.oL1], x), dim=2)  # L2, L1, x
    output2_residual = torch.cat((fw2_res, bw2_res), dim=2)
    output3, h_n3 = self.gru3(output2_residual)
    return output3,[h_n1,h_n2,h_n3]

How can I tell nn.GRU() that its input comes from bidirectional outputs?

Thank you!
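In other words: does the next nn.GRU need to be told anything special, or is it enough that its input_size equals 2 * hidden_size of the previous (bidirectional) layer plus the width of whatever is concatenated as a residual? A tiny two-layer sketch of that second reading (the sizes here are arbitrary placeholders, not the ones from my encoder):

import torch
import torch.nn as nn

x = torch.randn(4, 10, 8)                                    # (batch, time, features)
gru1 = nn.GRU(8, 16, batch_first=True, bidirectional=True)   # output width: 2 * 16 = 32
out1, _ = gru1(x)
res1 = torch.cat((x, out1), dim=2)                           # residual width: 8 + 32 = 40
gru2 = nn.GRU(40, 20, batch_first=True, bidirectional=True)  # input_size must be 40
out2, _ = gru2(res1)
print(out2.shape)                                            # torch.Size([4, 10, 40])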

I figured it out:

class Encoder(nn.Module):
  def __init__(self, input_size_encoder,hidden_size,bidirectional):
    super(Encoder, self).__init__()
    self.hidden_size = hidden_size
    self.input_size_encoder = input_size_encoder
    self.bidirectional = bidirectional

    # input size of layer N (iLN) | hidden size of layer N (oLN)
    # a bidirectional layer outputs 2*hidden features per step, so the next
    # input size is the total width of everything concatenated before it
    self.iL1 = self.input_size_encoder
    self.oL1 = self.hidden_size
    self.iL2 = self.iL1 + self.oL1*2                 # x + bidirectional output of layer 1
    self.oL2 = self.iL2
    self.iL3 = self.oL2*2 + self.oL1*2 + self.iL1    # bidirectional outputs of layers 2 and 1, plus x
    self.oL3 = self.iL3

    self.gru1 = nn.GRU(self.iL1,self.oL1, batch_first=True, bidirectional= self.bidirectional)
    self.gru2 = nn.GRU(self.iL2,self.oL2, batch_first=True, bidirectional = self.bidirectional)
    self.gru3 = nn.GRU(self.iL3,self.oL3, batch_first=True, bidirectional = self.bidirectional)
    
  def forward(self, x):
    output1, h_n1 = self.gru1(x)
    # residual: concatenate the raw input x to the bidirectional output of layer 1
    output1_residual = torch.cat((x, output1), dim=2)
    output2, h_n2 = self.gru2(output1_residual)
    # residual: concatenate x and the layer-1 output to the bidirectional output of layer 2
    output2_residual = torch.cat((x, output1, output2), dim=2)
    output3, h_n3 = self.gru3(output2_residual)
    return output3,[h_n1,h_n2,h_n3]
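
A quick shape check of the corrected encoder (the batch size, sequence length and feature sizes below are arbitrary; bidirectional=True is assumed, since the size bookkeeping above only adds up in that case):

encoder = Encoder(input_size_encoder=8, hidden_size=16, bidirectional=True)
x = torch.randn(4, 10, 8)   # (batch, time, features)
out, hiddens = encoder(x)
print(out.shape)            # torch.Size([4, 10, 240]) -> 2 * oL3 = 2 * (2*40 + 2*16 + 8)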