Can num_layers and multiple individual LSTMs create the same multi-layer LSTM model?

I am creating a two-layer LSTM in two ways, shown in the two code snippets below (they have the same initial parameters and the same inputs). If num_layers works as explained in "What is num_layers in RNN module?", can anyone tell me why the outputs are not the same? And if num_layers does not work as described there, can anyone tell me what a model built with num_layers actually looks like? Thanks so much!
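
For reference, this is the mental model I have of num_layers=2 (a minimal sketch of the stacked semantics as I understand them; the cell names, seed, and zero initial states are only illustrative): layer 0 reads the input, layer 1 reads layer 0's hidden output, and each layer keeps its own (h, c) state.

import torch
import torch.nn as nn

torch.manual_seed(0)

cell0 = nn.LSTMCell(3, 3)  # layer 0: reads the input x_t
cell1 = nn.LSTMCell(3, 3)  # layer 1: reads layer 0's hidden output

# each layer keeps its own (h, c) pair
h0, c0 = torch.zeros(1, 3), torch.zeros(1, 3)
h1, c1 = torch.zeros(1, 3), torch.zeros(1, 3)

for x in [torch.randn(1, 3) for _ in range(5)]:
    h0, c0 = cell0(x, (h0, c0))   # layer 0 step
    h1, c1 = cell1(h0, (h1, c1))  # layer 1 step; h1 is the stack's output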

The first way, using num_layers:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

lstm = nn.LSTM(3, 3, 2)  # input dim 3, hidden dim 3, num_layers=2 (a two-layer stacked LSTM)
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

# placeholders to save the nn.LSTM weights so the second snippet can reuse them
weight_ih_0 = None
weight_hh_0 = None
# bias_ih_0 = None
# bias_hh_0 = None

weight_ih_1 = None
weight_hh_1 = None
# bias_ih_1 = None
# bias_hh_1 = None

for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'weight {name} before init: {param}')
        nn.init.xavier_normal_(param)
        print(f'weight {name} after init: {param}')

for name, param in lstm.named_parameters():
    if 'weight_ih_l0' in name:
        weight_ih_0 = param
    if 'weight_hh_l0' in name:
        weight_hh_0 = param
    if 'weight_ih_l1' in name:
        weight_ih_1 = param
    if 'weight_hh_l1' in name:
        weight_hh_1 = param
    
print(f'inputs: {inputs}')

# initialize the hidden state.
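# shape: (num_layers, batch, hidden_size) -- one (h, c) pair per layer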
hidden = (torch.zeros(2, 1, 3),
          torch.zeros(2, 1, 3))

idx = 0
for i in inputs:
    print(f'idx: {idx}')

    # print(f'i: {i}')

    idx += 1

    # Step through the sequence one element at a time.
    # After each step, hidden contains the updated (h, c) states of both layers.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

    print(out)
    
    print("==========")
    
    # print(hidden)

The output is:

weight weight_ih_l0 after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
        [-0.5255,  0.4554,  0.4651],
        [ 0.1428,  0.1414, -0.0291],
        [ 0.1248,  0.3465, -0.5053],
        [ 0.6295, -0.8635, -0.3394],
        [ 0.1072,  0.0786,  0.3427],
        [ 0.5352, -0.2032,  0.8816],
        [ 0.3727, -0.1608, -0.6332],
        [-0.3745,  0.1903, -0.1654],
        [-0.0460, -0.2148,  0.7737],
        [-0.1980, -0.8980, -0.3470],
        [-0.1130,  0.6074,  0.1844]], requires_grad=True)
weight weight_hh_l0 after init: Parameter containing:
tensor([[-0.0719, -0.0122,  0.2626],
        [ 0.3887, -0.3044, -0.4356],
        [-0.8422,  0.2204,  0.1151],
        [ 0.4171,  0.1116, -0.2114],
        [ 0.2061, -0.3204, -0.0983],
        [ 0.4791, -0.5683, -0.3928],
        [-0.3196, -0.1726, -0.0732],
        [-0.3058, -0.5667, -0.0211],
        [-0.0832, -0.3168,  0.1241],
        [-0.4197,  0.0525,  0.0741],
        [ 0.3849,  0.0481, -0.3130],
        [ 0.5788,  0.6312, -0.3627]], requires_grad=True)
weight weight_ih_l1 after init: Parameter containing:
tensor([[ 3.6955e-02,  7.1276e-02, -4.3073e-01],
        [-5.2666e-01,  2.7323e-02,  1.2894e-01],
        [ 3.7136e-01,  3.3969e-01,  1.9601e-01],
        [ 3.5802e-01, -4.3600e-01, -1.7962e-01],
        [ 8.3209e-01,  1.7189e-01,  2.2195e-01],
        [-2.1302e-02, -1.6867e-01, -1.3460e-01],
        [ 1.3446e-01,  1.7708e-01, -5.6676e-01],
        [-2.3697e-01, -2.8254e-02, -2.2063e-01],
        [-2.0928e-01,  3.4973e-01,  3.5858e-04],
        [-5.0565e-01, -6.8619e-02,  3.7702e-01],
        [-9.0796e-02, -1.7238e-01,  4.7868e-01],
        [-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
weight weight_hh_l1 after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
        [ 0.2665, -0.2052, -0.0577],
        [ 0.5493, -0.5094,  0.2167],
        [ 0.1210, -0.3868, -0.2293],
        [-0.0991,  0.6744, -0.0114],
        [-0.0343, -0.6136,  0.4856],
        [ 0.0505,  0.3920, -0.1662],
        [ 0.1163, -0.1296,  0.2505],
        [-0.1373, -0.8803, -0.4666],
        [-0.0230, -0.0346, -0.8451],
        [ 0.2032,  0.1847, -0.0758],
        [ 0.2533,  0.1532,  0.8224]], requires_grad=True)
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279,  1.0156, -0.2020]]), tensor([[-1.2865,  0.8231, -0.6101]]), tensor([[-1.2960, -0.9434,  0.6684]]), tensor([[ 1.1628, -0.3229,  1.8782]])]
idx: 0
tensor([[[ 0.0374, -0.0085, -0.0240]]], grad_fn=<StackBackward>)
==========
idx: 1
tensor([[[ 0.0073, -0.0110, -0.0296]]], grad_fn=<StackBackward>)
==========
idx: 2
tensor([[[-0.0314, -0.0147, -0.0136]]], grad_fn=<StackBackward>)
==========
idx: 3
tensor([[[-0.0458, -0.0118, -0.0254]]], grad_fn=<StackBackward>)
==========
idx: 4
tensor([[[-0.0096, -0.0281, -0.0440]]], grad_fn=<StackBackward>)
==========

The second way, creating two individual LSTMs:

import copy

# note: this snippet runs in the same session as the first one -- it reuses
# torch, nn, and the saved weight_ih_0 / weight_hh_0 / weight_ih_1 / weight_hh_1

torch.manual_seed(1)

lstm = nn.LSTMCell(3, 3)   # first layer: input dim 3, hidden dim 3
lstm2 = nn.LSTMCell(3, 3)  # second layer: input dim 3, hidden dim 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'lstm bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm weight {name} before init: {param}')
        # intended to copy the saved layer-0 weights from the nn.LSTM above
        if 'weight_ih' in name:
            param = copy.deepcopy(weight_ih_0)
            print(f'lstm {name} after init: {param}')
        if 'weight_hh' in name:
            param = copy.deepcopy(weight_hh_0)
            print(f'lstm {name} after init: {param}')
     
for name, param in lstm2.named_parameters():
    if 'bias' in name:
        # print(f'lstm2 bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm2 bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm2 weight {name} before init: {param}')
        # intended to copy the saved layer-1 weights from the nn.LSTM above
        if 'weight_ih' in name:
            param = copy.deepcopy(weight_ih_1)
            print(f'lstm2 {name} after init: {param}')
        if 'weight_hh' in name:
            param = copy.deepcopy(weight_hh_1)
            print(f'lstm2 {name} after init: {param}')

print(f'inputs: {inputs}')

# initialize the hidden state.
hidden = torch.zeros(1, 3)
cell = torch.zeros(1, 3)
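# note: only one (h, c) pair is created here and threaded through both cells in the loop below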

idx = 0
for i in inputs:
    print(f'idx: {idx}')

    idx += 1

    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden state of the second cell.
    hidden, cell = lstm(i.view(1, -1), (hidden, cell))
    # print(hidden.shape)
    hidden, cell = lstm2(hidden, (hidden, cell))

    print(hidden)
    
    print("==========")

And the output is:

lstm weight_ih after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
        [-0.5255,  0.4554,  0.4651],
        [ 0.1428,  0.1414, -0.0291],
        [ 0.1248,  0.3465, -0.5053],
        [ 0.6295, -0.8635, -0.3394],
        [ 0.1072,  0.0786,  0.3427],
        [ 0.5352, -0.2032,  0.8816],
        [ 0.3727, -0.1608, -0.6332],
        [-0.3745,  0.1903, -0.1654],
        [-0.0460, -0.2148,  0.7737],
        [-0.1980, -0.8980, -0.3470],
        [-0.1130,  0.6074,  0.1844]], requires_grad=True)
lstm weight_hh after init: Parameter containing:
tensor([[-0.0719, -0.0122,  0.2626],
        [ 0.3887, -0.3044, -0.4356],
        [-0.8422,  0.2204,  0.1151],
        [ 0.4171,  0.1116, -0.2114],
        [ 0.2061, -0.3204, -0.0983],
        [ 0.4791, -0.5683, -0.3928],
        [-0.3196, -0.1726, -0.0732],
        [-0.3058, -0.5667, -0.0211],
        [-0.0832, -0.3168,  0.1241],
        [-0.4197,  0.0525,  0.0741],
        [ 0.3849,  0.0481, -0.3130],
        [ 0.5788,  0.6312, -0.3627]], requires_grad=True)
lstm2 weight_ih after init: Parameter containing:
tensor([[ 3.6955e-02,  7.1276e-02, -4.3073e-01],
        [-5.2666e-01,  2.7323e-02,  1.2894e-01],
        [ 3.7136e-01,  3.3969e-01,  1.9601e-01],
        [ 3.5802e-01, -4.3600e-01, -1.7962e-01],
        [ 8.3209e-01,  1.7189e-01,  2.2195e-01],
        [-2.1302e-02, -1.6867e-01, -1.3460e-01],
        [ 1.3446e-01,  1.7708e-01, -5.6676e-01],
        [-2.3697e-01, -2.8254e-02, -2.2063e-01],
        [-2.0928e-01,  3.4973e-01,  3.5858e-04],
        [-5.0565e-01, -6.8619e-02,  3.7702e-01],
        [-9.0796e-02, -1.7238e-01,  4.7868e-01],
        [-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
lstm2 weight_hh after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
        [ 0.2665, -0.2052, -0.0577],
        [ 0.5493, -0.5094,  0.2167],
        [ 0.1210, -0.3868, -0.2293],
        [-0.0991,  0.6744, -0.0114],
        [-0.0343, -0.6136,  0.4856],
        [ 0.0505,  0.3920, -0.1662],
        [ 0.1163, -0.1296,  0.2505],
        [-0.1373, -0.8803, -0.4666],
        [-0.0230, -0.0346, -0.8451],
        [ 0.2032,  0.1847, -0.0758],
        [ 0.2533,  0.1532,  0.8224]], requires_grad=True)
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279,  1.0156, -0.2020]]), tensor([[-1.2865,  0.8231, -0.6101]]), tensor([[-1.2960, -0.9434,  0.6684]]), tensor([[ 1.1628, -0.3229,  1.8782]])]
idx: 0
tensor([[-0.0152, -0.0344,  0.0368]], grad_fn=<MulBackward0>)
==========
idx: 1
tensor([[-0.0265, -0.0143,  0.0730]], grad_fn=<MulBackward0>)
==========
idx: 2
tensor([[-0.0210, -0.0033,  0.0529]], grad_fn=<MulBackward0>)
==========
idx: 3
tensor([[-0.0580, -0.0201,  0.1194]], grad_fn=<MulBackward0>)
==========
idx: 4
tensor([[-0.0672, -0.0801,  0.1165]], grad_fn=<MulBackward0>)
==========