I am creating a two-layer LSTM in two ways, shown in the two code snippets below (they use the same initial parameters and the same inputs). Can anyone tell me why the outputs are not the same, if num_layers works as explained in What is num_layers in RNN module? ? And if num_layers does not work as described in that link, can anyone tell me what an LSTM-based model built with num_layers actually looks like? Thanks so much!
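To make clear what I am trying to reproduce, here is a minimal sketch of how I understand a num_layers=2 LSTM per the link above: at every time step the second layer consumes the first layer's hidden output. The names cell0/cell1 and the assumption that each layer keeps its own (h, c) pair are mine, not taken from the docs:
import torch
import torch.nn as nn

cell0 = nn.LSTMCell(3, 3)  # layer 0: consumes the raw input x_t
cell1 = nn.LSTMCell(3, 3)  # layer 1: consumes layer 0's hidden output
h0, c0 = torch.zeros(1, 3), torch.zeros(1, 3)  # layer-0 hidden/cell state (my assumption)
h1, c1 = torch.zeros(1, 3), torch.zeros(1, 3)  # layer-1 hidden/cell state (my assumption)
for x in [torch.randn(1, 3) for _ in range(5)]:
    h0, c0 = cell0(x, (h0, c0))   # bottom layer step
    h1, c1 = cell1(h0, (h1, c1))  # top layer step; h1 would be the out of the stacked nn.LSTM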
The first way using num_layers:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

lstm = nn.LSTM(3, 3, 2)  # input dim is 3, hidden dim is 3, 2 layers
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

weight_ih_0 = None
weight_hh_0 = None
# bias_ih_0 = None
# bias_hh_0 = None
weight_ih_1 = None
weight_hh_1 = None
# bias_ih_1 = None
# bias_hh_1 = None

for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'weight {name} before init: {param}')
        nn.init.xavier_normal_(param)
        print(f'weight {name} after init: {param}')

for name, param in lstm.named_parameters():
    if 'weight_ih_l0' in name:
        weight_ih_0 = param
    if 'weight_hh_l0' in name:
        weight_hh_0 = param
    if 'weight_ih_l1' in name:
        weight_ih_1 = param
    if 'weight_hh_l1' in name:
        weight_hh_1 = param

print(f'inputs: {inputs}')

# initialize the hidden and cell states (first dim = num_layers = 2)
hidden = (torch.zeros(2, 1, 3),
          torch.zeros(2, 1, 3))

idx = 0
for i in inputs:
    print(f'idx: {idx}')
    # print(f'i: {i}')
    idx += 1
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    print(out)
    print("==========")
    # print(hidden)
The output is:
weight weight_ih_l0 after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
[-0.5255, 0.4554, 0.4651],
[ 0.1428, 0.1414, -0.0291],
[ 0.1248, 0.3465, -0.5053],
[ 0.6295, -0.8635, -0.3394],
[ 0.1072, 0.0786, 0.3427],
[ 0.5352, -0.2032, 0.8816],
[ 0.3727, -0.1608, -0.6332],
[-0.3745, 0.1903, -0.1654],
[-0.0460, -0.2148, 0.7737],
[-0.1980, -0.8980, -0.3470],
[-0.1130, 0.6074, 0.1844]], requires_grad=True)
weight weight_hh_l0 after init: Parameter containing:
tensor([[-0.0719, -0.0122, 0.2626],
[ 0.3887, -0.3044, -0.4356],
[-0.8422, 0.2204, 0.1151],
[ 0.4171, 0.1116, -0.2114],
[ 0.2061, -0.3204, -0.0983],
[ 0.4791, -0.5683, -0.3928],
[-0.3196, -0.1726, -0.0732],
[-0.3058, -0.5667, -0.0211],
[-0.0832, -0.3168, 0.1241],
[-0.4197, 0.0525, 0.0741],
[ 0.3849, 0.0481, -0.3130],
[ 0.5788, 0.6312, -0.3627]], requires_grad=True)
weight weight_ih_l1 after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
weight weight_hh_l1 after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279, 1.0156, -0.2020]]), tensor([[-1.2865, 0.8231, -0.6101]]), tensor([[-1.2960, -0.9434, 0.6684]]), tensor([[ 1.1628, -0.3229, 1.8782]])]
idx: 0
tensor([[[ 0.0374, -0.0085, -0.0240]]], grad_fn=<StackBackward>)
==========
idx: 1
tensor([[[ 0.0073, -0.0110, -0.0296]]], grad_fn=<StackBackward>)
==========
idx: 2
tensor([[[-0.0314, -0.0147, -0.0136]]], grad_fn=<StackBackward>)
==========
idx: 3
tensor([[[-0.0458, -0.0118, -0.0254]]], grad_fn=<StackBackward>)
==========
idx: 4
tensor([[[-0.0096, -0.0281, -0.0440]]], grad_fn=<StackBackward>)
==========
The second way, creating two separate LSTMCells:
import copy

torch.manual_seed(1)

lstm = nn.LSTMCell(3, 3)   # input dim is 3, hidden dim is 3
lstm2 = nn.LSTMCell(3, 3)  # input dim is 3, hidden dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'lstm bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm weight {name} before init: {param}')
        if 'weight_ih' in name:
            param = copy.deepcopy(weight_ih_0)
            print(f'lstm {name} after init: {param}')
        if 'weight_hh' in name:
            param = copy.deepcopy(weight_hh_0)
            print(f'lstm {name} after init: {param}')

for name, param in lstm2.named_parameters():
    if 'bias' in name:
        # print(f'lstm2 bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm2 bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm2 weight {name} before init: {param}')
        if 'weight_ih' in name:
            param = copy.deepcopy(weight_ih_1)
            print(f'lstm2 {name} after init: {param}')
        if 'weight_hh' in name:
            param = copy.deepcopy(weight_hh_1)
            print(f'lstm2 {name} after init: {param}')

print(f'inputs: {inputs}')

# initialize the hidden and cell states
hidden = torch.zeros(1, 3)
cell = torch.zeros(1, 3)

idx = 0
for i in inputs:
    print(f'idx: {idx}')
    idx += 1
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden state.
    hidden, cell = lstm(i.view(1, -1), (hidden, cell))
    # print(hidden.shape)
    hidden, cell = lstm2(hidden, (hidden, cell))
    print(hidden)
    print("==========")
And the output is:
lstm weight_ih after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
[-0.5255, 0.4554, 0.4651],
[ 0.1428, 0.1414, -0.0291],
[ 0.1248, 0.3465, -0.5053],
[ 0.6295, -0.8635, -0.3394],
[ 0.1072, 0.0786, 0.3427],
[ 0.5352, -0.2032, 0.8816],
[ 0.3727, -0.1608, -0.6332],
[-0.3745, 0.1903, -0.1654],
[-0.0460, -0.2148, 0.7737],
[-0.1980, -0.8980, -0.3470],
[-0.1130, 0.6074, 0.1844]], requires_grad=True)
lstm weight_hh after init: Parameter containing:
tensor([[-0.0719, -0.0122, 0.2626],
[ 0.3887, -0.3044, -0.4356],
[-0.8422, 0.2204, 0.1151],
[ 0.4171, 0.1116, -0.2114],
[ 0.2061, -0.3204, -0.0983],
[ 0.4791, -0.5683, -0.3928],
[-0.3196, -0.1726, -0.0732],
[-0.3058, -0.5667, -0.0211],
[-0.0832, -0.3168, 0.1241],
[-0.4197, 0.0525, 0.0741],
[ 0.3849, 0.0481, -0.3130],
[ 0.5788, 0.6312, -0.3627]], requires_grad=True)
lstm2 weight_ih after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
lstm2 weight_hh after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279, 1.0156, -0.2020]]), tensor([[-1.2865, 0.8231, -0.6101]]), tensor([[-1.2960, -0.9434, 0.6684]]), tensor([[ 1.1628, -0.3229, 1.8782]])]
idx: 0
tensor([[-0.0152, -0.0344, 0.0368]], grad_fn=<MulBackward0>)
==========
idx: 1
tensor([[-0.0265, -0.0143, 0.0730]], grad_fn=<MulBackward0>)
==========
idx: 2
tensor([[-0.0210, -0.0033, 0.0529]], grad_fn=<MulBackward0>)
==========
idx: 3
tensor([[-0.0580, -0.0201, 0.1194]], grad_fn=<MulBackward0>)
==========
idx: 4
tensor([[-0.0672, -0.0801, 0.1165]], grad_fn=<MulBackward0>)
==========