Excuse me, I have been studying colah’s blog post, which explains LSTMs.
https://colah.github.io/posts/2015-08-Understanding-LSTMs/
According to the formula in this blog, the final step is h_t = o_t * tanh(C_t), where o_t is the output of a sigmoid layer, so the value of each element of h_t should lie in the interval (-1, 1). But when I use PyTorch's LSTM, it seems that the elements of h_t can be trained to be arbitrarily large or small instead of staying in the interval (-1, 1). What is the gap between the theory in colah’s blog and the PyTorch code?
Thanks a lot!
Can anybody help? I am confused.
From the documentation of PyTorch’s LSTM, I do not see much deviation from Colah’s blog. Do you have a code snippet to show what you mean?
Thank you for your reply.
Yes, the documentation of PyTorch’s LSTM is in accordance with Colah’s blog.
However, the h_n of PyTorch can apparently be trained to take values like 40 or 20, whereas according to the documentation h_t = o_t * tanh(c_t) should be in the interval (-1, 1).
Here is my simple model:
class seqmodel(torch.nn.Module):
    """Stacked LSTM followed by a linear read-out applied at every time step.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        output_size: number of values the linear layer emits per time step.
        num_layer: number of stacked LSTM layers.
    """

    def __init__(self, input_size=2, hidden_size=2, output_size=1, num_layer=2):
        super().__init__()
        self.layer1 = nn.LSTM(input_size, hidden_size, num_layer)
        self.layer2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear read-out of the LSTM output for every time step.

        `x` is laid out as (seq_len, batch, input_size) because the LSTM is
        built without `batch_first`; the result is
        (seq_len, batch, output_size).

        The returned values come from `nn.Linear`, not directly from the
        LSTM, so they are NOT confined to (-1, 1) even though the LSTM hidden
        state is.
        """
        hidden, _ = self.layer1(x)
        # nn.Linear acts on the last dimension and keeps all leading ones, so
        # no flatten/unflatten round trip is needed.
        return self.layer2(hidden)
pcmodel = seqmodel(2, 2, 1, 2)

# One prediction per target: rows j, j+1, j+2 of testx form a length-3
# sequence with batch size 1, and the model output at the last time step
# (index 2) is kept.
y_pred = [
    pcmodel(
        torch.stack([testx[j + k] for k in range(3)], dim=0).unsqueeze(1)
    )[2, 0, 0]
    for j in range(len(testy))
]
Can you give an example of how you can train h_n to hold values outside [-1, 1]?
The code snippet that you have posted does not show this scenario.
I have checked the range of values that h_n takes. So far I do not see any values outside [-1, 1].
import torch
import torch.nn as nn
# Two stacked LSTM layers: 10 input features, 20 hidden units each.
rnn = nn.LSTM(10, 20, 2)

# Scale the random input by 100 so that it is far outside [-1, 1].
inputs = torch.randn(5, 3, 10) * 100
h0 = torch.randn(2, 3, 20)
c0 = torch.randn(2, 3, 20)
output, (hn, cn) = rnn(inputs, (h0, c0))

# Report the value range of every tensor involved.
for label, tensor in (
    ("input", inputs),
    ("h0", h0),
    ("c0", c0),
    ("output", output),
    ("hn", hn),
    ("cn", cn),
):
    print(label, tensor.min(), tensor.max())
input tensor(-227.9721) tensor(278.6736)
h0 tensor(-3.3298) tensor(2.2657)
c0 tensor(-2.9608) tensor(1.7019)
output tensor(-0.4966, grad_fn=<MinBackward1>) tensor(0.4206, grad_fn=<MaxBackward1>)
hn tensor(-0.9645, grad_fn=<MinBackward1>) tensor(0.7616, grad_fn=<MaxBackward1>)
cn tensor(-2.0067, grad_fn=<MinBackward1>) tensor(1.1488, grad_fn=<MaxBackward1>)
Thank you for your reply.
I mean that when I run print(torch.stack(y_pred,0)), some entries can be values like 20.
Each entry is out=output[2,0,0], where the sequence length is 3 and output is the output of my model pcmodel.
Please see the code below:
import numpy as np
import scipy.io as sio
import math
from os.path import dirname, join as pjoin
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
print(torch.__version__)
##T1 = torch.tensor([[1, 2, 3],
## [4, 5, 6],
## [7, 8, 9]])
##T2 = torch.tensor([[10, 20, 30],
## [40, 50, 60],
## [70, 80, 90]])
##T=torch.stack((T1,T2),dim=0)
##print(T)
##pcrnn=nn.LSTMCell(10,20)
##input=torch.randn(2,3,10)
##hx=torch.randn(3,20)
##cx=torch.randn(3,20)
##output=[]
##print(input.size()[0])
##for i in (range(input.size()[0])):
## hx,cx=pcrnn(input[i],(hx,cx))
## output.append(hx)
##output=torch.stack(output,dim=0)
# First training sequence: 16 time steps with 2 features each. Sliding
# windows of 3 consecutive rows are paired with the 13 targets in y1.
input1 = torch.tensor([
    [0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 6.0],
    [1.0, 5.0], [1.0, 6.0], [1.0, 6.0], [1.0, 1.0],
    [0.0, 6.0], [1.0, 8.0], [1.0, 6.0], [1.0, 10.0],
    [0.0, 4.0], [1.0, 8.0], [1.0, 1.0], [0.0, 1.0],
])
y1 = torch.tensor([
    13.0, 12.0, 13.0, 15.0, 20.0, 24.0, 20.0,
    22.0, 13.0, 10.0, 6.0, 11.0, 14.0,
])

# Second training sequence: 28 time steps, paired with the 26 targets in y2.
input2 = torch.tensor([
    [0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [0.0, 0.0],
    [0.0, 1.0], [1.0, 2.0], [1.0, 3.0], [0.0, 5.0],
    [1.0, 10.0], [1.0, 9.0], [1.0, 8.0], [1.0, 6.0],
    [1.0, 14.0], [1.0, 5.0], [1.0, 9.0], [1.0, 5.0],
    [0.0, 20.0], [0.0, 3.0], [0.0, 2.0], [0.0, 2.0],
    [1.0, 2.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0],
    [0.0, 3.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0],
])
y2 = torch.tensor([
    3.0, 6.0, 10.0, 18.0, 24.0, 27.0, 23.0, 28.0, 25.0,
    28.0, 19.0, 34.0, 28.0, 25.0, 7.0, 6.0, 5.0, 4.0,
    3.0, 5.0, 4.0, 3.0, 0.0, 0.0, 0.0, 0.0,
])

# All training targets in one vector.
y = torch.cat([y1, y2], dim=0)
print(y.shape)

# Held-out sequence: 7 time steps, giving 5 windows of length 3.
testx = torch.tensor([
    [1.0, 6.0], [1.0, 10.0], [0.0, 4.0], [1.0, 8.0],
    [1.0, 1.0], [0.0, 1.0], [1.0, 4.0],
])
testy = torch.tensor([10.0, 6.0, 11.0, 14.0, 14.0])
##pcmodel= torch.nn.Sequential(
## torch.nn.LSTM(2,1,2)
##)
class seqmodel(torch.nn.Module):
    """Stacked LSTM followed by a linear read-out applied at every time step.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        output_size: number of values the linear layer emits per time step.
        num_layer: number of stacked LSTM layers.
    """

    def __init__(self, input_size=2, hidden_size=2, output_size=1, num_layer=2):
        super().__init__()
        self.layer1 = nn.LSTM(input_size, hidden_size, num_layer)
        self.layer2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear read-out of the LSTM output for every time step.

        `x` is laid out as (seq_len, batch, input_size) because the LSTM is
        built without `batch_first`; the result is
        (seq_len, batch, output_size).

        The returned values come from `nn.Linear`, not directly from the
        LSTM, so they are NOT confined to (-1, 1) even though the LSTM hidden
        state is.
        """
        hidden, _ = self.layer1(x)
        # nn.Linear acts on the last dimension and keeps all leading ones, so
        # no flatten/unflatten round trip is needed.
        return self.layer2(hidden)
pcmodel = seqmodel(2, 2, 1, 2)

# Predictions of the freshly initialised model: rows j, j+1, j+2 of testx
# form a length-3 sequence with batch size 1, and the model output at the
# last time step (index 2) is kept.
y_pred = [
    pcmodel(
        torch.stack([testx[j + k] for k in range(3)], dim=0).unsqueeze(1)
    )[2, 0, 0]
    for j in range(len(testy))
]
print(torch.stack(y_pred, 0))
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(pcmodel.parameters(), lr=learning_rate)


def _fit_windows(features, targets):
    """Take one optimizer step per 3-row window and return the last loss.

    Window j consists of rows j, j+1 and j+2 of `features`, fed to the model
    as a (3, 1, n_features) tensor. The model output at index [2, 0, 0]
    (the third and last time step; the sequence length is 3, so there are
    three outputs along dim 0) is regressed onto targets[j].
    """
    loss = None
    for j in range(len(targets)):
        window = torch.stack(
            (features[j], features[j + 1], features[j + 2]), dim=0
        )
        prediction = pcmodel(window.unsqueeze(1))[2, 0, 0]
        loss = loss_fn(prediction, targets[j])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss


# NOTE(review): the pasted source lost its indentation; both sequence passes
# are assumed to run inside the epoch loop -- confirm against the original.
for epoch in range(2000):
    loss = _fit_windows(input1, y1)
    if epoch % 100 == 99:
        # Progress report: loss of the last window of the first sequence.
        print(epoch, loss.item())
    loss = _fit_windows(input2, y2)
#y_pred=torch.tensor([item.detach().numpy() for item in out])
## y_pred=y_pred[:,-1,:,:]
## y_pred=y_pred.squeeze(1)
## y_pred=y_pred.squeeze(1)
# Predictions after training: one value per length-3 window of testx, taken
# from the model output at the last time step (index 2).
y_pred = [
    pcmodel(
        torch.stack([testx[j + k] for k in range(3)], dim=0).unsqueeze(1)
    )[2, 0, 0]
    for j in range(len(testy))
]
print(torch.stack(y_pred, 0))
The output of your model comes from the nn.Linear() layer, not directly from the LSTM, so it is not restricted to (-1, 1).
Did I misunderstand your point, or did you misunderstand your model?
OK. Now I understand. Thank you very much. Could you recommend a book that introduces the PyTorch code for LSTMs?
I am a beginner. When I run into programming problems, I spend a lot of time looking for answers on the internet.
Personally, when I face issues, I refer to the PyTorch documentation and search the discussion forums and blogs. I haven’t gone through any books so far.
Thank you for your reply.
So the PyTorch documentation should be the first place to look.