Should I use multiple Linear layers with ReLU after the LSTM layer?
Below is a model class I wrote that has a stateful LSTM implementation. Everything else seems to be working well, except that, on average, using multiple Linear layers with ReLU after the LSTM layer gives me worse performance than using a single LSTM with one Linear layer and no ReLU.
Please guide me. You can answer purely in theory, since I know the code; I include it here so you can get a quick overview:
import torch
import torch.nn as nn

class MYLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, batch_size, num_layers=4):
        super(MYLSTM, self).__init__()
        # variables
        self.input_size = input_size    # no. of features
        self.hidden_size = hidden_size
        self.batch_size = batch_size    # no. of previous steps in x used to predict the future y
        self.num_layers = num_layers
        # model properties
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout=0.2)
        self.fir = nn.Linear(hidden_size, 32)   # hidden_size -> 32
        self.sec = nn.Linear(32, 4)             # 32 -> 4
        self.last = nn.Linear(4, 1)             # 4 -> 1, single number as output
        # activation func
        self.relu = nn.ReLU()

    def forward(self, input_data, h_0, c_0):
        # we cannot feed input_data directly to the lstm without reshaping, acc. to the pytorch documentation
        sequence_length = len(input_data)  # sequence length aka L, basically no. of rows
        if h_0 is None and c_0 is None:  # runs only when h_0, c_0 are None, i.e. at the beginning of each epoch
            # shape: num_layers, N, hidden size aka Hcell; created on the same device as the input
            h_0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=input_data.device)
            c_0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=input_data.device)
        # keep the state values (h_0, c_0), but detach them from the previous graph
        h_0 = h_0.detach()
        c_0 = c_0.detach()
        input_lstm = input_data.view(sequence_length, self.batch_size, self.input_size)  # L, N, input size aka Hin
        # we feed our defined inputs into the lstm and get lstm_out
        # lstm_out is treated further to get the output that is fed to the Linear layers
        lstm_out, (h_n, c_n) = self.lstm(input_lstm, (h_0, c_0))
        output = lstm_out.view(sequence_length, self.batch_size, self.hidden_size)  # L, N, Hcell
        output = torch.mean(output, dim=1, keepdim=True)  # lstm_out is L, N, Hcell; we average the N results at dim 1 so they combine into 1
        input_data = self.fir(output[:, -1, :])
        input_data = self.relu(input_data)
        input_data = self.sec(input_data)
        input_data = self.relu(input_data)
        input_data = self.last(input_data)
        return input_data, (h_n, c_n)
model = MYLSTM(1,64,batch_size).to(device)
model
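For reference, the simpler variant I am comparing against keeps the same LSTM but replaces the three Linear layers and the ReLUs with a single Linear layer. This is just a sketch so you can see the difference; the class name MYLSTM_SIMPLE and the head name self.out are illustrative, everything else mirrors the code above:

class MYLSTM_SIMPLE(nn.Module):
    def __init__(self, input_size, hidden_size, batch_size, num_layers=4):
        super(MYLSTM_SIMPLE, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout=0.2)
        self.out = nn.Linear(hidden_size, 1)  # single Linear head, no ReLU

    def forward(self, input_data, h_0, c_0):
        sequence_length = len(input_data)
        if h_0 is None and c_0 is None:
            h_0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=input_data.device)
            c_0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=input_data.device)
        h_0, c_0 = h_0.detach(), c_0.detach()
        input_lstm = input_data.view(sequence_length, self.batch_size, self.input_size)
        lstm_out, (h_n, c_n) = self.lstm(input_lstm, (h_0, c_0))
        output = lstm_out.view(sequence_length, self.batch_size, self.hidden_size)
        output = torch.mean(output, dim=1, keepdim=True)
        return self.out(output[:, -1, :]), (h_n, c_n)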