Hello,
I am trying to implement char rnn to predict next character given a character.
I have a doubt in understanding how many LSTM nodes are generated for the first layer in LSTM architecture.
suppose I have a LSTM with below architecture.
class CharRNN(nn.Module):
def __init__(self, tokens, n_hidden=256, n_layers=2,
drop_prob=0.5, lr=0.001):
super().__init__()
self.drop_prob = drop_prob
self.n_layers = n_layers
self.n_hidden = n_hidden
self.lr = lr
# creating character dictionaries
self.chars = tokens
self.int2char = dict(enumerate(self.chars))
self.char2int = {ch: ii for ii, ch in self.int2char.items()}
## TODO: define the LSTM
print("length of self.chars: ", len(self.chars))
self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers,
dropout=drop_prob, batch_first=True)
## TODO: define a dropout layer
self.dropout = nn.Dropout(drop_prob)
## TODO: define the final, fully-connected output layer
self.fc = nn.Linear(n_hidden, len(self.chars))
def forward(self, x, hidden):
''' Forward pass through the network.
These inputs are x, and the hidden/cell state `hidden`. '''
## TODO: Get the outputs and the new hidden state from the lstm
print("entered")
print("forward function: ", x.shape)
r_output, hidden = self.lstm(x, hidden)
## TODO: pass through a dropout layer
out = self.dropout(r_output)
print("out shape: ", out.shape)
# Stack up LSTM outputs using view
# you may need to use contiguous to reshape the output
out = out.contiguous().view(-1, self.n_hidden)
## TODO: put x through the fully-connected layer
out = self.fc(out)
print("out shape after fc: ", out.shape)
# return the final output and the hidden state
return out, hidden
def init_hidden(self, batch_size):
''' Initializes hidden state '''
# Create two new tensors with sizes n_layers x batch_size x n_hidden,
# initialized to zero, for hidden state and cell state of LSTM
weight = next(self.parameters()).data
if (train_on_gpu):
hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
else:
hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
return hidden
n_hidden=512
n_layers=2
batch_size = 128
seq_length = 100
net = CharRNN(chars, n_hidden, n_layers)
CharRNN(
(lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
(dropout): Dropout(p=0.5, inplace=False)
(fc): Linear(in_features=512, out_features=83, bias=True)
)
83 because there are 83 unique characters in the entire novel.
my training data X is text data from Anna Karenina novel. data is shifted over one step for y
.
Consider an example.
x
[[25 8 60 11 45 27 28 73 1 2]
[17 7 20 73 45 8 60 45 73 60]
[27 20 80 73 7 28 73 60 73 65]
[17 73 45 8 27 73 66 8 46 27]
[73 17 60 12 73 8 27 28 73 45]
[66 64 17 17 46 7 20 73 60 20]
[73 76 20 20 60 73 8 60 80 73]
[47 35 43 7 20 17 24 50 37 73]]
y
[[ 8 60 11 45 27 28 73 1 2 2]
[ 7 20 73 45 8 60 45 73 60 45]
[20 80 73 7 28 73 60 73 65 7]
[73 45 8 27 73 66 8 46 27 65]
[17 60 12 73 8 27 28 73 45 27]
[64 17 17 46 7 20 73 60 20 80]
[76 20 20 60 73 8 60 80 73 17]
[35 43 7 20 17 24 50 37 73 36]]
The above input is then one hot encoded using below code.
def get_batches(arr, batch_size, seq_length):
'''Create a generator that returns batches of size
batch_size x seq_length from arr.
Arguments
---------
arr: Array you want to make batches from
batch_size: Batch size, the number of sequences per batch
seq_length: Number of encoded chars in a sequence
'''
batch_size_total = batch_size * seq_length
# total number of batches we can make
n_batches = len(arr)//batch_size_total
#print("n_batches: ", n_batches)
#print("batch_size:", batch_size)
#print("seq_length: ", seq_length)
# Keep only enough characters to make full batches
arr = arr[:n_batches * batch_size_total]
# Reshape into batch_size rows
arr = arr.reshape((batch_size, -1))
# iterate through the array, one sequence at a time
for n in range(0, arr.shape[1], seq_length):
# The features
x = arr[:, n:n+seq_length]
# The targets, shifted by one
y = np.zeros_like(x)
try:
y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
except IndexError:
y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
yield x, y
My questions
- How many LSTM nodes are created in the first layer of the LSTM architecture?
- Can anyone explain me how the input is sent to first layer in LSTM?
My thinking is the first layer should have 100 lstm nodes because the sequence length is 100. If that is correct then I don’t understand how pytorch automatically creates it because I have not mentioned the seq_length while defining LSTM using nn.LSTM
Can anyone please explain me? I am stuck with this for days.