Hi! I'm trying to create an RNN from scratch to generate Shakespeare. But it seems like the model isn't learning, even though the weights are being updated. Here's my model definition:
# Defining the model architecture
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)  # input + hidden -> new hidden
        self.i2h1 = nn.Linear(hidden_size, hidden_size)              # second hidden layer
        self.i2o = nn.Linear(input_size + hidden_size, hidden_size)  # input + hidden -> pre-output
        self.i2o1 = nn.Linear(hidden_size, output_size)              # pre-output -> logits over the vocab

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)  # Concatenate input and hidden along the feature dim
        hidden = F.tanh(self.i2h(combined))
        hidden = self.i2h1(hidden)
        output = F.tanh(self.i2o(combined))
        output = self.i2o1(output)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(seq_length, self.hidden_size)  # One row of zeros per timestep (uses the global seq_length)
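For context, the surrounding setup looks roughly like this. The hidden size, optimizer, and learning rate below are approximations of what I'm using, and text holds the full corpus as one string:

# Rough sketch of my setup (exact values may differ)
chars = sorted(set(text))
vocab_size = len(chars)  # 65 for my corpus
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seq_length = 5
hidden_size = 128  # approximate
rnn = RNN(vocab_size, hidden_size, vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=1e-3)  # approximate optimizer/lr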
This is my generation code:
# Generate text
import numpy as np

def generate(rnn, start_letter, length):
    hidden = torch.zeros(1, hidden_size)  # Create an initial hidden state
    input = char_to_one_hot(char_to_ix[start_letter])  # Convert the character to a one-hot encoded tensor
    output_name = start_letter  # Output string
    for i in range(length):
        input = input.to(device)
        hidden = hidden.to(device)
        output, hidden = rnn(input, hidden)  # Getting outputs from the model
        probs = F.softmax(output, dim=1)  # Converting those outputs to probabilities
        probs = probs.cpu().detach().numpy()
        ix = np.random.choice(range(vocab_size), p=probs.ravel())  # Sampling a character from that probability distribution
        letter = ix_to_char[ix]  # Converting the index back to a letter
        output_name += letter  # Appending to the output string
    return output_name
print(generate(rnn, "t", 20))
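The char_to_one_hot helper used above is roughly this (a small sketch; it returns shape (1, vocab_size) so the result can be concatenated with the hidden state):

def char_to_one_hot(ix):
    one_hot = torch.zeros(1, vocab_size)  # One row, one slot per vocab entry
    one_hot[0, ix] = 1.0  # Set the slot for this character's index
    return one_hot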
In my training loop I've created chunks of consecutive characters and one-hot encoded them: each chunk holds seq_length + 1 character indices, so the first seq_length characters are the inputs and the slice shifted by one gives the targets. With seq_length = 5, my input size is (5, 65), where 65 is the vocab size.
The targets are a torch.long tensor, built with a dict that maps each character to its corresponding integer.
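Roughly, the chunking step looks like this (a sketch; the exact stride I use may differ):

# Encode the corpus as integer indices, then slice it into windows of
# seq_length + 1 so that chunk[i + 1] exists for every input position i
data = [char_to_ix[ch] for ch in text]
chunks = [data[i:i + seq_length + 1]
          for i in range(0, len(data) - seq_length, seq_length)]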
This is my training loop:
# Training loop
from tqdm import tqdm

while True:  # Infinite epochs
    lossi = []
    for cx, chunk in tqdm(enumerate(chunks), total=len(chunks)):  # Iterating over each chunk of characters
        hidden = rnn.init_hidden()  # Initializing a hidden state h0
        inputs = torch.zeros(seq_length, vocab_size)  # Will hold the seq_length one-hot vectors, one per letter
        targets = torch.zeros(seq_length, dtype=torch.long)
        for i in range(seq_length):
            input_char = chunk[i]  # Getting a character from the chunk
            inputs[i] = char_to_one_hot(input_char)  # One-hot encoding that character and adding it to the inputs tensor
            targets[i] = chunk[i + 1]  # The next character acts as the target we try to predict
        output, hidden = rnn(inputs, hidden)
        loss = criterion(output.view(-1, vocab_size), targets.view(-1))  # CrossEntropyLoss
        lossi.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if cx % 500 == 0:
            print(f"------------- Iteration {cx} --------------------")
            print(generate(rnn, ix_to_char[chunk[0]], 200))  # Sample 200 characters every 500 chunks
            print("==================================")
            print("Mean Loss:", sum(lossi) / len(lossi))
            print("==================================")
It would be great if someone could take a look, because I'm sure there is a problem somewhere in this code.
Thanks!