Hi! I’m usually an image guy. But I can’t find a job and have too much time, so I thought it could be fun to reimplement the unreasonable efficiency of recurrent neural networks and get an idea of what it is to work with text.
https://karpathy.github.io/2015/05/21/rnn-effectiveness/
By the look of the loss function, I’d say it’s training. But when I sample, I get only one letter, usually only ‘a’. And I’m at loss as to what it is I do wrong.
Do you know where I could find someone who would walk with me through my code step by step? Or alternatively here is my code:
import numpy as np
from matplotlib import pyplot as plt
import string
import torch
import torch.nn as nn
class charrnn(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, device):
super(charrnn, self).__init__()
##batch first or there seem to be problems with the loss
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size, input_size)
self.device = device
self.function = torch.softmax #get the max prob on a character
def forward(self, x):
o, (self.h,self.c) = self.lstm(x, (self.h, self.c))
r = self.fc(o)
r = self.function(r, dim=2)
return r
##creates the hidden state and hidden cell at the begining of working with one particular sequence
def init_hc(self):
self.h = torch.zeros(num_layers, 1, hidden_size, device=self.device)
self.c = torch.zeros(num_layers, 1, hidden_size, device=self.device)
##replaces a char by its number as in string.printable
def line2numbers(line, all_letters):
numbers = []
for l in line:
idx = all_letters.index(l)
numbers.append(idx)
return numbers
##embeds a number into a one-hot encoded vector
def line2tensor(line, n_letters):
line = torch.LongTensor(line)
tensor = nn.functional.one_hot(line, num_classes=n_letters).float()
tensor = tensor.unsqueeze(0) ##get the batch first or there seem to be problems with the loss
return tensor
print('hyperparameters')
epochs = 51 #number of epochs seems excessive by comparison with blogs
lr = 0.001 #learning rate
sequence_len = 25 #train on sequences of 25 characters
num_iterations = 5000 #number of sequences to train on
log_interval = 5 #for visualisation while training
sampling_len = 2000 #how much to sample at the end
print('loading alphabet and data')
all_letters = string.printable
n_letters = len(all_letters)
path = 'data/names/Arabic.txt'
f = open(path,'r')
text = f.read()
f.close()
numbers = line2numbers(text, all_letters)
numbers = list(filter(lambda a: a != 96, numbers)) #skip line return for now
tensor = line2tensor(numbers, n_letters)
##tensor has shape 1 x text lenght x 100 where
#1 is the batch size
#text lenght is the number of characters in the text
#100 is the length of the one-hot encoding: n_letters
print(tensor.shape)
print('network')
input_size = n_letters
hidden_size = 128
num_layers = 1
device = torch.device('cuda')
net = charrnn(input_size, hidden_size, num_layers, device)
net = net.to(device)
net.train()
print('optimizer')
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
lossF = nn.CrossEntropyLoss()#nn.MSELoss()#
lossF = lossF.to(device)
print('running through network')
loss_epoch_accumulator = []
##chose randomly sequences to train on
indexes = np.random.randint(0,len(numbers)-sequence_len-1,num_iterations)
for epoch in range(epochs):
loss_iter_accumulator = []
for i in indexes:
optimizer.zero_grad()
net.init_hc() ##start with a new hidden/cell state
sample = tensor[:,i:i+sequence_len,:] #a sequence to train on
label = tensor[:,i+1:i+sequence_len+1,:] #the target is the sequence shifted by 1
sample = sample.to(device) #predict the next character of the sequence
label = label.to(device)
res = net(sample) #perform the prediction
loss = lossF(res, label) #compute the loss and backward
loss.backward()
optimizer.step()
loss_iter_accumulator.append(loss.item())
##control stuff
loss_epoch_accumulator.append(np.mean(loss_iter_accumulator))
if (epoch%log_interval)==0:
print('epoch {} out of {} elapsed --- loss {}'.format(epoch, epochs,loss_epoch_accumulator[-1]))
print('training completed')
np.save('train_loss.npy', loss_epoch_accumulator)
torch.save(net.state_dict(), 'net_model.pkl')
print('sampling')
##generate a random letter for starters and embed it into a one-hot
gen = [np.random.randint(0,n_letters)]
gen = line2tensor(gen, n_letters)
##prepare the net for inference
net.eval()
net.init_hc() #why should it keep only the last training iteration?
for i in range(sampling_len):
prompt = gen[:,-sequence_len:,:] #taining was on sequence_len, inference on the same seems logical
prompt = prompt.to(device)
res = net(prompt)
last = res[:,-1:,:].cpu() #take only the last character
gen = torch.cat((gen, last), dim=1) #append it to the sequence
print('converting samples to text and writing to file')
seq = []
num = []
for i in range(gen.shape[0]):
line = gen[:,i,:]
val = torch.argmax(line)
num.append(val)
char = string.printable[val]
seq.append(char)
seq = ''.join(seq)
f = open('generated.txt','w')
f.write(seq)
f.close()
plt.plot(loss_epoch_accumulator)
plt.show()