I followed karpathy’s tutorial to build a character level model using an RNN. However, the loss doesn’t seem to decrease. And after training, the model spits out gibberish. Here is the code for the model.
class LSTMWriter(N.Module):
def __init__(self,vocab_size,n_layers=1):
super(LSTMWriter,self).__init__()
self.vocab_size=vocab_size
self.n_layers=n_layers
self.embedding=N.Embedding(self.vocab_size,10)
self.lstm=N.GRU(10,10,n_layers,batch_first=True)
self.dropout=N.Dropout(0.1)
self.linear=N.Linear(10,self.vocab_size)
self.linear2=N.Linear(self.vocab_size,self.vocab_size)
def init_hidden(self,batch_size=1):
h=A.Variable(torch.zeros(self.n_layers,batch_size,10))
c=A.Variable(torch.zeros(self.n_layers,batch_size,10))
return h
def forward(self,sequence,hidden):
embedding=self.embedding(sequence)
recurrent=embedding
for i in range(self.n_layers):
recurrent,hidden=self.lstm(recurrent,hidden)
flattened=self.linear(recurrent.contiguous().view(recurrent.size(0)*recurrent.size(1),recurrent.size(2)))
#flattened=self.dropout(flattened)
flat=flattened.view(recurrent.size(0),recurrent.size(1),flattened.size(1))
return flat,hidden
The code for training is:
dataset=Dataset(storage["raw_data"],storage["word_dict"])
vocab_size=len(storage["word_dict"])
word_dict=storage["word_dict"]
rev_dict=storage["rev_dict"]
del storage
data=dataset.get_dataset()
loss_fn=N.CrossEntropyLoss()
model=LSTMWriter(vocab_size,4)
optimizer=O.Adam(model.parameters(),lr=0.1)
for i in range(10):
hidden=model.init_hidden(1000)
batch_generator=get_batches(data[:,:200])
total_loss=0
b=1
while b:
model.zero_grad()
try:
_X,_y=next(batch_generator)
except:
break
train=A.Variable(torch.from_numpy(_X))
targets=A.Variable(torch.from_numpy(_y).contiguous().view(-1))
out,hidden=model.forward(train,A.Variable(hidden.data))
loss=loss_fn(out.contiguous().view(-1,vocab_size),targets)
loss.backward()
optimizer.step()
total_loss+=loss.data[0]
if b % 10 ==0:
logging.info("Epoch :{} Batches :{} Loss :{}".format(i,b,total_loss))
total_loss=0
b+=1
generate_text(data[0][3:3+10],model)
And for text generation is:
def generate_text(X,model):
hidden=model.init_hidden(1)
input_=numpy.array([X])
_,hidden=model.forward(A.Variable(torch.from_numpy(input_)),hidden)
gen_str=""
for i in range(100):
out,hidden=model.forward(A.Variable(torch.from_numpy(input_)),hidden)
out=out[:,-1].data.exp()
char=torch.multinomial(out,1)[0][0]
gen_str+=rev_dict[char]
input_=numpy.append(input_.squeeze(),char)
input_=numpy.array([input_[1:]],dtype=numpy.long)
print(gen_str)
After training for a considerable time, the output is complete gibberish
ft t t ttelhnFteuml aasd n nrms eoc eivwcwi td do i ,yactwobe ipn9 lethihy hep to tgeifrroiov
Any Ideas what might be going wrong?