I am developing an LSTM to generate new sequences of words, but the output I get is the same every time.
It looks like this:
i am "–said "come "edwin "–said "edwin "edwin "'tis "'tis "–said "come "–said "'tis "come "coming "'tis "edwin "'tis "come "coming "come "come "'tis "–said "coming "coming "edwin "coming "come "coming "edwin "–said "–said "edwin "come "'tis "–said "'tis "'tis "edwin "'tis "edwin "'tis "–said "coming "coming "come "–said "'tis "–said "–said "'tis "'tis "come "'tis "come "edwin "come "coming "'tis "coming "coming "coming "'tis "'tis "coming "edwin
I don't know what I've done wrong, and I can't find any clues online.
Here's my code:
model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class LSTM(nn.Module):
    def __init__(self, vocab_size, seq_size, embedding_dim, hidden_dim):
        super(LSTM, self).__init__()
        self.vocab = vocab_size
        self.hidden_size = hidden_dim
        self.seq_size = seq_size
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout_1 = nn.Dropout(p=0.2, inplace=False)
        self.lstm2 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout_2 = nn.Dropout(p=0.2, inplace=False)
        self.decoder = nn.Linear(in_features=hidden_dim, out_features=vocab_size)
        self.activation = nn.Softmax(dim=0)

    def zero_state(self, batch_size):
        return (
            (torch.zeros(1, batch_size, self.hidden_size),
             torch.zeros(1, batch_size, self.hidden_size)),
            (torch.zeros(1, batch_size, self.hidden_size),
             torch.zeros(1, batch_size, self.hidden_size)),
        )

    def forward(self, sentence, prev_state_1, prev_state_2):
        # print("sentence shape ", np.shape(sentence))
        # print("prev shape ", np.shape(prev_state_1))
        x = self.encoder(sentence)
        x, hidden_cell_1 = self.lstm(x, prev_state_1)
        x = self.dropout_1(x)
        x, hidden_cell_2 = self.lstm2(x, prev_state_2)
        x = self.dropout_2(x)
        x = self.decoder(x)
        x = F.log_softmax(x, dim=1)
        return x, hidden_cell_1, hidden_cell_2
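To sanity-check the model on its own, I ran a quick forward pass with made-up sizes (embedding_dim equal to hidden_dim, like in my flags, so the second LSTM's input size lines up):

import torch
from model import LSTM

m = LSTM(vocab_size=100, seq_size=10, embedding_dim=32, hidden_dim=32)
state_1, state_2 = m.zero_state(batch_size=4)   # initial (h, c) for each layer
x = torch.randint(0, 100, (4, 10))              # (batch, seq) of token ids
out, h1, h2 = m(x, state_1, state_2)
print(out.shape)                                # torch.Size([4, 10, 100]) -- (batch, seq, vocab)

The shapes come out as I expect, so I don't think the model definition itself is the part that's crashing.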
train.py
from model import LSTM
import numpy as np
from dataset import *
import torch
import torch.nn as nn
from argparse import Namespace

flags = Namespace(
    seq_size=100,
    batch_size=64,
    embedding_size=256,
    lstm_size=256,
    gradients_norm=5,
    checkpoint_path='checkpoint',
    num_epochs=200
)


def train():
    int_to_vocab, vocab_to_int, n_vocab, in_text = get_data_from_file(flags.batch_size, flags.seq_size)
    x_batch, y_batch = create_batch(in_text, flags.batch_size, flags.seq_size)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = LSTM(n_vocab, flags.seq_size, flags.embedding_size, flags.lstm_size).to(device)
    # optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.7)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_function = nn.CrossEntropyLoss()
    for e in range(flags.num_epochs):
        print(f'epoch #{e}: ', end="")
        batches = get_batches(x_batch, y_batch, flags.batch_size, flags.seq_size)
        (state_h_1, state_c_1), (state_h_2, state_c_2) = model.zero_state(flags.batch_size)
        state_h_1 = state_h_1.to(device)
        state_c_1 = state_c_1.to(device)
        state_h_2 = state_h_2.to(device)
        state_c_2 = state_c_2.to(device)
        for i, (x, y) in enumerate(batches):
            model.train()
            optimizer.zero_grad()
            x = torch.tensor(x, dtype=torch.int64).to(device)
            # print("x shape {} ".format(np.shape(x)))
            # one-hot encode the targets
            tmp = []
            for index, el in enumerate(y):
                tmp.append(np.zeros(n_vocab))
                tmp[index][y[index]] = 1
            # print(y)
            y = tmp
            y = torch.tensor(y, dtype=torch.int64).to(device)
            logits, (state_h_1, state_c_1), (state_h_2, state_c_2) = model(x, (state_h_1, state_c_1), (state_h_2, state_c_2))
            # print("logits shape {} , y shape {}".format(np.shape(logits), np.shape(y)))
            loss = loss_function(logits, y)
            # detach the hidden states so gradients don't flow across batches
            state_h_1 = state_h_1.detach()
            state_c_1 = state_c_1.detach()
            state_h_2 = state_h_2.detach()
            state_c_2 = state_c_2.detach()
            loss_value = loss.item()
            loss.backward()
            _ = torch.nn.utils.clip_grad_norm_(model.parameters(), flags.gradients_norm)
            optimizer.step()
            print(f'batch #{i}:\tloss={loss.item():.10f}')
    return model


if __name__ == "__main__":
    model = train()
    torch.save(model, 'save/model')
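I also checked in isolation that the loss call accepts the shapes I'm feeding it: a batch of x is (64, 50) after create_batch, so the logits are (64, 50, n_vocab), and the one-hot y is (64, n_vocab). The vocab size below is made up:

import torch
import torch.nn as nn

n_vocab = 10
logits = torch.randn(64, 50, n_vocab)            # what the model returns for a (64, 50) batch
y = torch.zeros(64, n_vocab, dtype=torch.int64)  # stand-in for the one-hot targets built above
print(nn.CrossEntropyLoss()(logits, y).item())   # runs without raising an error

Since it doesn't raise anything, I assumed the shapes were fine.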
dataset.py
import numpy as np
import random
import torch
from collections import Counter
import string


def get_data_from_file(batch_size, seq_size):
    filename = "corpus.txt"
    raw_text = open(filename, 'r', encoding='utf-8').read()
    text = raw_text.lower()
    # text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.split()
    chars = sorted(list(set(text)))
    # word_counts = Counter(text)
    # sorted_vocab = dict((c, i) for i, c in enumerate(chars))
    # int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    # vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    vocab_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_vocab = dict((i, c) for i, c in enumerate(chars))
    n_vocab = len(int_to_vocab)
    np.save("save/int_to_vocab", int_to_vocab)
    np.save("save/vocab_to_int", vocab_to_int)
    print('Vocabulary size', n_vocab)
    print("total words: ", len(text))
    int_text = np.array([vocab_to_int[w] for w in text])
    # encod_text = one_hot_encod(int_text)
    return int_to_vocab, vocab_to_int, n_vocab, int_text


def create_batch(array, batch_size, seq_size):
    num_batches = int(len(array) / (seq_size * batch_size))
    array = array[:num_batches * batch_size * seq_size]
    # x_batch = np.reshape(array, (num_batches, -1))
    x_batch = np.array(np.split(array, int(len(array) / seq_size)))
    y_batch = x_batch[:, -1]
    # print(y_batch)
    x_batch = np.delete(x_batch, np.s_[::2], 1)
    print("x shape {} , y shape {} ".format(np.shape(x_batch), np.shape(y_batch)))
    return x_batch, y_batch


def get_batches(x, y, batch_size, seq_size):
    num_batches = len(x)
    for i in range(0, num_batches, batch_size):
        yield x[i:i+batch_size], y[i:i+batch_size]
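To see what create_batch actually produces, here's a toy run (an array of 12 ints, batch_size=2, seq_size=3):

import numpy as np
from dataset import create_batch

x, y = create_batch(np.arange(12), batch_size=2, seq_size=3)
# x is [[1], [4], [7], [10]] -- every other column of each length-3 chunk removed
# y is [2, 5, 8, 11]         -- the last element of each chunk

So each x row keeps every other element of a chunk, and each y is that chunk's last element.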
pred.py
import torch
import numpy as np

model = torch.load("save/model")
vocab_to_int = np.load("save/vocab_to_int.npy", allow_pickle='TRUE').item()
int_to_vocab = np.load("save/int_to_vocab.npy", allow_pickle='TRUE').item()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.eval()
top_k = 5
words = ['i', 'am']

(state_h_1, state_c_1), (state_h_2, state_c_2) = model.zero_state(1)
state_h_1 = state_h_1.to(device)
state_c_1 = state_c_1.to(device)
state_h_2 = state_h_2.to(device)
state_c_2 = state_c_2.to(device)

for w in words:
    ix = torch.tensor([[vocab_to_int[w]]]).to(device)
    output, (state_h_1, state_c_1), (state_h_2, state_c_2) = model(ix, (state_h_1, state_c_1), (state_h_2, state_c_2))

_, top_ix = torch.topk(output[0], k=top_k)
choices = top_ix.tolist()
choice = np.random.choice(choices[0])
words.append(int_to_vocab[choice])

for _ in range(100):
    ix = torch.tensor([[choice]], dtype=torch.int64).to(device)
    output, (state_h_1, state_c_1), (state_h_2, state_c_2) = model(ix, (state_h_1, state_c_1), (state_h_2, state_c_2))
    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])
    words.append(int_to_vocab[choice])

print(' '.join(words))
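For reference, this is how I understand the torch.topk call on the (1, vocab) slice (toy numbers):

import torch

out = torch.tensor([[0.1, 0.5, 0.2, 0.9, 0.3]])
vals, idx = torch.topk(out, k=3)
print(idx.tolist())  # [[3, 1, 4]] -- indices of the 3 largest entries

so np.random.choice(choices[0]) picks uniformly among the top-5 indices, ignoring their relative probabilities.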