LSTM generating same word over and over

I am developing an LSTM to generate new sequences of words, but the output I get is the same handful of words repeated every time.
It looks like this:
i am "–said "come "edwin "–said "edwin "edwin "'tis "'tis "–said "come "–said "'tis "come "coming "'tis "edwin "'tis "come "coming "come "come "'tis "–said "coming "coming "edwin "coming "come "coming "edwin "–said "–said "edwin "come "'tis "–said "'tis "'tis "edwin "'tis "edwin "'tis "–said "coming "coming "come "–said "'tis "–said "–said "'tis "'tis "come "'tis "come "edwin "come "coming "'tis "coming "coming "coming "'tis "'tis "coming "edwin

I don't know what I've done wrong, and I can't find any clues online.
Here's my code.
model.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np 

class LSTM(nn.Module):
    def __init__(self, vocab_size, seq_size, embedding_dim, hidden_dim):
        super(LSTM, self).__init__()
        self.vocab = vocab_size
        self.hidden_size = hidden_dim
        self.seq_size = seq_size

        self.encoder = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout_1 = nn.Dropout(p=0.2, inplace=False)
        self.lstm2 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout_2 = nn.Dropout(p=0.2, inplace=False)
        self.decoder = nn.Linear(in_features=hidden_dim, out_features=vocab_size)
        self.activation = nn.Softmax(dim=0)

    def zero_state(self, batch_size):
        return (
            (torch.zeros(1, batch_size, self.hidden_size),
             torch.zeros(1, batch_size, self.hidden_size)),
            (torch.zeros(1, batch_size, self.hidden_size),
             torch.zeros(1, batch_size, self.hidden_size)),
        )

    def forward(self, sentence, prev_state_1, prev_state_2):
        #print("sentence shape ", np.shape(sentence))
        #print("prev shape ", np.shape(prev_state_1))
        x = self.encoder(sentence)
        x, hidden_cell_1 = self.lstm(x, prev_state_1)
        x = self.dropout_1(x)
        x, hidden_cell_2 = self.lstm2(x, prev_state_2)
        x = self.dropout_2(x)
        x = self.decoder(x)
        x = F.log_softmax(x, dim=1)
        return x, hidden_cell_1, hidden_cell_2
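
For reference, this is how I expect the shapes to flow through the model. A toy sanity check, with made-up sizes instead of my real vocabulary:

# Toy shape check for model.py (made-up sizes, not my real corpus)
import torch
from model import LSTM

vocab_size, seq_len, batch = 1000, 50, 64
net = LSTM(vocab_size, seq_len, embedding_dim=256, hidden_dim=256)
state_1, state_2 = net.zero_state(batch)

x = torch.randint(0, vocab_size, (batch, seq_len))   # (batch, seq) of word indices
out, state_1, state_2 = net(x, state_1, state_2)
print(out.shape)   # I expect (batch, seq, vocab_size) since batch_first=True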

train.py

from model import LSTM
import numpy as np 
from dataset import *
import torch 
import torch.nn as nn
from argparse import Namespace

flags = Namespace(
    seq_size=100,
    batch_size=64,
    embedding_size=256,
    lstm_size=256,
    gradients_norm=5,
    checkpoint_path='checkpoint',
    num_epochs=200
)

def train():
    int_to_vocab, vocab_to_int, n_vocab, in_text = get_data_from_file(flags.batch_size, flags.seq_size)
    x_batch, y_batch = create_batch(in_text, flags.batch_size, flags.seq_size)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = LSTM(n_vocab, flags.seq_size, flags.embedding_size, flags.lstm_size).to(device)

    #optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.7)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_function = nn.CrossEntropyLoss()

    for e in range(flags.num_epochs):
        print(f'epoch #{e}: ',end="")
        batches = get_batches(x_batch,y_batch,flags.batch_size, flags.seq_size)
        (state_h_1, state_c_1),(state_h_2, state_c_2) = model.zero_state(flags.batch_size)
        state_h_1 = state_h_1.to(device)
        state_c_1 = state_c_1.to(device)
        state_h_2 = state_h_2.to(device)
        state_c_2 = state_c_2.to(device)
        
        for i,(x, y) in enumerate(batches):
            model.train()
            optimizer.zero_grad()

            x = torch.tensor(x, dtype=torch.int64).to(device)
            #print("x shape {} ".format(np.shape(x)))
            
            tmp = []
            for index, el in enumerate(y):
                tmp.append(np.zeros(n_vocab))
                tmp[index][y[index]] = 1
            #print(y)
            y = tmp
            y = torch.tensor(y, dtype=torch.int64).to(device)
            logits, (state_h_1, state_c_1),(state_h_2, state_c_2) = model(x, (state_h_1, state_c_1),(state_h_2, state_c_2))
            #print("logits shape {} , y shape {}".format(np.shape(logits),np.shape(y)))
            loss = loss_function(logits, y)

            state_h_1 = state_h_1.detach()
            state_c_1 = state_c_1.detach()
            state_h_2 = state_h_2.detach()
            state_c_2 = state_c_2.detach()

            loss_value = loss.item()

            loss.backward()
            _ = torch.nn.utils.clip_grad_norm_(model.parameters(), flags.gradients_norm)
            optimizer.step()
        print(f'batch #{i}:\tloss={loss.item():.10f}')
    return model 

if __name__ == "__main__":
    model = train()
    torch.save(model,'save/model')

dataset.py

import numpy as np 
import random
import torch 
from collections import Counter
import string

def get_data_from_file(batch_size, seq_size):
    filename = "corpus.txt"
    raw_text = open(filename, 'r', encoding='utf-8').read()
    text = raw_text.lower()
    #text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.split()
    
    chars = sorted(list(set(text)))


    #word_counts = Counter(text)
    #sorted_vocab = dict((c, i) for i, c in enumerate(chars))
    #int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    #vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    vocab_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_vocab = dict((i, c) for i, c in enumerate(chars))
    n_vocab = len(int_to_vocab)
    np.save("save/int_to_vocab",int_to_vocab)
    np.save("save/vocab_to_int",vocab_to_int)
    print('Vocabulary size', n_vocab)
    print("total charact : ", len(text))

    int_text = np.array([vocab_to_int[w] for w in text])
    #encod_text = one_hot_encod(int_text)
    return int_to_vocab, vocab_to_int, n_vocab,int_text


def create_batch(array, batch_size, seq_size):
    num_batches = int(len(array) / (seq_size * batch_size))
    array = array[:num_batches * batch_size * seq_size]
    #x_batch = np.reshape(array, (num_batches, -1))
    x_batch = np.array(np.split(array, int(len(array) / seq_size)))
    y_batch = x_batch[:, -1]
    #print(y_batch)
    x_batch = np.delete(x_batch, np.s_[::2], 1)
    print("x shape {} , y shape {} ".format(np.shape(x_batch), np.shape(y_batch)))
    return x_batch, y_batch
    

def get_batches(x, y, batch_size, seq_size):
    num_batches = len(x)
    for i in range(0, num_batches, batch_size):
        yield x[i:i+batch_size], y[i:i+batch_size]
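
To make the batching concrete, this is what create_batch produces on a toy array (seq_size=4, batch_size=1, made-up numbers just for illustration):

# Toy run of create_batch (illustration only, not my real data)
import numpy as np
from dataset import create_batch

arr = np.arange(12)   # pretend these are word indices
x_batch, y_batch = create_batch(arr, batch_size=1, seq_size=4)
# x_batch -> [[1 3] [5 7] [9 11]]   (every other column of each seq_size chunk removed)
# y_batch -> [3 7 11]               (the last element of each chunk)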

pred.py

import torch
import numpy as np 

model = torch.load("save/model")
vocab_to_int = np.load("save/vocab_to_int.npy", allow_pickle=True).item()
int_to_vocab = np.load("save/int_to_vocab.npy", allow_pickle=True).item()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()

top_k = 5
words = ['i','am']

(state_h_1, state_c_1),(state_h_2, state_c_2) = model.zero_state(1)
state_h_1 = state_h_1.to(device)
state_c_1 = state_c_1.to(device)
state_h_2 = state_h_2.to(device)
state_c_2 = state_c_2.to(device)

for w in words:
    ix = torch.tensor([[vocab_to_int[w]]]).to(device)
    output, (state_h_1, state_c_1),(state_h_2, state_c_2) = model(ix, (state_h_1, state_c_1),(state_h_2, state_c_2))

_, top_ix = torch.topk(output[0], k=top_k)
choices = top_ix.tolist()
choice = np.random.choice(choices[0])

words.append(int_to_vocab[choice])

for _ in range(100):
    ix = torch.tensor([[choice]],dtype=torch.int64).to(device)
    output, (state_h_1, state_c_1),(state_h_2, state_c_2) = model(ix, (state_h_1, state_c_1),(state_h_2, state_c_2))

    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])
    words.append(int_to_vocab[choice])

print(' '.join(words))
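
And to be explicit about how I pick each next word, this is the sampling step on its own, with a dummy tensor standing in for the real model output:

# The top-k sampling step in isolation (dummy tensor, not the real model output)
import torch
import numpy as np

top_k = 5
output = torch.randn(1, 1, 20)              # stand-in for the model output: (batch=1, seq=1, vocab=20)
_, top_ix = torch.topk(output[0], k=top_k)  # indices of the k largest scores along the last dim
choices = top_ix.tolist()
choice = np.random.choice(choices[0])       # pick one of the top-k uniformly at random
print(choice)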