LSTM classification help: stuck for a few days

Hi everyone, I’ve been stuck on this very simple LSTM classification problem for a couple of days. Most of the code is adapted from here:
https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html
and from a very similar adaptation here:

It seems that while the loss drops a bit over 50 epochs, the model keeps predicting the same class.


import glob
import os
import random
import string
import unicodedata

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader

filepath = "../input/sample-country-data/data/data/names/"

#--------utils-----------

# character set kept by unicodeToAscii (same definition as in the linked tutorial)
all_letters = string.ascii_letters + " .,;'"

def findFiles(path): return glob.glob(path)

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles(filepath+'*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

# encode a name as a list of ASCII code points, e.g. "Abe" -> [65, 98, 101]
def str2ascii_arr(msg):
    return [ord(c) for c in msg]

#---------preprocess---------
# convert a batch of (name, country) strings into padded tensors for training
def process_input(x, y):
    name_seqs = [str2ascii_arr(name) for name in x]
    country_seq = [all_categories.index(country) for country in y]
    country_seq = torch.tensor(country_seq)
    name_seqs_len = torch.tensor([len(seq) for seq in name_seqs])

    seq_matrix = torch.zeros((len(name_seqs), name_seqs_len.max())).long()

    for idx, (name_seq, seq_len) in enumerate(zip(name_seqs, name_seqs_len)):
        seq_matrix[idx, :seq_len] = torch.tensor(name_seq).long()

    # sort by length for pack_padded_sequence; the labels must be reordered
    # with the same indices, otherwise inputs and targets end up misaligned
    name_seqs_len, sort_idx = name_seqs_len.sort(0, descending=True)
    seq_matrix = seq_matrix[sort_idx]
    country_seq = country_seq[sort_idx]

    return seq_matrix, country_seq, name_seqs_len
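
# Sanity check I added while debugging (not from the tutorial): run process_input
# on a tiny hand-made batch and confirm names and labels stay aligned after the
# length sort. The two names and the two labels below are arbitrary examples.
_names = ["Abe", "Nakamura"]
_labels = [all_categories[0], all_categories[-1]]
_m, _c, _l = process_input(_names, _labels)
# after the descending length sort "Nakamura" comes first, so the labels should
# flip too: expect _c == tensor([len(all_categories) - 1, 0]) and _l == tensor([8, 3])
print(_m.shape, _c, _l)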

#---------DS---------

class NameDS(Dataset):
    def __init__(self):
        self.y_list = []
        self.x_list = []
        for key, value in category_lines.items():
            self.y_list.extend([key]*len(value))
            self.x_list.extend(value)
        
    def __getitem__(self, index):
        return self.x_list[index], self.y_list[index]
    
    def __len__(self):
        return len(self.x_list)

#---------lstm model---------
class LSTMClassifier(nn.Module):
    def __init__(self, embed_dim_size, hidden_dim_size, vocab_size,output_dim_size):
        super().__init__()
        self.hidden_dim_size = hidden_dim_size

        self.embedding = nn.Embedding(vocab_size,embed_dim_size)
        self.lstm = nn.LSTM(embed_dim_size, hidden_dim_size)
        self.fc = nn.Linear(hidden_dim_size,output_dim_size)
        
        
    def forward(self, input, name_seqs_len):
        batch_size = input.size(0)

        # (batch, seq_len) -> (seq_len, batch), since nn.LSTM defaults to batch_first=False
        input = input.t()
        embeds = self.embedding(input)

        hidden = self.init_hidden(batch_size)
        # lengths are expected to be sorted in descending order (done in process_input)
        seq_matrix = pack_padded_sequence(embeds, name_seqs_len)

        outputs, (ht, ct) = self.lstm(seq_matrix, hidden)

        # final hidden state of the last layer, shape (batch, hidden_dim_size)
        ht = ht[-1]

        outputs = self.fc(ht)
        return outputs
        
    def init_hidden(self, batch_size):
        return (torch.zeros(1, batch_size, self.hidden_dim_size).to(device),
                torch.zeros(1, batch_size, self.hidden_dim_size).to(device))
    
    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 128

# batch_size has to be defined before the DataLoader is built
name_dataset = NameDS()
train_loader = DataLoader(dataset=name_dataset,
                          batch_size=batch_size,
                          shuffle=True)

lstmClassifier = LSTMClassifier(128, 64, 128, n_categories)
loss_function = nn.CrossEntropyLoss()  # takes raw logits, so no log_softmax in the model
optimizer = optim.Adam(lstmClassifier.parameters(), lr=0.001)
torch.backends.cudnn.benchmark = True
lstmClassifier = lstmClassifier.to(device)
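
# Quick shape check (my own addition, not part of the tutorial): push one batch
# through the untrained model and confirm the output is (batch_size, n_categories).
_xb, _yb = next(iter(train_loader))
_xb, _yb, _lb = process_input(_xb, _yb)
# _lb stays on the CPU because pack_padded_sequence expects CPU lengths
print(lstmClassifier(_xb.to(device), _lb).shape)   # expect torch.Size([128, n_categories]) for a full batch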

losses = []

epochs = 50
num_of_batches = None  # set to a small number (e.g. 3) to stop each epoch early while debugging
for epoch in range(epochs):
    total_loss = 0
    for i, (x, y) in enumerate(train_loader, 1):

        if num_of_batches is not None and i == num_of_batches:
            break

        x, y, x_seq_len = process_input(x, y)
        # keep x_seq_len on the CPU: pack_padded_sequence expects CPU lengths
        x, y = x.to(device), y.to(device)

        lstmClassifier.zero_grad()

        output = lstmClassifier(x, x_seq_len)

        loss = loss_function(output, y)
        total_loss += loss.item()  # .data[0] is deprecated and fails on 0-dim tensors

        # print the running average loss every 10 batches
        if i % 10 == 0:
            print('Training loss (running average per batch):', total_loss / i)

        loss.backward()
        optimizer.step()
    avg_epoch_loss = total_loss / np.ceil(len(train_loader.dataset) / batch_size)
    print('Training loss per epoch:', avg_epoch_loss)
    losses.append(avg_epoch_loss)

Evaluation:

x, y = next(iter(train_loader))
lstmClassifier = lstmClassifier.eval()
x, y, x_seq_len = process_input(x, y)
x, y = x.to(device), y.to(device)   # x_seq_len stays on the CPU, as in training
pred = lstmClassifier(x, x_seq_len)
categoryFromOutput(pred[0]), categoryFromOutput(pred[1])
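
To quantify the "keeps predicting the same class" symptom, here is a quick check I could run on one batch (just a sketch reusing process_input and the loader from above; names prefixed with _ are my own temporaries):

_x, _y = next(iter(train_loader))
_x, _y, _len = process_input(_x, _y)
with torch.no_grad():
    _pred = lstmClassifier(_x.to(device), _len).argmax(dim=1)
# how many distinct classes the model actually predicts in this batch,
# and how often each one appears
print(torch.unique(_pred, return_counts=True))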

Please help and thanks in advance!