Hi everyone, I've been stuck on this very simple LSTM classification problem for a couple of days. Most of the code is adapted from here:
https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html
and from a very similar adaptation here:
It seems that while the loss drops a bit over 50 epochs, the model keeps predicting the same class.
import glob
import os
import random
import string
import unicodedata

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader
filepath = "../input/sample-country-data/data/data/names/"
#--------utils-----------
def findFiles(path): return glob.glob(path)

all_letters = string.ascii_letters + " .,;'"  # character set from the linked tutorial
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []
# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]
for filename in findFiles(filepath + '*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines
n_categories = len(all_categories)
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]
def str2ascii_arr(msg):
    return [ord(c) for c in msg]
#---------preprocess---------
# convert a batch of (name, country) pairs into padded tensors for training
def process_input(x, y):
    name_seqs = [str2ascii_arr(name) for name in x]
    country_seq = torch.tensor([all_categories.index(country) for country in y])
    name_seqs_len = torch.tensor([len(seq) for seq in name_seqs])
    seq_matrix = torch.zeros((len(name_seqs), name_seqs_len.max())).long()
    for idx, (name_seq, seq_len) in enumerate(zip(name_seqs, name_seqs_len)):
        seq_matrix[idx, :seq_len] = torch.tensor(name_seq).long()
    # sort by length (descending) for pack_padded_sequence, and apply the
    # same permutation to the labels so they still line up with the inputs
    name_seqs_len, sort_idx = name_seqs_len.sort(0, descending=True)
    seq_matrix = seq_matrix[sort_idx]
    country_seq = country_seq[sort_idx]
    return seq_matrix, country_seq, name_seqs_len
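# A quick sanity check of the padding/sorting above on two hypothetical
# names (just an illustrative sketch, not part of the pipeline):
demo_seqs = [str2ascii_arr(n) for n in ['Abe', 'Li']]  # [[65, 98, 101], [76, 105]]
demo_len = torch.tensor([len(s) for s in demo_seqs])
demo_mat = torch.zeros((len(demo_seqs), demo_len.max())).long()
for idx, (s, l) in enumerate(zip(demo_seqs, demo_len)):
    demo_mat[idx, :l] = torch.tensor(s).long()
demo_len, demo_idx = demo_len.sort(0, descending=True)
print(demo_mat[demo_idx])  # tensor([[ 65,  98, 101], [ 76, 105,   0]])
print(demo_idx)            # tensor([0, 1]) -- the labels need this same permutation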
#---------DS---------
class NameDS(Dataset):
    def __init__(self):
        self.y_list = []
        self.x_list = []
        for key, value in category_lines.items():
            self.y_list.extend([key] * len(value))
            self.x_list.extend(value)

    def __getitem__(self, index):
        return self.x_list[index], self.y_list[index]

    def __len__(self):
        return len(self.x_list)
#---------lstm model---------
class LSTMClassifier(nn.Module):
    def __init__(self, embed_dim_size, hidden_dim_size, vocab_size, output_dim_size):
        super().__init__()
        self.hidden_dim_size = hidden_dim_size
        self.embedding = nn.Embedding(vocab_size, embed_dim_size)
        self.lstm = nn.LSTM(embed_dim_size, hidden_dim_size)
        self.fc = nn.Linear(hidden_dim_size, output_dim_size)

    def forward(self, input, name_seqs_len):
        batch_size = input.size(0)
        input = input.t()               # (batch, seq) -> (seq, batch)
        embeds = self.embedding(input)  # (seq, batch, embed)
        hidden = self.init_hidden(batch_size)
        # pack_padded_sequence expects the lengths on the CPU
        packed = pack_padded_sequence(embeds, name_seqs_len.cpu())
        outputs, (ht, ct) = self.lstm(packed, hidden)
        ht = ht[-1]                     # final hidden state of the last layer
        return self.fc(ht)

    def init_hidden(self, batch_size):
        return (torch.zeros(1, batch_size, self.hidden_dim_size).to(device),
                torch.zeros(1, batch_size, self.hidden_dim_size).to(device))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 128  # must be defined before building the DataLoader

name_dataset = NameDS()
train_loader = DataLoader(dataset=name_dataset,
                          batch_size=batch_size,
                          shuffle=True)
lstmClassifier = LSTMClassifier(128, 64, 128, n_categories)  # vocab_size=128 covers ASCII codes
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstmClassifier.parameters(), lr=0.001)
torch.backends.cudnn.benchmark = True
lstmClassifier = lstmClassifier.to(device)
losses = []
epochs = 50
num_of_batches = None  # set to a small int to cut each epoch short
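# Quick shape check before training (a sketch with made-up inputs; the
# exact values are arbitrary): each output row should be one name's logits.
dummy_x = torch.tensor([[72, 105, 0], [76, 105, 0]]).to(device)  # two padded names
dummy_len = torch.tensor([2, 2]).to(device)
print(lstmClassifier(dummy_x, dummy_len).shape)  # torch.Size([2, n_categories])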
for epoch in range(epochs):
    total_loss = 0
    for i, (x, y) in enumerate(train_loader, 1):
        if num_of_batches is not None and i == num_of_batches:
            break
        x, y, x_seq_len = process_input(x, y)
        x, y, x_seq_len = x.to(device), y.to(device), x_seq_len.to(device)
        lstmClassifier.zero_grad()
        output = lstmClassifier(x, x_seq_len)
        loss = loss_function(output, y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        # print the running-average loss every 10 batches
        if i % 10 == 0:
            print('Training loss (running average):', total_loss / i)
    avg_loss = total_loss / np.ceil(len(train_loader.dataset) / batch_size)
    print('Training loss per epoch:', avg_loss)
    losses.append(avg_loss)
Evaluation:
x, y = next(iter(train_loader))
lstmClassifier = lstmClassifier.eval()
x, y, x_seq_len = process_input(x, y)
x, y, x_seq_len = x.to(device), y.to(device), x_seq_len.to(device)
pred = lstmClassifier(x, x_seq_len)
print(categoryFromOutput(pred[0]), categoryFromOutput(pred[1]))
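To check whether the model really collapses onto a single class, I also count the predicted classes over the whole batch (a rough diagnostic sketch reusing pred and y from above):
preds = pred.argmax(dim=1)
print(torch.bincount(preds.cpu(), minlength=n_categories))  # per-class prediction counts
print('batch accuracy:', (preds == y).float().mean().item())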
Please help and thanks in advance!