Hello, I'm new to PyTorch and I'm building a word predictor with an LSTM, but my training loss is high and it isn't changing. I've already tried a lot of things: smaller and bigger training sets, more and fewer layers, and varying the rest of the hyperparameters.
Would someone be so kind as to tell me what I'm doing wrong? Thank you very much.
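For reference, I run the script with the flags defined in the argparse block at the bottom, e.g. `python <script>.py --filename book1_LittleWomen.txt --batch_size 128 --lr 0.0005` (the script name is just a placeholder here).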
This is the code I am using:
```python
import torch
import torch.nn as nn
import string
import random
import sys
import unidecode
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import re
import math
import argparse
from argparse import Namespace
import datetime

print("pytorch version", torch.__version__)


def char_tensor(string, all_characters):
    # Encode a string as a LongTensor of indices into all_characters.
    tensor = torch.zeros(len(string)).long()
    for c in range(len(string)):
        tensor[c] = all_characters.index(string[c])
    return tensor
def clean_text(text):
    # Keep only letters and spaces.
    text = text.replace('\n', ' ')
    text = re.sub('[^A-Za-z ]+', '', text)
    return text


def word_to_class(word, word_dict):
    word_class = word_dict[word]
    word_class_t = torch.zeros(1) + word_class
    return word_class_t


def one_hot_encode(y, output_size):
    one_hot = torch.zeros(output_size)
    one_hot[y] = 1
    return one_hot
def create_batches(X_train, y_train, choices, word_dict, output_size, all_characters, batch_size):
    x_batch = []
    y_batch = []
    for i in range(batch_size):
        idx = choices.pop()
        x_batch.append(char_tensor(X_train[idx], all_characters).long())
        num = word_to_class(y_train[idx], word_dict)
        #one_hot = one_hot_encode(int(num), output_size)
        y_batch.append(num)
    xt_batch = torch.stack(x_batch)
    yt_batch = torch.stack(y_batch)
    return xt_batch, yt_batch, choices
def eval_model(model, X_test, y_test, create_batches, word_dict,
               all_characters, output_size, device, batch_size=32):
    criterion = nn.CrossEntropyLoss()
    iterations = math.floor(len(X_test) / batch_size)
    test_data_size = iterations * batch_size
    choices = list(range(test_data_size))
    X_test = X_test[:int(iterations * batch_size)]
    y_test = y_test[:int(iterations * batch_size)]
    epoch_loss = 0
    for idx in range(iterations):
        text = "Iteration {} / {} ".format(idx, iterations)
        print(text, end="\r")
        hidden = model.init_hidden(batch_size)
        x_batch, y_batch, choices = create_batches(X_test, y_test, choices, word_dict, output_size, all_characters, batch_size)
        x_batch = x_batch.float().to(device)
        y_batch = y_batch.type(torch.long)
        output, hidden = model(x_batch, hidden)
        output = output.float()
        y_batch = y_batch.long().to(device)
        loss = criterion(output.squeeze(0), y_batch.squeeze(1))
        epoch_loss += loss.item()
    print("validation loss {}".format(epoch_loss / iterations))
def save_checkpoint(state, is_best, filename='/output/checkpoint.pth.tar'):
    """Save checkpoint if a new best is achieved."""
    if is_best:
        print("=> Saving a new best")
        torch.save(state, filename)  # save checkpoint
    else:
        print("=> Validation Accuracy did not improve")
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim1, hidden_dim2,
                 n_layers, device, drop_prob=0.3):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.device = device
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim1, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc2 = nn.Linear(hidden_dim2, output_size)
        self.relu = nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        #lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.relu(self.fc(out[:, -1, :]))  # use the last timestep only
        out = self.dropout(out)
        out = self.softmax(self.fc2(out))
        #out = out[:,-1]
        #out = self.softmax(out)
        #out = out.view(batch_size, -1)
        return out, hidden

    def init_hidden(self, batch_size):
        # Fresh zeroed (h_0, c_0) tensors on the right device.
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim1).zero_().to(self.device),
                  weight.new(self.n_layers, batch_size, self.hidden_dim1).zero_().to(self.device))
        return hidden
def main(args):
    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    seq_length = args.seq_length
    num_epochs = 500
    batch_size = args.batch_size
    hidden_size1 = args.hidden_size1
    hidden_size2 = args.hidden_size2
    num_layers = args.num_layers
    lr = args.lr

    # Get characters from string.printable
    all_characters = string.printable
    n_characters = len(all_characters)

    # Read large text file (Note: can be any text file, not limited to just names)
    text = unidecode.unidecode(open('data/' + args.filename).read())
    text_clean = clean_text(text)
    text_clean = text_clean.split()
    print("length text ", len(text_clean))
    part = len(text_clean) * args.data_size
    text_clean = text_clean[:int(part)]
    seq_length = args.seq_length

    path = 'runs/LSTM_{}_lr-{}_seq-{}_batch-{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
        args.lr, args.seq_length, args.batch_size)
    writer = SummaryWriter(path)

    # Build fixed-length character windows (data) and the word that follows each (y_data).
    y_data = []
    data = []
    part = ''
    for idx in range(0, len(text_clean) - 1):
        part += (text_clean[idx] + " ")
        new_length = len(part) + len(text_clean[idx + 1])
        if new_length >= seq_length:
            if len(part) > seq_length:
                print("ERROR", len(part))
                print(part)
            part = part[:-1]
            part = part.rjust(seq_length)
            y_data.append(text_clean[idx + 1])
            data.append(part)
            part = ''
            continue
    print("data size ", len(data))
    print("ydata", len(y_data))

    train_size = round(len(data) * 0.1)
    X_train = data[:train_size]
    y_train = y_data[:train_size]
    X_test = data[train_size:]
    y_test = y_data[train_size:]
    print("X_train", len(X_train))
    print("y_train", len(y_train))
    print("X_train", len(X_train[0]))
    print("y_train", y_train[0])

    unique_word = set(text_clean)
    output_size = len(set(text_clean))
    word_dict = {word: idx for idx, word in enumerate(unique_word)}
    iterations = math.floor(len(X_train) / batch_size)
    train_data_size = iterations * batch_size
    choices = list(range(train_data_size))

    #model = RNN(seq_length, hidden_size, num_layers, output_size).to(device)
    print("create model ..")
    model = SentimentNet(output_size, output_size, output_size, hidden_size1,
                         hidden_size2, num_layers, device).to(device)
    print("... done loading")
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    print("=> Starting training")
    print("batch_size", batch_size)
    print("num_layers", num_layers)
    print("lr", lr)
    best_loss = float("inf")
    print("start training")
    print("label amount ", len(y_train))
    X_train = X_train[:int(iterations * batch_size)]
    y_train = y_train[:int(iterations * batch_size)]
    print("use {} to train ".format(device))

    for epoch in range(1, num_epochs + 1):
        epoch_loss = 0
        choices = list(range(train_data_size))
        # cut the training set into batches
        for idx in range(iterations):
            text = "Iteration {} / {} ".format(idx, iterations)
            print(text, end="\r")
            hidden = model.init_hidden(batch_size)
            x_batch, y_batch, choices = create_batches(X_train, y_train, choices, word_dict, output_size, all_characters, batch_size)
            x_batch = x_batch.float().to(device)
            #x_batch = x_batch.unsqueeze(0)
            #print(x_batch.type())
            y_batch = y_batch.type(torch.long)
            output, hidden = model(x_batch, hidden)
            output = output.float()
            y_batch = y_batch.long().to(device)
            #print(y_batch)
            loss = criterion(output.squeeze(0), y_batch.squeeze(1))
            #print("loss ", loss)
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
            epoch_loss += loss.item()
        epoch_loss = epoch_loss / iterations
        print("Epoch {} loss {}".format(epoch, epoch_loss))
        eval_model(model, X_test, y_test, create_batches, word_dict,
                   all_characters, output_size, device)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='LSTM word prediction')
    parser.add_argument('--filename', default="book1_LittleWomen.txt", help='')
    parser.add_argument('--seq_length', default=20, type=int, help='')
    parser.add_argument('--batch_size', default=128, type=int, help='')
    parser.add_argument('--hidden_size1', default=512, type=int, help='')
    parser.add_argument('--hidden_size2', default=1024, type=int, help='')
    parser.add_argument('--num_layers', default=4, type=int, help='')
    parser.add_argument('--data_size', default=0.1, type=float, help='')
    parser.add_argument('--lr', default=0.0005, type=float, help='')
    args = parser.parse_args()
    main(args)
```
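In case the data format matters for diagnosing this, here is a small self-contained sketch of how one training example is encoded (the sample string and values are made up; the encoding mirrors `char_tensor` above):

```python
import string
import torch

all_characters = string.printable
seq_length = 20

# A window of words, left-padded with spaces to exactly seq_length characters.
sample = "the quick brown".rjust(seq_length)

# Encode character-by-character as indices into string.printable,
# which is the same thing char_tensor() does above.
encoded = torch.tensor([all_characters.index(c) for c in sample]).long()
print(encoded.shape)  # torch.Size([20])
```

So after `create_batches()`, `x_batch` has shape `(batch_size, seq_length)` with character indices, and `y_batch` has shape `(batch_size, 1)` holding the `word_dict` class index of the word that follows each window.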