Hi,
I’m trying to modify the character-level RNN classification code to fit my application. The dataset I have is pretty huge (4 lakh, i.e. 400,000, training instances). The code snippets are shown below (I’ve shown only the necessary parts; all helper functions are the same as in the official example).
I initially faced the exploding / vanishing gradient problem described in this issue.
I used the solution given there to clip the gradients in the train() function. But now I seem to get negative values for the loss. What is that supposed to mean?
Also, how is it that the official example (when I apply it to my dataset) gives me loss values greater than 1?
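To sanity-check what values NLLLoss can produce, I ran this small standalone snippet (just a sketch with made-up probabilities, nothing to do with my data). It shows the loss is -log(p) of the true class, so it should never be negative, but it goes above 1 as soon as that probability drops below 1/e ≈ 0.37:

import math

import torch
import torch.nn as nn
from torch.autograd import Variable

nll = nn.NLLLoss()

# Log-probabilities over two classes for one example,
# i.e. softmax output (0.2, 0.8) followed by log()
log_probs = Variable(torch.log(torch.Tensor([[0.2, 0.8]])))
target = Variable(torch.LongTensor([0]))  # true class has probability 0.2

loss = nll(log_probs, target)
print(loss.data[0])    # -log(0.2) ~= 1.61: greater than 1, but not negative
print(-math.log(0.2))  # same value computed by hand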
import math
import time

import torch
import torch.nn as nn
from torch.autograd import Variable

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.Softmax()

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        # Add a small epsilon before the log to avoid log(0)
        output = output.add(1e-8)
        output = output.log()
        return output, hidden

    def initHidden(self):
        return Variable(torch.zeros(1, self.hidden_size).cuda())
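(Side note: I believe the softmax + add(1e-8) + log sequence above could be written with nn.LogSoftmax instead, which should give the same log-probabilities without needing the epsilon. A sketch of that variant, untested on my data:)

class RNNLogSoftmax(nn.Module):
    # Sketch: same network as above, but the i2o output goes through
    # LogSoftmax, producing log-probabilities for NLLLoss in one step.
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNLogSoftmax, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.softmax(self.i2o(combined))
        return output, hidden

    def initHidden(self):
        return Variable(torch.zeros(1, self.hidden_size).cuda())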
The criterion and the train() function are written as follows:
criterion = nn.NLLLoss().cuda()
learning_rate = 0.005  # If you set this too high, it might explode. If too low, it might not learn.

def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()
    rnn.zero_grad()

    # I have random new lines in some cases. This condition is to handle those
    if line_tensor.dim() != 0:
        for i in range(line_tensor.size()[0]):
            output, hidden = rnn(line_tensor[i], hidden)
        loss = criterion(output, category_tensor)
        loss.backward()

        # This line is used to prevent the vanishing / exploding gradient problem
        torch.nn.utils.clip_grad_norm(rnn.parameters(), 0.25)

        # Manual SGD step: move each parameter against its gradient
        for p in rnn.parameters():
            p.data.add_(-learning_rate, p.grad.data)

        return output, loss.data[0]
    else:
        return None, -1
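(For what it’s worth, I think the manual update loop above is just plain SGD, so it could presumably be replaced with torch.optim. A sketch; I’m still using the manual version:)

import torch.optim as optim

optimizer = optim.SGD(rnn.parameters(), lr=learning_rate)

# Inside train(), after loss.backward() and the gradient clipping,
# this would replace the manual parameter loop:
#     optimizer.step()
# and optimizer.zero_grad() would replace rnn.zero_grad() at the top.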
Training of the model happens here:
n_iters = 40000
print_every = 200
plot_every = 200

# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

# Confusion-matrix counts and metrics (class1 is treated as the positive class)
tp = 0
tn = 0
fp = 0
fn = 0
precision = 0
recall = 0
fmeasure = 0
for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)
    if loss != -1:
        current_loss += loss
        guess, guess_i = categoryFromOutput(output)
        if guess == -1 and guess_i == -1:
            continue
        else:
            correct = '1' if guess == category else '0 (%s)' % category
            # Update the confusion-matrix counts (class1 = positive)
            if guess == 'class1' and category == 'class1':
                tp += 1
            elif guess == 'class2' and category == 'class2':
                tn += 1
            elif guess == 'class1' and category == 'class2':
                fp += 1
            else:  # guess == 'class2' and category == 'class1'
                fn += 1

        if iter % print_every == 0:
            avg_loss = current_loss / print_every
            print('%d %d%% (%s) %.4f %s / %s %s' % (
                iter, iter / n_iters * 100, timeSince(start), avg_loss, line, guess, correct))
            all_losses.append(current_loss / plot_every)
            current_loss = 0
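The precision / recall / fmeasure variables above get filled in after the loop; for completeness, here is roughly how I compute them from the counts (standard definitions, as a sketch with division-by-zero guards):

# Standard precision / recall / F1 from the confusion-matrix counts,
# guarding against division by zero when a class was never predicted
if tp + fp > 0:
    precision = float(tp) / (tp + fp)
if tp + fn > 0:
    recall = float(tp) / (tp + fn)
if precision + recall > 0:
    fmeasure = 2 * precision * recall / (precision + recall)

print('precision = %.4f, recall = %.4f, f-measure = %.4f' % (precision, recall, fmeasure))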
def evaluate(line_tensor):
    hidden = rnn.initHidden()
    if line_tensor.dim() == 0:
        return line_tensor
    else:
        for i in range(line_tensor.size()[0]):
            output, hidden = rnn(line_tensor[i], hidden)
        return output
def predict(input_line, category, n_predictions=1):
    output = evaluate(Variable(lineToTensor(input_line)).cuda())
    global total
    global indian
    global nonindian
    total += 1
    if output.dim() != 0:
        # Get the top prediction (value, index) for this line
        topv, topi = output.data.topk(1, 1, True)
        for i in range(0, n_predictions):
            value = topv[0][i]
            category_index = topi[0][i]
            if category_index <= 1:
                if all_categories[category_index] == 'indian':
                    indian += 1
                else:
                    nonindian += 1
                predictions.append([value, all_categories[category_index], category])
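I then call predict() over my held-out set along these lines (a sketch; test_data here is a hypothetical list of (line, category) pairs, and total / indian / nonindian / predictions are module-level globals):

total = 0
indian = 0
nonindian = 0
predictions = []

# test_data is assumed: a list of (line, true_category) pairs
for line, category in test_data:
    predict(line, category)

print('total = %d, predicted indian = %d, predicted non-indian = %d'
      % (total, indian, nonindian))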