How to handle exploding/vanishing gradients in PyTorch, and negative loss values

Hi,

I’m trying to modify the character-level RNN classification code to make it fit my application. The dataset I have is pretty huge (4 lakh, i.e. about 400,000, training instances). The code snippets are shown below (I’ve shown only the necessary parts; all helper functions are the same as in the official example).

I initially faced the exploding / vanishing gradient problem described in this issue.

I used the solution given there to clip the gradient in the train() function. But now, I seem to get negative values for loss. What is that supposed to mean?

Also, how is it that with the official example (when I apply it to my dataset) I get loss values that are greater than 1?

class RNN(nn.Module):

    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.Softmax()
  
    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        output = output.add(1e-8)
        output = output.log()

        return output, hidden
        

    def initHidden(self):
        return Variable(torch.zeros(1, self.hidden_size).cuda())

The criterion and the train() function are written as follows:

criterion = nn.NLLLoss().cuda()

learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()
    # print(len(line_tensor.size()))
    if line_tensor.dim() != 0:  # I have random newlines in some cases; this condition handles those
        for i in range(line_tensor.size()[0]):
            output, hidden = rnn(line_tensor[i], hidden)

        loss = criterion(output, category_tensor)
        loss.backward()
        
        # This line is used to prevent the vanishing / exploding gradient problem
        torch.nn.utils.clip_grad_norm(rnn.parameters(), 0.25)
        
        for p in rnn.parameters():
            p.data.add_(-learning_rate, p.grad.data)
            
        return output, loss.data[0]
    else:
        return None, -1
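
(For completeness, here is a minimal standalone sketch of the same clipping step written against the current PyTorch API, using the in-place clip_grad_norm_ and a standard optimizer instead of the manual parameter update above; model, optimizer, x and target below are just placeholders, not part of my code.)

import torch
import torch.nn as nn

model = nn.Linear(10, 2)                                   # stand-in for the RNN
optimizer = torch.optim.SGD(model.parameters(), lr=0.005)
criterion = nn.NLLLoss()

x = torch.randn(1, 10)
target = torch.tensor([0])

optimizer.zero_grad()
output = nn.functional.log_softmax(model(x), dim=1)        # log-probabilities
loss = criterion(output, target)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)   # clip before the update
optimizer.step()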

Training of the model happens here

n_iters = 40000
print_every = 200
plot_every = 200

# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

tp = 0
tn = 0
fp = 0
fn = 0

precision = 0
recall = 0
fmeasure = 0

for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)

    if loss != -1:
        current_loss += loss

        guess, guess_i = categoryFromOutput(output)
        if guess == -1 and guess_i == -1:
            continue
        else:                
            correct = '1' if guess == category else '0 (%s)' % category
            if guess == 'class1' and category == 'class1':
                tp += 1
            elif guess == 'class2' and category == 'class2':
                tn += 1
            elif guess == 'class1' and category == 'class2':
                fp += 1
            else:  # guess == 'class2', category == 'class1'
                fn += 1
            
            if iter % print_every == 0:
                loss = current_loss / print_every
                print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))
                all_losses.append(current_loss / plot_every)
                current_loss = 0

def evaluate(line_tensor):
    hidden = rnn.initHidden()
    if(line_tensor.dim() == 0):
        return line_tensor
    else:
        for i in range(line_tensor.size()[0]):
            output, hidden = rnn(line_tensor[i], hidden)
        return output

def predict(input_line, category, n_predictions=1):
    output = evaluate(Variable(lineToTensor(input_line)).cuda())
    global total
    global indian
    global nonindian

    total += 1
    if(output.dim() != 0):
        topv, topi = output.data.topk(1, 1, True)

        for i in range(0, n_predictions):
            value = topv[0][i]
            category_index = topi[0][i]

            if category_index <= 1:
                if all_categories[category_index] == 'indian':
                    indian += 1
                else:
                    nonindian += 1
                predictions.append([value, all_categories[category_index], category])

Haven’t checked the gradient clipping part, but the negative loss occurs because you need to use LogSoftmax (not Softmax) with NLLLoss. NLLLoss just picks out the entry of its input at the true class index and negates it, so it expects log-probabilities. If the output vector is ŷ = [0.99, 0.01, 0.0] (the output of softmax) and the true class is y = 0, the NLLLoss is simply -0.99, which is negative. If you take the log of that same vector you get ŷ ≈ [-0.01, -4.6, -inf], and the NLLLoss is about 0.01, i.e. small and non-negative, as it should be.
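
A minimal sketch of both cases (the numbers just reproduce the example above; the only real change to your model is swapping nn.Softmax for nn.LogSoftmax):

import torch
import torch.nn as nn

criterion = nn.NLLLoss()
target = torch.tensor([0])                    # true class is index 0

# Feeding softmax probabilities straight into NLLLoss gives a negative loss:
probs = torch.tensor([[0.99, 0.01, 0.0]])     # what nn.Softmax produces
print(criterion(probs, target))               # roughly -0.99

# Feeding log-probabilities gives the expected non-negative loss:
log_probs = torch.log(probs + 1e-8)           # roughly [[-0.01, -4.6, -18.4]]
print(criterion(log_probs, target))           # roughly 0.01

# In your RNN, replace
#     self.softmax = nn.Softmax()
# with
#     self.softmax = nn.LogSoftmax(dim=1)
# and drop the output.add(1e-8) / output.log() lines; or return the raw i2o
# output and use nn.CrossEntropyLoss, which combines LogSoftmax and NLLLoss.

That also explains the loss values greater than 1: with log-probabilities the loss is just -log(p) for the true class, which exceeds 1 whenever p < 1/e ≈ 0.37, so losses well above 1 are normal, especially early in training.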