NLLLoss returns nan for loss every time

I am trying to create a model that correctly assigns a groupname to a new asset based on its hostname. I have built a model trained with NLLLoss, but it always guesses the same groupname and the loss is always nan. The code is below, and I’d appreciate any help troubleshooting it.

import math
import random
import time

import torch
import torch.nn as nn

# char_dict, n_letters, all_categories, and category_lines are built elsewhere
# in the script (their setup is not shown in this post)

def letterToIndex(letter):
    return char_dict.get(letter)

def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

def lineToTensor(hostname):
    tensor = torch.zeros(len(hostname), 1, n_letters)
    for li, letter in enumerate(hostname):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
  
        self.hidden_size = hidden_size
 
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
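
    # The original post does not show forward() or initHidden(), but both are
    # called below, so a tutorial-style pair like this is assumed:
    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)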

n_hidden = 128  # assumed value; the post does not show where n_hidden or rnn are defined
rnn = RNN(n_letters, n_hidden, len(all_categories))

input = letterToTensor('a')
hidden = torch.zeros(1, n_hidden)

output, next_hidden = rnn(input, hidden)

input = lineToTensor('test')
hidden = torch.zeros(1, n_hidden)

output, next_hidden = rnn(input[0], hidden)
print(output)

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

print(categoryFromOutput(output))

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(line)
    return category, line, category_tensor, line_tensor

for i in range(10):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category =', category, '/ line =', line)

criterion = nn.NLLLoss()

learning_rate = 0.005 #If set too high, it might explode; if too low, it might not learn
def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(line_tensor.size(0)):
        output, hidden = rnn(line_tensor[i], hidden)
    loss = criterion(output, category_tensor)
    loss.backward()

    #Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()
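
(Not part of the original code.) If the loss only turns into nan after many updates, a common mitigation is to cap the gradient norm before the manual SGD step, since the comment above learning_rate already hints at explosion. A sketch of the end of train with clipping added, where max_norm=1.0 is an assumed, untuned value:

    loss.backward()
    # Sketch: cap the gradient norm before the update (1.0 is assumed, not tuned)
    torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1.0)
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)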

n_iters = 100000
print_every = 5000
plot_every = 1000

#keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = 'Y' if guess == category else 'N (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))

    #Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0
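
If the nan shows up mid-training, a small guard inside this loop (a sketch; math.isfinite is from the standard library) makes it easy to catch the exact example on which the loss degenerates:

    # Sketch: stop as soon as the loss stops being finite, so the offending
    # hostname is still available in `line` for inspection
    if not math.isfinite(loss):
        print('loss became non-finite at iter %d on line %r' % (iter, line))
        break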

I don’t know how the forward method is implemented, but I assume output contains the log probabilities, while hidden is computed separately.
Could you check output for invalid values such as NaNs and Infs?
If you find some, I would check the inputs for invalid values first; if the inputs look fine, I would check the value range of the output and hidden tensors inside the loop to see whether the model is diverging during the line_tensor loop.
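
For example, a check along these lines would localize where the values go bad (a sketch; it mirrors the per-character loop from train and uses torch.isfinite, which flags both NaNs and Infs):

# Sketch: run the per-character loop and report the first non-finite step
hidden = rnn.initHidden()
for i in range(line_tensor.size(0)):
    output, hidden = rnn(line_tensor[i], hidden)
    if not torch.isfinite(output).all() or not torch.isfinite(hidden).all():
        print('non-finite values at character %d of %r' % (i, line))
        print('output range:', output.min().item(), output.max().item())
        print('hidden range:', hidden.min().item(), hidden.max().item())
        break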

I put a print statement in the forward method to view output and found that the values start off fine but become nan after a certain number of iterations in the training loop. Thanks for the advice! How can I account for the variability in the input? I can’t control what hostnames people enter for their devices.
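
As a starting point on input variability: letterToIndex uses char_dict.get(letter) with no default, so any character that was not in char_dict when it was built comes back as None, and tensor[0][None] = 1 then fills the entire row with ones instead of setting a single one-hot entry. A common fix (a sketch, assuming char_dict maps the known characters to 0 .. len(char_dict) - 1) is to reserve one extra index for unknown characters:

UNK_INDEX = len(char_dict)      # assumed layout: known characters fill 0 .. len-1
n_letters = len(char_dict) + 1  # known characters plus one "unknown" slot

def letterToIndex(letter):
    # Fall back to the shared unknown index instead of returning None
    return char_dict.get(letter, UNK_INDEX)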