PyTorch LSTM returns NaN for MSELoss

My model is:

import torch
import torch.nn as nn
from torch.autograd import Variable


class BaselineModel(nn.Module):
    def __init__(self, feature_dim=5, hidden_size=5, num_layers=2, batch_size=32):
        super(BaselineModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size=feature_dim,
                            hidden_size=hidden_size, num_layers=num_layers)

    def forward(self, x, hidden):
        lstm_out, hidden = self.lstm(x, hidden)
        return lstm_out, hidden

    def init_hidden(self, batch_size):
        # Allocate (h_0, c_0) with the same dtype/device as the model's parameters.
        hidden = Variable(next(self.parameters()).data.new(
            self.num_layers, batch_size, self.hidden_size))
        cell = Variable(next(self.parameters()).data.new(
            self.num_layers, batch_size, self.hidden_size))
        return (hidden, cell)
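
For reference, this is roughly how I exercise the model on its own (a minimal sketch with random data; the sequence length of 7 is arbitrary). Since nn.LSTM defaults to batch_first=False, the input shape is (seq_len, batch, feature_dim):

model = BaselineModel()
hidden = model.init_hidden(batch_size=13)
x = torch.randn(7, 13, 5)  # (seq_len, batch, feature_dim)
out, hidden = model(x, hidden)
print(out.size())  # torch.Size([7, 13, 5])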

Training looks like:


import torch.optim as optim

train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=BATCH_SIZE, shuffle=True, **params)

model = BaselineModel(batch_size=BATCH_SIZE)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
loss_fn = torch.nn.MSELoss(reduction='sum')

for epoch in range(250):

    # hidden = (torch.zeros(2, 13, 5),
    #           torch.zeros(2, 13, 5))
    # model.hidden = hidden
    for i, data in enumerate(train_loader):
        hidden = model.init_hidden(13)
        inputs = data[0]
        outputs = data[1]

        print('inputs', inputs.size())
        # print('outputs', outputs.size())

        # optimizer.zero_grad()
        model.zero_grad()

        # print('inputs', inputs)
        pred, hidden = model(inputs, hidden)

        loss = loss_fn(pred, outputs)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        print('Epoch: ', epoch, '\ti: ', i, '\tLoss: ', loss)
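
One thing I did double-check is the loss scale: with reduction='sum' the loss grows with the number of elements in the batch, whereas 'mean' divides by the element count. A quick standalone sketch of the relationship (random tensors, shapes arbitrary):

pred = torch.randn(7, 13, 5)
target = torch.randn(7, 13, 5)
loss_sum = torch.nn.MSELoss(reduction='sum')(pred, target)
loss_mean = torch.nn.MSELoss(reduction='mean')(pred, target)
print(torch.allclose(loss_sum, loss_mean * pred.numel()))  # True

I don't know whether that alone explains the NaN, but with lr=0.01 it does make the effective step size depend on the batch and sequence size.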

I already have gradient clipping in place, which seems to be the usual recommendation for this kind of blow-up. But even after the first step, I get:

Epoch:  0       i:  0   Loss:  tensor(nan, grad_fn=<MseLossBackward>)
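
In case it helps with diagnosis, this is the kind of instrumentation I can drop into the inner loop to find where the NaN first shows up (a sketch; inputs and model are the same objects as in the loop above):

# Locate the first NaN in the pipeline.
torch.autograd.set_detect_anomaly(True)  # backward() will then report the op that produced a NaN

hidden = model.init_hidden(13)
print('NaN in inputs:        ', torch.isnan(inputs).any().item())
print('NaN in initial hidden:', torch.isnan(hidden[0]).any().item(),
      torch.isnan(hidden[1]).any().item())
pred, hidden = model(inputs, hidden)
print('NaN in predictions:   ', torch.isnan(pred).any().item())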

If anyone is interested in magic internet points, I also posted this question here.