Recurrent Neural Network giving NaN loss after one step

I’m currently re-implementing a simple RNN cell by hand in PyTorch 1.0.1, and my model definition so far looks like this:

import torch
import torch.nn as nn

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # Input->hidden, hidden->hidden and hidden->output weights, plus the two biases
        self.Wxh = nn.Parameter(torch.randn([hidden_size, voc_size]))
        self.Whh = nn.Parameter(torch.randn([hidden_size, hidden_size]))
        self.Why = nn.Parameter(torch.randn([voc_size, hidden_size]))
        self.bh = nn.Parameter(torch.randn([hidden_size, 1]))
        self.by = nn.Parameter(torch.randn([voc_size, 1]))

        # Previous hidden state, carried across calls to forward()
        self.hp = torch.zeros([hidden_size, 1], dtype=torch.float32, device="cuda")

    def forward(self, x):
        # x is (voc_size, T) one-hot columns; h collects the hidden state at every time step
        h = torch.empty([hidden_size, 0], dtype=torch.float32, device="cuda")
        for t in x.split(1, dim=1):
            calc = self.Wxh @ t + self.Whh @ self.hp + self.bh
            h = torch.cat([h, torch.tanh(calc)], dim=1)
            self.hp = torch.tanh(calc)
        # Project every hidden state to vocabulary logits: o is (voc_size, T)
        o = self.Why @ h + self.by
        return o
    
    def zeroH(self):
        # Reset the carried hidden state between sequences
        self.hp = torch.zeros([hidden_size, 1], dtype=torch.float32, device="cuda")
        
rnn = Model().cuda()
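
For reference, hidden_size, voc_size and the batches come from my preprocessing and are defined above the model in the actual script; the setup looks roughly like this (the sizes and the random data here are just placeholders so the snippet is self-contained):

import numpy as np

# Placeholder hyperparameters -- my real values differ
voc_size = 65        # one-hot character vocabulary size
hidden_size = 100    # hidden state size
seq_length = 25      # time steps per training chunk
learning_rate = 1e-1
epochs = 10

# x_batches[i] and y_batches[i] are (voc_size, seq_length) one-hot arrays:
# each column of x is the current character, the matching column of y the next one
x_batches = [np.eye(voc_size)[:, np.random.randint(voc_size, size=seq_length)]
             for _ in range(100)]
y_batches = [np.eye(voc_size)[:, np.random.randint(voc_size, size=seq_length)]
             for _ in range(100)]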

When I train the model with the following simple training loop:

optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):
    for iteration in range(len(x_batches)):
        xb = x_batches[iteration]
        # Targets: collapse the one-hot (voc_size, T) matrix to class indices of shape (T,)
        yb = torch.tensor(y_batches[iteration], dtype=torch.float32, device="cuda").argmax(0)
        print(yb.shape)
        # Transpose the (voc_size, T) output to (T, voc_size), as CrossEntropyLoss expects
        y_pred = torch.t(rnn(torch.tensor(xb, dtype=torch.float32, device="cuda")))
        
        loss = loss_fn(y_pred, yb)
        print(loss)
        
        optimizer.zero_grad()
        # retain_graph=True because self.hp still holds a reference to the previous graph
        loss.backward(retain_graph=True)
        
        optimizer.step()

I get NaN for the loss after a single step of gradient descent. Is this due to an inherent design flaw in the model, or to bad input data?
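
In case it helps narrow things down, a quick debugging check (just a sketch, assuming the problem is numerical rather than in the data) is to look at whether the parameters or the forward output are already non-finite:

with torch.no_grad():
    o = rnn(torch.tensor(x_batches[0], dtype=torch.float32, device="cuda"))
    print("output has NaN:", torch.isnan(o).any().item(),
          "max |o|:", o.abs().max().item())
    for name, p in rnn.named_parameters():
        print(name, "has NaN:", torch.isnan(p).any().item(),
              "max |p|:", p.abs().max().item())
rnn.zeroH()  # reset the hidden state that the debug forward pass just modified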

Edit: I forgot exploding gradients were a thing… After clipping my “o” values the model now has finite gradients, but I’m not sure whether they are vanishing or whether my model is wrong, because the parameters no longer change after one step of SGD.
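
For reference, by clipping the “o” value I mean something like the clamp below (the bounds are made up); the more standard fix would be to clip the gradient norm instead, and printing the per-parameter gradient norms should show directly whether they are vanishing:

o = rnn(torch.tensor(xb, dtype=torch.float32, device="cuda"))
y_pred = torch.t(torch.clamp(o, -10.0, 10.0))  # clamp the raw logits before the loss
loss = loss_fn(y_pred, yb)

optimizer.zero_grad()
loss.backward(retain_graph=True)

# Standard alternative: clip the global gradient norm before the update
torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5.0)

# Gradient norms close to zero would point at vanishing gradients rather than a wrong model
for name, p in rnn.named_parameters():
    print(name, "grad norm:", p.grad.norm().item())

optimizer.step()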