I’m currently rewriting a simple RNN cell in PyTorch 1.0.1, and my model definition so far looks like this:
    import torch
    import torch.nn as nn

    class Model(nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            # Input-to-hidden, hidden-to-hidden, and hidden-to-output weights
            self.Wxh = nn.Parameter(torch.randn([hidden_size, voc_size]))
            self.Whh = nn.Parameter(torch.randn([hidden_size, hidden_size]))
            self.Why = nn.Parameter(torch.randn([voc_size, hidden_size]))
            # Hidden and output biases, stored as column vectors
            self.bh = nn.Parameter(torch.randn([hidden_size, 1]))
            self.by = nn.Parameter(torch.randn([voc_size, 1]))
            # Previous hidden state, carried across forward() calls
            self.hp = torch.zeros([hidden_size, 1], dtype=torch.float32, device="cuda")

        def forward(self, x):
            h = torch.empty([hidden_size, 0], dtype=torch.float32, device="cuda")
            # Step through the sequence one column (time step) at a time
            for t in x.split(1, dim=1):
                calc = self.Wxh @ t + self.Whh @ self.hp + self.bh
                h = torch.cat([h, torch.tanh(calc)], dim=1)
                self.hp = torch.tanh(calc)
            # Project every hidden state to output scores in one matmul
            o = self.Why @ h + self.by
            return o

        def zeroH(self):
            # Reset the carried hidden state between sequences
            self.hp = torch.zeros([hidden_size, 1], dtype=torch.float32, device="cuda")

    rnn = Model().cuda()
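Before training, I wanted to convince myself the recurrence itself is right. My understanding is that nn.RNNCell computes tanh(W_ih x + b_ih + W_hh h + b_hh), so here is a minimal cross-check sketch against it (it assumes hidden_size and voc_size are defined as above; since my model uses column vectors, everything gets transposed):

    # Minimal sketch: mirror my weights into an nn.RNNCell and compare one step.
    cell = nn.RNNCell(voc_size, hidden_size).cuda()
    with torch.no_grad():
        cell.weight_ih.copy_(rnn.Wxh)           # (hidden_size, voc_size)
        cell.weight_hh.copy_(rnn.Whh)           # (hidden_size, hidden_size)
        cell.bias_ih.copy_(rnn.bh.squeeze(1))   # fold my single bias into bias_ih
        cell.bias_hh.zero_()
        x_t = torch.randn(1, voc_size, device="cuda")        # one step, batch of 1
        h_prev = torch.zeros(1, hidden_size, device="cuda")
        h_ref = cell(x_t, h_prev)                            # reference step
        h_mine = torch.tanh(rnn.Wxh @ x_t.t() + rnn.Whh @ h_prev.t() + rnn.bh)
        print(torch.allclose(h_ref.t(), h_mine, atol=1e-6))  # should print True

If that prints True, the recurrence matches the reference cell, which would point the problem elsewhere.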
When I go to train the model, I use the following simple trainer:
    optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()
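As I understand it, nn.CrossEntropyLoss applies log_softmax internally, so it expects raw logits of shape (N, C) and a target of shape (N,) holding class indices; that is why I transpose the network output and argmax the one-hot labels below. A quick check of that shape convention (just a sketch with random data):

    logits = torch.randn(3, voc_size, device="cuda")           # N=3 samples, C=voc_size classes
    targets = torch.randint(0, voc_size, (3,), device="cuda")  # class indices, dtype long
    print(loss_fn(logits, targets))                            # a finite scalar loss

The loop itself: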
    for epoch in range(epochs):
        for iteration in range(len(x_batches)):
            xb = x_batches[iteration]
            # Turn the one-hot labels into class indices for CrossEntropyLoss
            yb = torch.tensor(y_batches[iteration], dtype=torch.float32, device="cuda").argmax(0)
            print(yb.shape)
            # Transpose the output from (voc_size, seq_len) to (seq_len, voc_size)
            y_pred = torch.t(rnn(torch.tensor(xb, dtype=torch.float32, device="cuda")))
            loss = loss_fn(y_pred, yb)
            print(loss)
            optimizer.zero_grad()
            # retain_graph=True because self.hp still references the previous graph
            loss.backward(retain_graph=True)
            optimizer.step()
I get NaN for the loss after a single step of gradient descent. Is this due to an inherent design flaw in the model, or to bad input data?
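In case it helps narrow this down, this is the kind of check I plan to run next (a throwaway sketch; report is a hypothetical helper, not part of the model):

    def report(name, t):
        # Flag NaNs and extreme magnitudes in a tensor
        print(name, "nan:", torch.isnan(t).any().item(),
              "max abs:", t.abs().max().item())

    rnn.zeroH()
    out = rnn(torch.tensor(x_batches[0], dtype=torch.float32, device="cuda"))
    report("logits", out)                   # inspect the outputs before any step
    for name, p in rnn.named_parameters():
        report(name, p)                     # inspect each weight matrix and bias

That should at least tell me whether the NaN first appears in the forward pass or only after the optimizer step.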