EDIT: Never mind, the issue resolved itself.
I am new to PyTorch, so I am willing to bet that I'm messing something up. The training goes along fine, exactly as with Keras and TensorFlow. In the first epochs the test loss is an RMSE of 1.5, like the training loss. In the following epochs, however, it starts to explode, with values like 700+. I have a feeling it accumulates the loss at each call.
I assume my mistake must be in my training loop and in my understanding of how PyTorch works. Could you help me out?
# Training script: fits `mdnn` with Adam on shuffled mini-batches and, once
# per epoch, reports the training error and the error on the held-out set.
#
# NOTE(review): the indentation of the original snippet was lost. The nesting
# below (one training report and one validation pass per epoch) is
# reconstructed from the statement order -- confirm it matches the intent.
criterion = nn.MSELoss()
nepochs = 15
batchsize = 100
ntrain = len(train_idx)
nbatches = int(np.ceil(ntrain / batchsize))
e = 0


def _rmse(mse_loss):
    """Return the square root of a scalar MSE loss as a NumPy scalar.

    ``criterion`` is ``nn.MSELoss``, so the raw loss is a mean *squared*
    error; the messages printed below are labelled "rmse", hence the root.
    ``reshape(-1)[0]`` reads the single value whether the loss arrives as a
    one-element array or as a 0-dimensional one (plain ``[0]`` raises
    ``IndexError`` on the latter).
    """
    return np.sqrt(mse_loss.data.numpy().reshape(-1)[0])


# Run training loop
optimizer = optim.Adam(mdnn.parameters(), lr=1e-4)
for _ in range(nepochs):
    mdnn.train()
    # Fresh random visiting order of the training samples for this epoch.
    randidx = np.random.permutation(np.arange(ntrain))
    for b in trange(nbatches, desc='epoch {}'.format(e)):
        # Slicing clamps at the end of the array, so the last (possibly
        # shorter) batch needs no explicit upper bound.
        batchidx = randidx[b * batchsize:(b + 1) * batchsize]
        xbatch = [Variable(torch.from_numpy(x[batchidx]), requires_grad=False) for x in x_train]
        ybatch = Variable(torch.from_numpy(y_train[batchidx].astype(np.float32)), requires_grad=False)
        optimizer.zero_grad()  # zero the gradient buffers
        # Call the module itself rather than .forward() so that any
        # registered hooks run as well.
        output = mdnn(xbatch, mol.element)
        # NOTE(review): verify `output` and `ybatch` have identical shapes;
        # a (N, 1) vs (N,) mismatch can silently distort the loss -- confirm.
        loss = criterion(output, ybatch)
        loss.backward()
        optimizer.step()
    # `loss` here is the loss of the last mini-batch of the epoch only, not
    # an average over the whole epoch.
    tqdm.write('Training rmse: {}'.format(_rmse(loss)))
    e += 1

    # Evaluate on the held-out data with the model in eval mode.
    # NOTE(review): this forward pass still records autograd history; on
    # PyTorch >= 0.4 wrapping it in `torch.no_grad()` would avoid that --
    # confirm the targeted PyTorch version before changing it.
    mdnn.eval()
    output = mdnn(x_test_var, mol.element)
    loss = criterion(output, y_test_var)
    print('Validation rmse: {}'.format(_rmse(loss)))