Hi,
Below is my training code:
def train(model, trainloader, optimizer, epochal_train_losses):
    """Run one training epoch and record the mean batch loss.

    Args:
        model: network exposing ``forward(coord, embedding_property=...)``
            returning ``(energy, pred_force)`` and a ``criterion`` loss.
        trainloader: iterable of ``(coord, force, embedding_property)`` batches.
        optimizer: torch optimizer stepping the model's parameters.
        epochal_train_losses: list; the epoch's mean loss is appended to it.
    """
    train_loss = 0.0
    n = 0
    model.train()
    for num, batch in enumerate(tqdm(trainloader)):
        optimizer.zero_grad()
        # Assumes batch layout (coord, force, embedding_property) -- confirm
        # against the DataLoader's collate function.
        coord = batch[0].cuda(non_blocking=True)
        force = batch[1].cuda(non_blocking=True)
        embedding_property = batch[2].cuda(non_blocking=True)
        energy, pred_force = model.forward(coord,
                                           embedding_property=embedding_property)
        batch_loss = model.criterion(pred_force, force)
        batch_loss.backward()
        optimizer.step()
        # Perform L2 Lipschitz check and projection after each optimizer step.
        lipschitz_projection(model, strength=lipschitz_strength)
        # .item() extracts a plain Python float, dropping every reference to
        # the autograd graph so per-batch GPU memory is released promptly
        # (accumulating a tensor, even detached, keeps tensors alive).
        train_loss += batch_loss.item()
        n += 1
    # Guard against an empty loader; mean over all batches otherwise.
    train_loss /= max(n, 1)
    # Record the epoch's mean loss (the parameter was previously unused
    # because this append was commented out).
    epochal_train_losses.append(train_loss)
def train_and_evaluate(model, trainloader, validloader, optimizer, scheduler,
                       start_epoch=1, restart=None):
    """Train for up to ``num_epochs`` epochs, validating after each epoch and
    checkpointing whenever the validation loss improves.

    Args:
        model: network exposing ``forward`` / ``criterion`` (see ``train``).
        trainloader: training batches.
        validloader: validation batches.
        optimizer: torch optimizer.
        scheduler: LR scheduler; ``scheduler.step(val_loss)`` is called each
            epoch (ReduceLROnPlateau-style signature).
        start_epoch: epoch to resume from; overwritten when ``restart`` is set.
        restart: truthy to restore model/optimizer state from the last
            checkpoint in ``log_dir`` before training.
    """
    if restart:
        # Proper two-argument join (previously the path was concatenated by
        # hand inside a single-argument os.path.join call).
        restore_path = os.path.join(log_dir, "last.pth.tar")
        checkpoint = load_checkpoint(restore_path, model, optimizer)
        start_epoch = checkpoint["epoch"]
        logging.info("Restoring from {} current epoch is {}".format(
            restore_path, start_epoch))
    logging.info("starting training from epoch:{}".format(start_epoch))
    best_val = float("inf")
    # BUG FIX: resume from start_epoch; the loop previously hard-coded
    # range(1, ...), silently ignoring the restored epoch on restart.
    for epoch in range(start_epoch, num_epochs + 1):
        ############ training #############
        train(model, trainloader, optimizer, epochal_train_losses)
        ############ validation #############
        n = 0
        val_loss = 0.0
        model.eval()
        # NOTE(review): torch.no_grad() is deliberately NOT used here -- the
        # model apparently differentiates energy w.r.t. coord to predict the
        # forces (hence the reported "element 0 does not require grad" error
        # when no_grad was tried). Confirm against the model's forward().
        # Memory is instead released each iteration by taking .item() and
        # letting the per-batch graph go out of scope.
        for num, batch in enumerate(validloader):
            coord = batch[0].cuda(non_blocking=True)
            force = batch[1].cuda(non_blocking=True)
            embedding_property = batch[2].cuda(non_blocking=True)
            energy, pred_force = model.forward(
                coord, embedding_property=embedding_property)
            batch_loss = model.criterion(pred_force, force)
            # .item() yields a Python float; accumulating the tensor (even
            # detached) keeps per-batch tensors alive and can contribute to OOM.
            val_loss += batch_loss.item()
            n += 1
        print(n)
        val_loss /= max(n, 1)
        epochal_val_losses.append(val_loss)
        scheduler.step(val_loss)
        is_best = val_loss <= best_val
        if epoch % epoch_freq == 0:
            print("Epoch: {: <5} Test: {: <20}".format(epoch, val_loss))
            logging.info("Epoch:{} Val:{}".format(epoch, val_loss))
        if epoch % args.summary_interval == 0:
            # NOTE(review): np.save expects a file path but log_dir looks like
            # a directory -- verify the intended target filename.
            np.save(log_dir, np.c_[epochal_val_losses])
        if is_best:
            best_val = val_loss
            save_checkpoint({'epoch': epoch + 1,
                             'val_loss': val_loss,
                             'state_dict': model.state_dict(),
                             'optim_dict': optimizer.state_dict()},
                            is_best=is_best,
                            checkpoint=log_dir)
I am getting an OOM error. What am I doing wrong? I have tried wrapping the validation loop in `torch.no_grad()`, but it raises "element 0 of tensors does not require grad" (presumably because the model computes forces by differentiating the energy), so I could not use it.