Hello, I’m trying to resume training after saving my model. I’ve done this several times before and it worked. Now, however, no error is raised, but the first batch is never processed.
Here’s a quick overview of my code; I’ve omitted all the boilerplate to make it more readable:
import gc
import json
import os

import torch
from torch import nn

def train_model(RUN):
    ### Training function
    def train(model, device, dataloader, loss_fn, optimizer, batch_acc):
        optimizer.zero_grad()
        model.train()
        # metrics boilerplate
        for x, batch in enumerate(dataloader):
            print("DEBUG")  # never prints
            # Train logic
            del data_prot, data_met, weights, target, logits
            gc.collect()
        return metrics
    ### Testing function
    def test(model, device, dataloader, loss_fn):
        ...  # test boilerplate

    loss_fn = nn.BCEWithLogitsLoss()
    model = IntegrativeModel(...)  # init boilerplate
    params_to_optimize = [
        {'params': model.parameters()}
    ]
    optim = torch.optim.Adam(params_to_optimize, lr=0.000031, weight_decay=0.0000072)
    path = RUN + "_last.pth"
    best_loss = 5
    if os.path.exists(path):
        torch.set_flush_denormal(True)  # Tried this, nothing changed
        checkpoint = torch.load(path)
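        # note: torch.load without map_location restores each tensor to the
        # device it was originally saved from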
        start = checkpoint["epoch"] + 1
        model.load_state_dict(checkpoint["model"])
        optim.load_state_dict(checkpoint["optimizer"])
        # logging boilerplate
        print(f"Resuming training from epoch {start}")  # this prints
    else:
        # loading pretrained module boilerplate
        model.set_cold()  # this function removes the gradient for the pretrained module
        start = 0
        train_log = {}
        with open(RUN + '.json', 'w') as f:
            json.dump(train_log, f)
    model = nn.DataParallel(model)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    print(f'Selected device: {device}')  # prints
    # dataloader creation boilerplate
    num_epochs = 40
    for epoch in range(start, num_epochs):
        train_log[epoch] = {}
        if epoch == 1:
            model.module.set_warm()  # I tried calling the function after loading
        elif epoch in [10, 15, 20, 25]:
            batch_acc *= 2
            for par in optim.param_groups:
                par["weight_decay"] = par["weight_decay"] / 2
        print("starting epoch...")  # prints
        train_loss = train(
            model, device, train_dataloader, loss_fn, optim, batch_acc)
        test_loss = test(model, device, test_dataloader_3, loss_fn)
        print('\n EPOCH {}/{} \t train loss {} \t val loss {}'.format(
            epoch + 1, num_epochs, train_loss, test_loss))
        # metrics and logging boilerplate
        with open(RUN + '.json', 'w') as f:
            json.dump(train_log, f)
        if test_loss[1] > best_loss:
            best_loss = test_loss[0]
            torch.save({"epoch": epoch,
                        "optimizer": optim.state_dict(),
                        "model": model.module.state_dict()},
                       "{}.pth".format(RUN + "_best"))
        best_loss = test_loss[0]
        torch.save({"epoch": epoch,
                    "optimizer": optim.state_dict(),
                    "model": model.module.state_dict()},
                   "{}.pth".format(RUN + "_last"))
        gc.collect()
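For context, set_cold() and set_warm() just toggle requires_grad on the pretrained submodule. Roughly this (a minimal sketch, assuming set_warm is the inverse of set_cold; the attribute name pretrained is a stand-in for the real submodule):

class IntegrativeModel(nn.Module):
    # ... init and forward omitted ...

    def set_cold(self):
        # freeze the pretrained submodule so only the new layers get gradients
        for p in self.pretrained.parameters():
            p.requires_grad = False

    def set_warm(self):
        # unfreeze the pretrained submodule for fine-tuning
        for p in self.pretrained.parameters():
            p.requires_grad = True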
I don’t know what else to try, and my training is still stuck.
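If it helps to narrow things down, the only other check I can think of is pulling a single batch from the dataloader outside the training loop, and rebuilding the loader with num_workers=0 to rule out a worker-process deadlock. A sketch of that check (train_dataset and batch_size=16 are placeholders from the omitted dataloader boilerplate):

# fetch one batch directly; if this blocks, the hang is in data loading,
# not in the model or in DataParallel
batch = next(iter(train_dataloader))
print("got a batch:", type(batch))

# rebuild the loader without worker processes to rule out a multiprocessing
# deadlock (dataset and batch size come from the omitted boilerplate)
probe_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16,
                                           num_workers=0)
print("got a batch with num_workers=0:", type(next(iter(probe_loader))))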