I would like to save an in-memory checkpoint so that I can resume training later. My question is: am I saving everything necessary to continue training with the following code? Also, is it necessary to save the loss?
DEVICE = 'cuda'


def train(model, optimizer, loss, train_loader, test_loader, epochs):
    """Train ``model`` for ``epochs`` epochs, keeping an in-memory checkpoint
    of the best (model, optimizer, loss) triple by test accuracy.

    Args:
        model: the network to train (assumed to already live on DEVICE --
            there is no ``model.to(DEVICE)`` here; TODO confirm at call site).
        optimizer: optimizer bound to ``model.parameters()``.
        loss: loss criterion (e.g. ``nn.CrossEntropyLoss``).
        train_loader, test_loader: data loaders for training / evaluation.
        epochs: number of epochs to run.

    Returns:
        (best_model, best_optimizer, best_loss) — deep copies taken at the
        epoch with the highest test accuracy. All three are ``None`` if
        test accuracy never exceeds 0.0.
    """
    best_model, best_optimizer, best_loss, best_accuracy = None, None, None, 0.0
    for epoch in range(epochs):
        for x, y in train_loader:
            optimizer.zero_grad()
            x = x.to(DEVICE)
            y = y.to(DEVICE)
            y_hat = model(x)
            output = loss(y_hat, y)
            output.backward()
            optimizer.step()
        # Evaluate once per epoch; eval()/train() toggle dropout/batch-norm.
        model.eval()
        test_acc, test_loss = validate(model, test_loader, loss)
        model.train()
        # Save in-memory checkpoint when test accuracy improves.
        if test_acc > best_accuracy:
            best_accuracy = test_acc
            # BUGFIX: deepcopy model and optimizer TOGETHER (as one tuple).
            # Copied separately, deepcopy(optimizer) duplicates the model's
            # parameters privately, so the copied optimizer would no longer
            # update best_model's parameters when training resumes. A single
            # deepcopy call shares its memo, preserving the param references.
            best_model, best_optimizer = copy.deepcopy((model, optimizer))
            # Copying the loss is only necessary if the criterion holds
            # learnable state; nn.CrossEntropyLoss is stateless, so this is
            # harmless but not required to resume training.
            best_loss = copy.deepcopy(loss)
    return best_model, best_optimizer, best_loss
def validate(model, test_loader, loss_fn):
    """Evaluate ``model`` on ``test_loader``.

    NOTE(review): this is a stub — it currently returns ``None``, but the
    caller in ``train`` unpacks ``test_acc, test_loss = validate(...)``,
    so the real implementation must return an ``(accuracy, loss)`` pair.
    """
    # no_grad disables autograd bookkeeping during evaluation
    with torch.no_grad():
        pass
And then, when I call the train loop, my intention is to continue training like this:
# Build the model, optimizer and loss criterion.
model = nn.Sequential(
    # ...
)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# CrossEntropyLoss holds no learnable state, so saving/copying it is not
# strictly necessary to resume training — keeping a reference suffices.
loss = nn.CrossEntropyLoss()
model2, optimizer2, loss2 = train(model, optimizer, loss,
                                  train_loader, test_loader, epochs=150)
# Continue training from the best in-memory checkpoint.
# NOTE(review): resuming only works if optimizer2's internal parameter
# references point at model2's parameters — verify that model and optimizer
# were not deep-copied separately inside train(), which would break that link.
_, _, _ = train(model2, optimizer2, loss2, train_loader,
                test_loader, epochs=10)