Doubt on In-Memory checkpoint

I would like to save an in-memory checkpoint so that I can resume training later. My question is: am I saving everything necessary to continue training with the following code? Also, is it necessary to save the loss?

import copy

import torch
from torch import nn, optim

DEVICE = 'cuda'

def train(model, optimizer, loss, train_loader, test_loader, epochs):
    best_model, best_optimizer, best_loss, best_accuracy = None, None, None, 0.0
    for epoch in range(epochs):
        for x, y in train_loader:
            optimizer.zero_grad()
            x = x.to(DEVICE)
            y = y.to(DEVICE)
            y_hat = model(x)
            output = loss(y_hat, y)
            output.backward()
            optimizer.step()
        model.eval()
        test_acc, test_loss = validate(model, test_loader, loss)
        model.train()
        # Save in-memory checkpoint
        if test_acc > best_accuracy:
            best_accuracy = test_acc
            best_model = copy.deepcopy(model)  # Is this the "correct" way?
            best_optimizer = copy.deepcopy(optimizer)
            best_loss = copy.deepcopy(loss)
    return best_model, best_optimizer, best_loss


def validate(model, test_loader, loss_fn):
    with torch.no_grad():
        # ... evaluation loop left out of the post; returns (test_accuracy, test_loss)
        pass

And then, when I call the train loop, my intention is to continue training like this:

model = nn.Sequential(
  # ...
)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss = nn.CrossEntropyLoss()
model2, optimizer2, loss2 = train(model, optimizer, loss,
                                  train_loader, test_loader, epochs=150)
# Continue training
_, _, _ = train(model2, optimizer2, loss2, train_loader,
                test_loader, epochs=10)

I am using the same logic I posted here, but when I call the train loop again, learning gets stuck and the same loss is shown every epoch. Any idea why?

I think your code won't work, since the copied optimizer2 loses its reference to the parameters of the new model copy and thus won't update it anymore.
You would therefore have to create a new optimizer from model2's parameters and load the old optimizer's state_dict into it, as shown here:

def train(model, optimizer, criterion, data, target):
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out, target)
    loss.backward()
    optimizer.step()

    # Save in-memory checkpoint
    best_model = copy.deepcopy(model)  # deepcopy is fine for the model itself
    best_optimizer = copy.deepcopy(optimizer)
    best_loss = loss.detach().clone()  # detach so the stored loss doesn't keep the graph alive
    return best_model, best_optimizer, best_loss



model = nn.Linear(10, 10)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
data = torch.randn(1, 10)
target = torch.tensor([0])

model2, optimizer2, loss2 = train(model, optimizer, criterion, data, target)
print(loss2)

for _ in range(10):
    optimizer = optim.Adam(model2.parameters(), lr=1e-3)
    optimizer.load_state_dict(optimizer2.state_dict())
    model2, optimizer2, loss2 = train(model2, optimizer, criterion, data, target)
    print(loss2)