I’m training a model, running a forward pass with it, and comparing the result against a forward pass from a reloaded copy of the same model. The two outputs are similar, and they perform similarly when used for classification, but they are not exactly identical as I expected.
I set up the model and optimizer:
import torch
import torch.nn as nn
import torch.optim as optim

class LinearNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(98, 98*3),
            nn.ReLU(inplace=True),
            nn.Linear(98*3, 98*2),
            nn.ReLU(inplace=True),
            nn.Linear(98*2, 98*1)
        )

    def forward(self, x):
        x = x.view(-1, 98*1)   # flatten to (batch, 98)
        x = self.classifier(x)
        return x
net = LinearNet()
optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999),
                       eps=1e-08, weight_decay=0, amsgrad=False)
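As a quick sanity check on the architecture (random data, purely for illustration), the model maps a batch of 98-dimensional inputs to 98-dimensional outputs:

x = torch.randn(4, 98)   # dummy batch, illustration only
print(net(x).shape)      # torch.Size([4, 98])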
Train the model, saving a checkpoint every 2000 mini-batches (criterion, trainloader, valloader, epochs, and PATH are defined earlier in my script):
training_loss, validation_loss = [], []
running_loss = 0.0
for epoch in range(1, epochs+1):
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels.flatten())
        loss.backward()
        optimizer.step()
        # gather statistics every 2000 mini-batches
        running_loss += loss.item()
        if i % 2000 == 1999:
            # log the running training loss
            training_loss.append(running_loss / 2000)
            # log the running validation loss
            with torch.no_grad():
                running_val_loss = 0.0
                for i_val, data_val in enumerate(valloader, 0):
                    inputs_val, labels_val = data_val
                    outputs_val = net(inputs_val)
                    loss_val = criterion(outputs_val, labels_val.flatten()).item()
                    running_val_loss += loss_val
            validation_loss.append(running_val_loss / len(valloader))
            print('[%d, %5d] train_loss: %.3f | val_loss: %.3f' %
                  (epoch, i + 1, running_loss / 2000, running_val_loss / len(valloader)))
            # save a checkpoint
            torch.save({
                'epoch': epoch,
                'model_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'training_loss': running_loss / 2000,
                'validation_loss': running_val_loss / len(valloader)
            }, PATH + '/epoch{}_model.pt'.format(epoch))
            running_loss = 0.0
net.eval()
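Since each checkpoint also stores the optimizer state, training could in principle be resumed later; a minimal sketch, assuming the same PATH and epoch as above:

checkpoint = torch.load(PATH + '/epoch{}_model.pt'.format(epoch))
net.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
net.train()  # switch back to training mode before continuing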
Load the last saved checkpoint:
checkpoint = torch.load(PATH+'/epoch{}_model.pt'.format(epoch))
loaded_net = LinearNet()
loaded_net.load_state_dict(checkpoint['model_state_dict'])
loaded_net.to(device)
for parameter in loaded_net.parameters():
    parameter.requires_grad = False
loaded_net.eval()
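To rule out a serialization problem, the weights of the live net and the reloaded loaded_net can be compared directly; a minimal sketch (both copies moved to CPU so torch.equal can compare them):

for (name, p1), (_, p2) in zip(net.state_dict().items(),
                               loaded_net.state_dict().items()):
    if not torch.equal(p1.cpu(), p2.cpu()):
        print('weights differ in', name)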
And finally I run the same forward pass through both the trained net and the reloaded loaded_net:

net.to(device)  # make sure both models sit on the same device
output = net(torch.tensor(inputs).float().to(device)).cpu().detach().numpy()
loaded_output = loaded_net(torch.tensor(inputs).float().to(device)).cpu().detach().numpy()
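The gap between the two outputs can be quantified with numpy (a sketch; the atol tolerance is an arbitrary value I picked for illustration):

import numpy as np

print(np.abs(output - loaded_output).max())           # largest elementwise difference
print(np.allclose(output, loaded_output, atol=1e-6))  # True only if effectively identical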
Why are the two outputs not exactly the same? Is something wrong with my checkpointing process?