Hello,
for the last two days I have been trying to solve an issue with resuming training from a model checkpoint. The problem is that the training loss right after resuming is a LOT different from the loss right before the model was saved (the difference is huge, almost as if the model were fresh from initialization). I can see that after a few iterations the accuracy increases (and the loss decreases) much faster than when training from scratch, but I don't know what causes the loss to be so high.
The weird thing is that I checked the output for the same input (after first switching the models to eval mode with model.eval()) right before saving the model and right after loading it, and the output was the same -> so the model has the correct weights everywhere. I assume the model is loaded correctly, because otherwise it would produce a different output for the same input.
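Roughly, the check looked like this (a simplified sketch, not my exact code; dummy_input and the file path are just placeholders for a fixed batch and my real checkpoint):

# Sketch of the before-save / after-load output comparison
# (dummy_input stands for one fixed batch from the train_loader)
model.eval()
fc_layer.eval()
with torch.no_grad():
    out_before = model(dummy_input).clone()

torch.save(model.state_dict(), 'tmp_model.pth')   # placeholder path
model.load_state_dict(torch.load('tmp_model.pth'))
model.eval()
with torch.no_grad():
    out_after = model(dummy_input)

print(torch.allclose(out_before, out_after))   # this printed True for me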
So I wondered whether the optimizer and the lr_scheduler are saved and loaded correctly. I checked, and the optimizer is loaded correctly; the only difference was in the ids for each layer, but I guess those ids are assigned at the time the model is created. The order was correct and the values in the state dict were also correct. The lr_scheduler had the correct values as well. I also tried the model without the lr_scheduler and removed all the "params" from the optimizer (I set momentum = 0 and weight_decay = 0, so there is actually no per-parameter state in the optimizer). But still without success.
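For the optimizer and lr_scheduler the check was essentially a save/load round trip of their state dicts, something like this sketch (it uses the two-param-group SGD and the StepLR created in main() below; the file path is a placeholder):

# Sketch of the optimizer / scheduler state round-trip check
saved = {
    'optimizer': optimizer.state_dict(),
    'scheduler': scheduler.state_dict(),
}
torch.save(saved, 'tmp_state.pth')   # placeholder path

restored = torch.load('tmp_state.pth')
optimizer.load_state_dict(restored['optimizer'])
scheduler.load_state_dict(restored['scheduler'])

# The hyper-parameters of each param group (lr, momentum, weight_decay, ...)
# should be identical; only the parameter ids under 'params' differed between runs.
for before, after in zip(saved['optimizer']['param_groups'],
                         optimizer.state_dict()['param_groups']):
    print({k: v for k, v in before.items() if k != 'params'} ==
          {k: v for k, v in after.items() if k != 'params'})
print(saved['scheduler'] == scheduler.state_dict())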
Maybe there is some problem with using one optimizer for two models, like this: optimizer = torch.optim.SGD([{'params': model.parameters()}, {'params': fc_layer.parameters()}], lr=config.lr, weight_decay=config.weight_decay, momentum=config.momentum)? I am training a face recognition model with AM-Softmax (see the code below), which is why I have two models (one for feature-vector extraction, and a second one for learning with AM-Softmax). Could the problem be here? I checked, and the weights are saved to and loaded from the fc_layer model correctly.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter


class AddMarginProduct(nn.Module):
    def __init__(self, in_features: int, out_features: int, s: float = 30.0, m: float = 0.35) -> None:
        '''
        Implementation of large margin cosine distance

        Arguments:
            in_features {int} -- [size of each input sample]
            out_features {int} -- [size of each output sample]

        Keyword Arguments:
            s {float} -- [norm of input feature] (default: {30.0})
            m {float} -- [margin] (default: {0.35})
        '''
        super(AddMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, label):
        '''
        Forward pass of the layer

        Arguments:
            input {torch.Tensor} -- [feature vectors from the backbone]
            label {torch.Tensor} -- [ground-truth class indices]

        Returns:
            torch.Tensor -- [scaled logits with the margin applied to the target class]
        '''
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        phi = cosine - self.m
        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cosine.size()).scatter_(1, label.unsqueeze(1), 1).byte()
        output = torch.where(one_hot, phi, cosine)
        output *= self.s
        return output
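Just to make the usage clear: the head above is combined with plain cross entropy, roughly like this (a minimal sketch with made-up sizes and the imports from above; the real code uses config.feature_size, the ResNet-18 features and the dataset labels, and relies on torch.where accepting a byte mask on the PyTorch versions I am using):

# Minimal usage sketch with made-up sizes
feature_size, num_classes, batch_size = 512, 1000, 8
head = AddMarginProduct(feature_size, num_classes)

features = torch.randn(batch_size, feature_size)        # stands in for model(images)
labels = torch.randint(0, num_classes, (batch_size,))   # ground-truth class indices

logits = head(features, labels)   # s * (cos(theta) - m) for the target class, s * cos(theta) elsewhere
loss = F.cross_entropy(logits, labels)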
Right now I am out of ideas as to why there is such a big difference after resuming from the model checkpoint.
Below are the main parts of my code; maybe you can spot a mistake I made? (I removed unnecessary parts such as config parsing and my custom classes for progress managing, but the complete code does not raise any errors.)
def main():
    # Set cuda if it's available, set correct default tensor and set random seed
    use_cuda = not config.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # torch.manual_seed(config.seed)  # Tried both with and without the manual seed
    if device.type == 'cuda':
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
        cudnn.benchmark = True

    # Create train dataset factory with transforms and train_loader
    train_dataset = dataset_factory.get_train_dataset(transforms=T.Compose([
        T.RandomAffine(10, translate=(0.05, 0.05), scale=(0.9, 1.1), shear=5),
        T.RandomHorizontalFlip(p=0.5),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]))
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_of_workers,
        pin_memory=config.pin_memory,
    )

    # Create test dataset with transforms and test_loader
    test_dataset = dataset_factory.get_test_dataset(transforms=T.Compose([
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]))
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.test_batch_size,
        shuffle=False,
        num_workers=config.num_of_workers,
        pin_memory=config.pin_memory,
    )

    # Create models
    model = resnet_18(pretrained=config.pretrained, num_classes=config.feature_size)
    fc_layer = AddMarginProduct(config.feature_size, train_dataset.one_hot_vector_classes)

    # Cast to device
    model.to(device)
    fc_layer.to(device)

    # Create loss func and optimizer
    criterion = F.cross_entropy
    optimizer = torch.optim.SGD(
        [{'params': model.parameters()}, {'params': fc_layer.parameters()}],
        lr=config.lr, weight_decay=config.weight_decay, momentum=config.momentum)

    # Set scheduler and set initial epoch
    scheduler = StepLR(optimizer, step_size=config.lr_step_size, gamma=config.lr_gammma)
    start_epoch = 1

    # ModelManager is just a wrapper around torch.save() and torch.load() that
    # automatically adds the best accuracies to the dictionary
    model_manager = ModelManager()

    # If we should resume
    if args.resume and os.path.isfile(args.resume):
        fc_load_failed = False
        # Load the checkpoint
        print('Loading checkpoint {}'.format(args.resume))
        checkpoint_state = model_manager.load_model_checkpoint(args.resume)
        # Models
        model.load_state_dict(checkpoint_state["models"]["model"])
        try:
            fc_layer.load_state_dict(checkpoint_state["models"]["fc_layer"])
            # fc_layer.to(device)
        except Exception as e:
            fc_load_failed = True
            print("Cannot load fc_layer (probably wrong dimensions from loaded model) - exception: ", e)
        # Unless only the model should be restored, also load the scheduler,
        # optimizer and starting epoch
        if not args.model_only:
            scheduler.load_state_dict(checkpoint_state['scheduler'])
            if not fc_load_failed:
                optimizer.load_state_dict(checkpoint_state['optimizer'])
            start_epoch = checkpoint_state['epoch'] + 1
            config.epochs += start_epoch
        print("Loaded checkpoint '{}' (epoch {})".format(args.resume, start_epoch))

    # Train
    for epoch in range(start_epoch, config.epochs):
        scheduler.step()
        # model.train() and fc_layer.train() are called inside this function
        train(config, model, fc_layer, device, train_loader, optimizer, criterion, epoch, update_logger, epoch_logger)
        # model.eval() and fc_layer.eval() are called inside this function
        lfw_acc = test(config, model, fc_layer, device, test_loader, criterion, epoch_logger, epoch, evaluator, plots_dir)

        # Save model after each epoch
        model_manager.save_model_checkpoint(
            os.path.join(models_dir, "{}_e_{}_acc_{}.pth".format(config.model, str(epoch).zfill(4), lfw_acc)),
            {
                "model_arch": config.model,
                "fc_layer_arch": config.fc_layer,
                "models": {
                    "model": model.state_dict(),
                    "fc_layer": fc_layer.state_dict()
                },
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
                "epoch": epoch,
            },
            lfw_acc,
            os.path.join(models_dir, "{}_best.pth".format(config.model))
        )
Have I done anything wrong?
I also tried PyTorch versions 0.4.1 and 1.0.0, and nothing changed.
Thank you very much for your help. I am a little desperate, because I am out of ideas about what could be wrong.