Brief background:
I have some neural network code where I use the off-the-shelf AlexNet to classify pictures of galaxies according to their apparent shape. I use an outer loop to randomly split the data into train and test sets, and an inner loop to train for a certain number of epochs (using the given train-test split).
Below I show some performance snippets. With the pretrained AlexNet, I get performance like this:
Epoch 0/19
----------
train Loss: 1.0289 Acc: 0.5794
val Loss: 0.8498 Acc: 0.6455
Epoch 1/19
----------
train Loss: 0.7615 Acc: 0.6877
val Loss: 0.6023 Acc: 0.7675
Epoch 2/19
----------
train Loss: 0.6124 Acc: 0.7541
val Loss: 0.5280 Acc: 0.7989
Epoch 3/19
----------
train Loss: 0.5634 Acc: 0.7775
val Loss: 0.4999 Acc: 0.8088
Epoch 4/19
----------
train Loss: 0.5335 Acc: 0.7915
val Loss: 0.4847 Acc: 0.8038
However, when I make the following two changes to my code:
- def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
  becomes
  def initialize_model(model_name, num_classes, feature_extract, use_pretrained):
- model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True)
  becomes
  model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=False)
Then I get performance like the following:
Epoch 0/19
----------
train Loss: 1.3729 Acc: 0.4470
val Loss: 1.3457 Acc: 0.5472
Epoch 1/19
----------
train Loss: 1.3219 Acc: 0.5496
val Loss: 1.2954 Acc: 0.5472
Epoch 2/19
----------
train Loss: 1.2761 Acc: 0.5496
val Loss: 1.2544 Acc: 0.5472
Epoch 3/19
----------
train Loss: 1.2391 Acc: 0.5496
val Loss: 1.2214 Acc: 0.5472
Epoch 4/19
----------
train Loss: 1.2092 Acc: 0.5496
val Loss: 1.1948 Acc: 0.5472
I understand that the fresh (not pretrained) network will perform worse initially. What concerns me is that the accuracy scores are essentially constant (train is stuck at 0.5496 after the first epoch, and val is 0.5472 throughout), even while the losses are decreasing.
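One check I'm planning to run, to see whether the network has collapsed to always predicting a single class (a minimal sketch, assuming model_ft, dataloaders, device, and num_classes as in the code further down; the constant 0.5472 val accuracy looks suspiciously like a majority-class frequency):

    # Hypothetical diagnostic: compare the distribution of predicted classes
    # against the true class distribution on the validation set.
    import torch

    pred_counts = torch.zeros(num_classes, dtype=torch.long)
    label_counts = torch.zeros(num_classes, dtype=torch.long)
    model_ft.eval()
    with torch.no_grad():
        for inputs, labels in dataloaders['val']:
            outputs = model_ft(inputs.to(device))
            _, preds = torch.max(outputs, 1)
            pred_counts += torch.bincount(preds.cpu(), minlength=num_classes)
            label_counts += torch.bincount(labels, minlength=num_classes)
    print('predicted class frequencies:', pred_counts.float() / pred_counts.sum())
    print('actual class frequencies:   ', label_counts.float() / label_counts.sum())
    # If one predicted frequency is 1.0 and the matching actual frequency is
    # about 0.5472, the constant accuracy just means the network always
    # predicts the majority class.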
I am thinking about re-running the pretrained job but commenting out the lines where I use DistributedDataParallel (a sketch of what I mean is below). I have no idea if this is relevant, but I can't think of anything else to suspect.
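Concretely, the change I have in mind would look roughly like this (a sketch only; the rest of main stays as shown further down):

    # Sketch of the non-DDP re-run I'm considering: skip the process group
    # and the DDP wrapper, and run the bare model on one GPU.
    model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True)
    model_ft = model_ft.to(device)
    # model_ft = torch.nn.parallel.DistributedDataParallel(model_ft, device_ids=[current_device])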
I have switched between pretrained and non-pretrained AlexNet before (although with no random train-test splits and no DistributedDataParallel), and the non-pretrained network behaved as I would have expected: it performed much worse initially but eventually (mostly) caught up.
Below I share some parts of my code (for the pretrained job).
Model training function: (this is mostly a direct paste from a PyTorch tutorial)
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
    since = time.time()
    val_acc_history = []
    train_acc_history = []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an
                    # auxiliary output. In train mode we calculate the loss by
                    # summing the final output and the auxiliary output but in
                    # testing we only consider the final output.
                    if is_inception and phase == 'train':
                        # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4*loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
            if phase == 'train':
                train_acc_history.append(epoch_acc)
        print()
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # 'best_model_accuracy': best_acc,
            # 'best_model_weights': best_model_wts,
            'loss': loss,
            # 'train_hist': train_acc_history,
            # 'val_hist': val_acc_history,
        }, f'/project/rrg-lelliott/jsa378/model_1_output/checkpoint_run_{run}_epoch_{epoch}.tar')
        # }, str(os.getenv('location3'))+f'/checkpoint_run_{run}_epoch_{epoch}.tar')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_model_accuracy': best_acc,
        'best_model_weights': best_model_wts,
        'loss': loss,
        'train_hist': train_acc_history,
        'val_hist': val_acc_history,
    }, f'/project/rrg-lelliott/jsa378/model_1_output/checkpoint_run_{run}.tar')
    # }, str(os.getenv('location3'))+f'/checkpoint_run_{run}.tar')
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history, train_acc_history
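For context, I call this roughly as follows (an illustrative sketch: the criterion and optimizer hyperparameters here are placeholders, not necessarily my exact settings; num_epochs=20 matches the Epoch 0/19 output above):

    # Illustrative call; the loss and optimizer settings are placeholders.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
    model_ft, val_acc_history, train_acc_history = train_model(
        model_ft, dataloaders, criterion, optimizer, num_epochs=20)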
Model initializer: (again mostly pasted from a tutorial)
def initialize_model(model_name, num_classes, feature_extract, use_pretrained):
    # Initialize these variables which will be set in this if statement.
    # Each of these variables is model specific.
    model_ft = None
    input_size = 0
    if model_name == "alexnet":
        """ Alexnet
        """
        model_ft = models.alexnet(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
        input_size = 224
    elif model_name == "resnet":
        """ Resnet152
        """
        model_ft = models.resnet152(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224
    else:
        print("Invalid model name, exiting...")
        exit()
    return model_ft, input_size
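For completeness, here is the set_parameter_requires_grad helper referenced above (the standard one from the same PyTorch tutorial):

    def set_parameter_requires_grad(model, feature_extracting):
        # When feature extracting, freeze all pretrained weights so that
        # only the newly added classifier layer is trained.
        if feature_extracting:
            for param in model.parameters():
                param.requires_grad = False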
DistributedDataParallel usage and model initialization:
def main():
    rank = os.environ.get("SLURM_LOCALID")
    current_device = 0
    torch.cuda.set_device(current_device)
    print('From Rank: {}, ==> Initializing Process Group...'.format(rank))
    dist.init_process_group(backend="mpi", init_method='tcp://127.0.0.1:3456')  # Use backend="mpi" or "gloo". NCCL does not work on a single GPU due to a hard-coded multi-GPU topology check.
    print("process group ready!")
    print('From Rank: {}, ==> Making model..'.format(rank))
    model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=False)
    print(model_ft)
    model_ft = model_ft.to(device)
    model_ft = torch.nn.parallel.DistributedDataParallel(model_ft, device_ids=[current_device])  # Wrap the model with DistributedDataParallel
(There is much more code in the main function that I'm not showing.)
Thanks for any help.