Loading model from torchvision
model = models.segmentation.deeplabv3_resnet50(pretrained=True, progress=False, num_classes=21, aux_loss=None)
model.classifier[4] = nn.Conv2d(256, 1, (1, 1), (1, 1))
model.aux_classifier[4] = nn.Conv2d(256, 1, (1, 1), (1, 1))
Freeze initial layer for finetuning
idx = 0
for name, param in model.named_parameters():
# print(idx, name)
if idx < 129:
param.requires_grad_ = False
else:
break
idx += 1
Defining device to cuda if GPU is avaliable else to cpu
device = torch.device(âcudaâ if torch.cuda.is_available() else âcpuâ)
Defining loss criterion
criterion = FocalTverskyLoss(alpha=0.9)
Defining optimizer to update model params, Adamâs a good default
optimizer = optim.Adam(model.parameters())
Learning rate scheduler to update lr when loss stops improving
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)
Path at which to store model and required configs
model_save_path = âModels/â
If model file exits load previous training configs (model and other stuff) to continue training else start from 1
if os.path.exists(model_save_path + âdeeplabv3_resnet50_train.ptâ):
checkpoint = torch.load(model_save_path + âdeeplabv3_resnet50_train.ptâ)
start = checkpoint[âepochâ] + 1
min_val_loss = checkpoint[âval_lossâ]
model.load_state_dict(checkpoint[âmodel_state_dictâ])
optimizer.load_state_dict(checkpoint[âoptimizer_state_dictâ])
scheduler.load_state_dict(checkpoint[âscheduler_state_dictâ])
else:
start = 1
min_val_loss = 100
if torch.cuda.device_count() > 1:
print("Letâs use ",torch.cuda.device_count(),âGPUS!â)
model = nn.DataParallel(model, device_ids=[0,1]).cuda()
Move model to device
model.to(device)
Total epochs to train for
epochs = 100
Training loop
for epoch in range(start, epochs + 1):
train_loss = 0.
val_loss = 0.
# Switch model to training mode
model.train()
# Forward pass through dataset
for imgs, masks in train_loader:
# Move batch data to device
imgs, masks = imgs.to(device), masks.to(device)
# Clear previous gradients
optimizer.zero_grad()
# Forward pass the batch and get predictions
preds = model(imgs)['out']
# Calculate Loss
loss = criterion(preds, masks)
# Add to calculate loss for whole dataset
train_loss += (loss.item() * imgs.size(0))
# Backpropagate gradients
loss.backward()
# Make weight updates
optimizer.step()
# Empty cuda cache to clear useless data from VRAM for better utilization
torch.cuda.empty_cache()
# Switch model to inference mode
model.eval()
# Forward pass through validation dataset
with torch.no_grad():
for imgs, masks in val_loader:
imgs, masks = imgs.to(device), masks.to(device)
preds = model(imgs)['out']
loss = criterion(preds, masks)
val_loss += (loss.item() * imgs.size(0))
torch.cuda.empty_cache()
# Average loss on the dataset
train_loss /= len(train_dataset)
val_loss /= len(val_dataset)
# Change lr if loss is not improving
scheduler.step(val_loss)
# If loss is decreasing then store model and other configs in file else not
if val_loss < min_val_loss:
min_val_loss = val_loss
torch.save({
'epoch': epoch,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'scheduler_state_dict' : scheduler.state_dict(),
'val_loss' : min_val_loss
}, model_save_path + 'deeplabv3_resnet50_train.pt')
torch.save(model, model_save_path + 'deeplabv3_resnet50_infer.pt')
# Print epoch number and train loss
print('Epoch {}:\tTrain Loss: {}\tVal Loss: {}'.format(epoch, train_loss, val_loss))
It gives error Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu