I tried playing around with learning rates, .01, .001, .0001
however my model loss and val loss are not decreasing.
I am using Densenet from Pytorch models, and have copied most of the code from the Pytorch transfer learning tutorial
which some few minor changes to print out val accurac
y every x amount of batches. I think my minor changes could be affecting training somehow, but I am not sure what I am doing wrong.
Here is how my training looks even after first epoch
Training Loss: 8.5898 Acc: 0.0102
Epoch 1/2
val_loss 8.5163 val_acc: 0.0191
val_loss 8.9667 val_acc: 0.0255
val_loss 9.3078 val_acc: 0.0127
val_loss 8.7855 val_acc: 0.0318
val_loss 8.6217 val_acc: 0.0255
val_loss 9.0631 val_acc: 0.0191
val_loss 8.5167 val_acc: 0.0255
val_loss 9.0499 val_acc: 0.0191
val_loss 9.0549 val_acc: 0.0255
val_loss 8.8373 val_acc: 0.0191
val_loss 8.9288 val_acc: 0.0191
val_loss 8.9968 val_acc: 0.0127
val_loss 9.2790 val_acc: 0.0127
val_loss 9.4389 val_acc: 0.0191
val_loss 8.6907 val_acc: 0.0318
val_loss 9.0903 val_acc: 0.0191
val_loss 9.0093 val_acc: 0.0191
val_loss 9.4387 val_acc: 0.0127
val_loss 9.1059 val_acc: 0.0191
val_loss 9.3480 val_acc: 0.0127
val_loss 8.9435 val_acc: 0.0191
val_loss 8.4412 val_acc: 0.0318
val_loss 8.8712 val_acc: 0.0382
val_loss 8.9125 val_acc: 0.0191
val_loss 9.3815 val_acc: 0.0127
val_loss 9.0214 val_acc: 0.0191
val_loss 9.4234 val_acc: 0.0127
val_loss 9.1625 val_acc: 0.0191
Here is the code (I can’t seem to fix formatting on this site)
from future import print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copydef prepare_dataset_from_folder(data_dir, size, batch_size):
"""prepare datasets into dataloaders
args ------ data_dir = root_dir holding 'train_dir' and 'val_dir' size = size of image batch_size = batch_size """ data_transforms = { 'training': transforms.Compose([ transforms.Resize((size,size)), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]), 'validation': transforms.Compose([ transforms.Resize((size,size)), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]), } image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['training', 'validation']} dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size= batch_size, shuffle=True, num_workers=4) #num_workers leads to errors for x in ['training', 'validation']} dataset_sizes = {x: len(image_datasets[x]) for x in ['training', 'validation']} class_names = image_datasets['training'].classes return dataloaders, dataset_sizes, class_names
def train_model(model, criterion, optimizer, scheduler, num_epochs=3, best_acc = 80.0, batch_size = 5):
since = time.time()
dataloaders, dataset_sizes, class_names = prepare_dataset_from_folder(data_dir, size, batch_size)for epoch in range(num_epochs): print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print('-' * 10) running_loss = 0.0 running_corrects = 0 for num, data in enumerate(dataloaders["training"]): if num % 100 == 0: val_running_loss = 0.0 val_running_corrects = 0 model.eval() for val_num, val_data in enumerate(dataloaders["validation"]): inputs, labels = val_data inputs = inputs.to(device) labels = labels.to(device) with torch.set_grad_enabled(False): outputs = model(inputs) _, preds = torch.max(outputs, 1) loss = criterion(outputs, labels) val_running_loss += loss.item() * inputs.size(0) val_running_corrects += torch.sum(preds == labels.data) val_loss = ( val_running_loss/dataset_sizes['validation']) val_acc = (val_running_corrects.double() / dataset_sizes['validation'])#.cpu().numpy() print('val_loss {:.4f} val_acc: {:.4f}'.format(val_loss, val_acc)) if val_acc > best_acc: print("Saving due to high val accuracy") x = datetime.datetime.now() time_stamp_str = str(x).split(' ')[0] + "_" save_path = os.path.join(SAVE_DIR, f"val_acc_{val_acc}.pt") torch.save(model, save_path) model.train() inputs, labels = data inputs = inputs.to(device) labels = labels.to(device) scheduler.step() optimizer.zero_grad() with torch.set_grad_enabled(True): outputs = model(inputs) _, preds = torch.max(outputs, 1) loss = criterion(outputs, labels) loss.backward() optimizer.step() running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) epoch_loss = running_loss / dataset_sizes['training'] epoch_acc = running_corrects.double() / dataset_sizes['training'] print('Training Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc)) time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) return model
if name==“main”:
data_dir = "TEST 10" device = torch.device("cuda:0") PATH =" " size = 224 batch_size = 5 # pick your model #model_ft = torch.load(PATH) #model_ft = models.resnet18(pretrained=True) model_ft = models.densenet201(pretrained=True) #num_ftrs = model_ft.fc.in_features #in features num_ftrs = 1920 #densenet 201 #https://discuss.pytorch.org/t/what-does-the-fc-in-feature-mean/4889 model_ft.fc = nn.Linear(num_ftrs, 2) model_ft = model_ft.to(device) criterion = nn.CrossEntropyLoss() # Observe that all parameters are being optimized optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9) # Decay LR by a factor of 0.1 every 7 epochs exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=2, gamma=0.1) model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=3, batch_size = batch_size)
Strangely if I use original script from transfer learning with minimal changes, my model begins to learn:
#original script
from future import print_function, division
import pretrainedmodels as ptmodelsimport torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transformsimport matplotlib.pyplot as plt
import time
import os
import copydef train_model(model, criterion, optimizer, scheduler, num_epochs=25):
since = time.time()best_model_wts = copy.deepcopy(model.state_dict()) best_acc = 0.0 for epoch in range(num_epochs): print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print('-' * 10) # Each epoch has a training and validation phase for phase in ['training', 'validation']: if phase == 'training': scheduler.step() #what is this model.train() # Set model to training mode else: model.eval() # Set model to evaluate mode running_loss = 0.0 running_corrects = 0 # Iterate over data. for num, data in enumerate(dataloaders[phase]): inputs, labels = data inputs = inputs.to(device) labels = labels.to(device) # zero the parameter gradients optimizer.zero_grad() # forward # track history if only in train with torch.set_grad_enabled(phase == 'training'): outputs = model(inputs) _, preds = torch.max(outputs, 1) loss = criterion(outputs, labels) # backward + optimize only if in training phase if phase == 'training': loss.backward() optimizer.step() # statistics running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) if num % 100 == 0: print("500 batches") temp_running_corrects = (running_corrects.double() / ( len(inputs) * num + 1)).cpu().numpy() print(temp_running_corrects) #print('running_corrects', running_corrects/( len(inputs) * num + 1).numpy()) epoch_loss = running_loss / dataset_sizes[phase] epoch_acc = running_corrects.double() / dataset_sizes[phase] print('{} Loss: {:.4f} Acc: {:.4f}'.format( phase, epoch_loss, epoch_acc)) # deep copy the model if phase == 'validation' and epoch_acc > best_acc: best_acc = epoch_acc best_model_wts = copy.deepcopy(model.state_dict()) print() time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) print('Best val Acc: {:4f}'.format(best_acc)) # load best model weights model.load_state_dict(best_model_wts) save_path = os.path.join(SAVE_DIR, f"val_acc_{val_acc}.pt") torch.save(model, save_path)
Training Loss:
Epoch 0/2
500 batches
0.0
500 batches
0.681592039800995
500 batches
0.6359102244389028
500 batches
0.6422628951747088
500 batches
0.6541822721598003
500 batches
0.6763236763236763
500 batches
0.694421315570358
500 batches
0.7059243397573162
500 batches
0.7108057464084947
500 batches
0.7096057745696835
500 batches
0.7126436781609196
500 batches
0.7142208087233075
500 batches
0.7251145356101625
500 batches
0.7293348712033834
500 batches
0.729382363441628
500 batches
0.7340886371209597
500 batches
0.7363323961262105
500 batches
0.7391943546015877
500 batches
0.7428492085531797
500 batches
0.7432254669823731
500 batches
0.74656335916021
500 batches
0.7493453939538205
500 batches
0.7521017950465804