ResNet34 stopped training on GPU

Hello, I’m doing transfer learning with ResNet34 on the Kaggle Diabetic Retinopathy challenge dataset. In the code snippets below, the preprocessed images fed to the network are 512x512.
Here’s how I loaded the data:

import os

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms

data_transforms = {
    'train': transforms.Compose([
        transforms.RandomRotation((90, 180)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ]),
    'val': transforms.Compose([
        transforms.ToTensor()
    ]),
    'test': transforms.Compose([
        transforms.ToTensor(),
    ])
}

data_dir = 'new_dataset/'
image_dataset = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x])
                 for x in ['train', 'val', 'test']}

dataloaders = {x: DataLoader(image_dataset[x],
                             batch_size=16,
                             shuffle=True,
                            num_workers=0)
              for x in ['train', 'val', 'test']}
dataSizes = {x: len(image_dataset[x]) for x in ['train', 'val', 'test']}
class_names = image_dataset['train'].classes
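
For context, a quick sanity check like the following (just a sketch, not part of my training script) confirms that the loaders return batches of the expected shape:

# Pull one batch from the train loader and check its shape:
# 16 images per batch, 3 channels, 512x512 pixels.
inputs, labels = next(iter(dataloaders['train']))
print(inputs.shape)   # should be torch.Size([16, 3, 512, 512])
print(labels.shape)   # should be torch.Size([16])
print(class_names, dataSizes)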

And here is the main training code:

def weights_init(model):
    if isinstance(model, nn.Conv2d):
        torch.nn.init.kaiming_normal_(model.weight.data)
    elif isinstance(model, nn.Linear):
        torch.nn.init.kaiming_normal_(model.weight.data)
        model.bias.data.normal_(mean=0, std=1e-2)
    elif isinstance(model, nn.BatchNorm2d):
        model.weight.data.uniform_()
        model.bias.data.zero_()


use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
model = models.resnet34(pretrained=True)
model.avgpool = nn.AvgPool2d(10,10)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)
model = model.to(device)
model = torch.nn.DataParallel(model, device_ids=[0])
model.module.fc.apply(weights_init)
criterion = nn.CrossEntropyLoss()
lr = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
class_names = ['normal','upnormal']
true = []
pred = []
accuracy = 0
num_epochs = 30

mymodel, train_losses, train_acc, val_losses, val_acc = train_model(model, criterion, optimizer, scheduler, num_epochs=num_epochs)
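
I have not pasted train_model here; it follows the standard PyTorch transfer-learning tutorial structure (a train phase and a val phase per epoch). A simplified sketch of that structure, using the dataloaders, dataSizes and device defined above, looks like this (not my exact code):

# Simplified sketch of train_model (not my exact code): standard
# transfer-learning loop with a train and a val phase per epoch.
def train_model(model, criterion, optimizer, scheduler, num_epochs=30):
    train_losses, train_acc, val_losses, val_acc = [], [], [], []
    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                # Only track gradients during the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataSizes[phase]
            epoch_acc = running_corrects.double().item() / dataSizes[phase]
            if phase == 'train':
                train_losses.append(epoch_loss)
                train_acc.append(epoch_acc)
            else:
                val_losses.append(epoch_loss)
                val_acc.append(epoch_acc)

    return model, train_losses, train_acc, val_losses, val_acc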

My problem is that training got stuck at epoch 23: the code did not raise any error, but it sat in that epoch for 8 hours (far longer than any epoch took during normal training) and GPU utilization was at 0%.
The code was run on Windows. Can anyone tell me what was happening, what a stuck epoch like this means, or where it could come from?
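
In case it is useful for debugging, one thing I can try next time is Python's faulthandler, which dumps every thread's traceback if the run appears stuck (just a sketch of the idea):

import faulthandler

# Periodically dump every thread's traceback to stderr; if an epoch hangs,
# this at least shows which Python call the process is blocked in.
faulthandler.dump_traceback_later(60 * 60, repeat=True)

# ... run train_model as above ...

faulthandler.cancel_dump_traceback_later()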
