Training phase of Leave-One-Out Cross Validation

I am not sure how to make the following code work as expected–maybe it is working correctly actually. Can you please refer to the line number with a fix or suggestion? line numbers can be seen here: https://pastebin.com/Ccy6P0Di

Before showing the code, it seems the training is actually working. However, it is very slow. I have 864 images and 9 categories. Can you please confirm if the below code makes any sense? Like will I eventually get the test accuracy at the end?

[jalal@goku official_tut]$ python loocv_tl.py 

Using sample 0 as test data

Batch 0

Epoch 0/24

----------

train Loss: 2.1908 Acc: 0.2616

Epoch 1/24

----------

train Loss: 1.9760 Acc: 0.2847

Epoch 2/24

----------
from __future__ import print_function, division

import torch
from torch.autograd import Variable



import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy



import torch.utils.data as data_utils
from torch.utils import data


data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(20),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
}


data_dir = "images"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def imshow(inp, title=None):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated



def train_model(model, criterion, optimizer, scheduler, dataloader, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            #for inputs, labels in dataloaders[phase]:
            for inputs, labels in dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
 #           if phase == 'val' and epoch_acc > best_acc:
 #               best_acc = epoch_acc
 #               best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
#    print('Best val Acc: {:4f}'.format(best_acc))

#    model.load_state_dict(best_model_wts)
    return model


def visualize_model(model, num_images=6):
    was_training = model.training
    model.eval()
    images_so_far = 0
    fig = plt.figure()

    with torch.no_grad():
        #for i, (inputs, labels) in enumerate(dataloaders['test]):
        for i, (inputs, labels) in enumerate(dataloaders['train']):

            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

            for j in range(inputs.size()[0]):
                images_so_far += 1
                ax = plt.subplot(num_images//2, 2, images_so_far)
                ax.axis('off')
                ax.set_title('predicted: {}'.format(class_names[preds[j]]))
                imshow(inputs.cpu().data[j])

                if images_so_far == num_images:
                    model.train(mode=was_training)
                    return
        model.train(mode=was_training)



######################################################################
# Finetuning the convnet
# ----------------------
#
# Load a pretrained model and reset final fully connected layer.
#

#model_ft = models.resnet18(pretrained=True)
model_ft = models.resnet50(pretrained=True)

num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 9)

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)



#model_ft = model_ft.cuda()
nb_samples = 864
nb_classes = 9


data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(20),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
}

'''val_loader = data.DataLoader(
        image_datasets['train'],
        num_workers=2,
        batch_size=1
    )
val_loader = iter(val_loader)'''

image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train']}

dataset_sizes = {x: len(image_datasets[x]) for x in ['train']}
class_names = image_datasets['train'].classes

# LOOCV
loocv_preds = []
loocv_targets = []
for idx in range(nb_samples):
    
    print('Using sample {} as test data'.format(idx))
    
    # Get all indices and remove test sample
    train_indices = list(range(len(image_datasets['train']))) 
    del train_indices[idx]
    
    # Create new sampler
    sampler = data.SubsetRandomSampler(train_indices)

    dataloader = data.DataLoader(
        image_datasets['train'],
        num_workers=2,
        batch_size=1,
        sampler=sampler
    )
    
    # Train model
    for batch_idx, (samples, target) in enumerate(dataloader):
        print('Batch {}'.format(batch_idx))
        model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, dataloader, num_epochs=25) # do I add this line here?
                
    # Test on LOO sample
    model_ft.eval()
    test_data, test_target = image_datasets['train'][idx]
    #test_data, test_target = dataloader.next()
    test_data = test_data.cuda()
    test_target = test_target.cuda()
    test_data.unsqueeze_(1)
    test_target.unsqueeze_(0)

    output = model_ft(test_data)
    pred = torch.argmax(output, 1)
    loocv_preds.append(pred)
    loocv_targets.append(test_target.item())

I have commented some of the stuff from the original tutorial that I was not sure if I should get away with. Should they be commented? Also, if I am doing “LOOCV” on “tranfer learning” code, should I just have “train” folder with all my images in respective categories and no “test”/“val” folders (which are present in original tutorial) in “images” folder?

1 Like

Yes, you won’t need a val folder, as you are selecting one sample as the test case for LOOCV.

There are still some issues in your code:

  • Currently train_model takes the DataLoader and iterates it (line79). However, you are also iterating your DataLoader in line 230. So basically you have a nested loop now, which is probably not, what you want to have. That’s probably the reason for the slow training.
  • Your transformation is only defined for the training case, thus has some random transformations, which is fine. However, since you are sampling the test data also from the train dataset, you are also applying these random transformations to your test data. This is usually not desired.
  • Line 240 should probably unsqueeze dim0.

Since LOOCV might be a bit tricky to get right (without a data leakage at least), maybe using skorch would be a good idea. Skorch provides some wrappers for PyTorch to be scikit-compatible, which has a built-in LOOCV.
If that’s not an option, I would try to separate the data and transformation, and remove some unused code from train_model just to make sure the code is indeed working as expected.

1 Like

Thanks a lot for all these explanations. Except for transformation in dataloader, I was able to apply the rest of the feedback and seems it is actually working (I am not 100% sure). I did a minimal test with 10 images and two classes with 2 epochs.
Here, I expect each batch (say batch 1) to appear 9 times in the results which does.

Here is the new code: https://pastebin.com/MECLdfFZ

and here is the output: https://pastebin.com/SKQNRQNa

Can you help a little further how to separate the data loader for train and the test in loocv? the one in original tutorial was kind of very different and I am not sure how to make it work in this case.

Also, I found a notebook that does “transfer learning” using skorch. However, I cannot figure how to apply LOOCV here: https://colab.research.google.com/github/dnouri/skorch/blob/master/notebooks/Transfer_Learning.ipynb#scrollTo=IY4BAQUJLUiT

Can you please help with that?

I’ve updated your code a bit here.
There was still the preprocessing of the test sample missing as well as the model re-initialization after each LOOCV iteration.
Could you try that code again?

1 Like

Thanks a lot for the modified code. So, when I use your code, only one sample and its label are passed through the train_model method. I felt it was strange. I changed it so that the entire dataloader (except that of the test) gets passed to the train_model. However, it seems that no training is happening. This is a very obvious dataset I made (25 cat, and 25 Spanish Cavalier) so it should be fairly easy to detect with a high accuracy. I get a low accuracy.

here is your code (commented with parts so that I can send the entire dataloader to train_model. I mean this is not ideal specially if I have a original dataset of 860 images. However, enumerate(dataloader) for 49 images does have 49 batches which means in your code at each batch, one image and its label gets passed to train_model.

  1. Is that correct and is it what you expected that at each batch only 1 image and its label get passed to train_model in your code?
  2. Is the new code I have here wrong? can be accessed here with line numbers: https://pastebin.com/wp74WsFY
from __future__ import print_function, division

import torch
from torch.autograd import Variable
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

import torch.utils.data as data_utils
from torch.utils import data

torch.manual_seed(2809)


def train_model(model, criterion, optimizer, scheduler,
                dataloader, num_epochs=25):
    '''since = time.time()
    model.train()  # Set model to training mode
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        scheduler.step()

        running_loss = 0.0
        running_corrects = 0

        # Iterate over data.
        train_input = train_input.to(device)
        train_label = train_label.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        output = model(train_input)
        _, pred = torch.max(output, 1)
        loss = criterion(output, train_label)
        loss.backward()
        optimizer.step()

        # statistics
        running_loss += loss.item() * train_input.size(0)
        running_corrects += torch.sum(pred == train_label.data)

        epoch_loss = running_loss / dataset_size['train']
        epoch_acc = running_corrects.double() / dataset_size['train']

        print('train Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    return model'''

    since = time.time()
 
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
 
        # Each epoch has a training and validation phase
 
        scheduler.step()
        model.train()  # Set model to training mode
 
 
        running_loss = 0.0
        running_corrects = 0
 
        # Iterate over data.
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
 
            # zero the parameter gradients
            optimizer.zero_grad()
 
            # forward
            # track history if only in train
            with torch.set_grad_enabled(True):
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                # backward + optimize only if in training phase
                loss.backward()
                optimizer.step()
            # statistics
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
 
        epoch_loss = running_loss / dataset_size['train']
        epoch_acc = running_corrects.double() / dataset_size['train']
 
        print('Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
 
 
 
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
 
 
 
    return model


data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(20),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
}


data_dir = "test_images"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_ft = models.resnet50(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 2)
model_ft = model_ft.to(device)

# Save a clone of initial model to restore later
initial_model = copy.deepcopy(model_ft)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

#model_ft = model_ft.cuda()
nb_samples = 50
nb_classes = 2

image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train']}

dataset_size = {x: len(image_datasets[x]) for x in ['train']}
class_names = image_datasets['train'].classes

# LOOCV
loocv_preds = []
loocv_targets = []
for idx in range(nb_samples):

    print('Using sample {} as test data'.format(idx))

    print('Resetting model')
    model_ft = copy.deepcopy(initial_model)

    # Get all indices and remove test sample
    train_indices = list(range(len(image_datasets['train'])))
    del train_indices[idx]

    # Create new sampler
    sampler = data.SubsetRandomSampler(train_indices)
    dataloader = data.DataLoader(
        image_datasets['train'],
        num_workers=2,
        batch_size=1,
        sampler=sampler
    )

    # Train model
    '''for batch_idx, (sample, target) in enumerate(dataloader):
        print('Batch {}'.format(batch_idx))
        model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, sample, target, num_epochs=10)'''

    model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, dataloader, num_epochs=10)

    # Test on LOO sample
    model_ft.eval()
    test_data, test_target = image_datasets['train'][idx]
    # Apply test preprocessing on data
    print(type(test_data))
    test_data = data_transforms['test'](transforms.ToPILImage()(test_data))
    test_data = test_data.cuda()
    test_target = torch.tensor(test_target)
    test_target = test_target.cuda()
    test_data.unsqueeze_(0)
    test_target.unsqueeze_(0)
    print(test_data.shape)
    output = model_ft(test_data)
    pred = torch.argmax(output, 1)
    loocv_preds.append(pred)
    loocv_targets.append(test_target.item())


print("loocv preds: ", loocv_preds)
print("loocv targets: ", loocv_targets)
print("acc score: ", accuracy_score(loocv_targets, loocv_preds))
print("confusion matrix: \n", confusion_matrix(loocv_targets, loocv_preds))
print("confidence score for each image: ", F.softmax(output, 1))
  1. I am also interested in printing the confidence score for image for all the classes it belongs to. For example, image 1 could have 2 probabilities 87% belonging to class 1, and 13% belonging to class 2, and I want to have this information printed for all my images. However, using your suggestion, I was not sure how to do so. F.softmax(output, 1)

  2. From the results attached, it does not seem any learning is happening. Like it is stuck at ~50%. Do you know why and how that could be fixed?
    *result from running your code: https://pastebin.com/cpEv7juF
    *result from running the modified version of your code: https://pastebin.com/fiGNtfh3

To sum it up, I feel no learning is actually happening. Also, using enumerator(dataloader) to get batch_idx, passes the training item one by one to the train_model I do not feel this is correct (how can we train on only one sample?) hence I passed the entire 49 remaining samples of dataloader. Ideally, there should be a nice way to pass data in batches of like 10 or so samples at a time at each batch.

  1. It’s not expected and you are correct finding this issue! It looks like I’ve used the wrong code, as I first wanted to define train_model like a closure, i.e. basically training on one batch, but apparently haven’t cleaned up my code. Sorry about that.

  2. Your new code looks alright. I would just re-create the optimizer after resetting the model.

  3. output will contain the logits for each single test sample. So if you would like to print the probabilities for each class, you could use print('Class probabilities: {}'.format(F.softmax(output, 1))).

  4. Could you try to fix point 2 (re-creating the optimizer)? If the code still gets stuck, I’ll try to reproduce it.

1 Like

Thanks a lot for the guidance. So after recreating the optimizer, training is happening:

Entire code here: https://pastebin.com/kjxu9fHB I added two lines here:

for idx in range(nb_samples):
 
    print('Using sample {} as test data'.format(idx))
 
    print('Resetting model')
    model_ft = copy.deepcopy(initial_model)
    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
 
    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
 
    # Get all indices and remove test sample
    train_indices = list(range(len(image_datasets['train'])))
    del train_indices[idx]
Using sample 49 as test data
Resetting model
Epoch 0/24
----------
Loss: 1.0131 Acc: 0.5400
Epoch 1/24
----------
Loss: 1.2152 Acc: 0.4400
Epoch 2/24
----------
Loss: 1.1712 Acc: 0.5200
Epoch 3/24
----------
Loss: 2.2373 Acc: 0.5200
Epoch 4/24
----------
Loss: 0.8226 Acc: 0.6800
Epoch 5/24
----------
Loss: 0.9330 Acc: 0.4600
Epoch 6/24
----------
Loss: 0.6816 Acc: 0.6200
Epoch 7/24
----------
Loss: 0.2960 Acc: 0.8800
Epoch 8/24
----------
Loss: 0.2573 Acc: 0.9200
Epoch 9/24
----------
Loss: 0.2766 Acc: 0.9400
Epoch 10/24
----------
Loss: 0.2305 Acc: 0.9000
Epoch 11/24
----------
Loss: 0.1655 Acc: 0.9400
Epoch 12/24
----------
Loss: 0.1975 Acc: 0.9400
Epoch 13/24
----------
Loss: 0.2006 Acc: 0.9400
Epoch 14/24
----------
Loss: 0.1893 Acc: 0.9400
Epoch 15/24
----------
Loss: 0.1941 Acc: 0.9400
Epoch 16/24
----------
Loss: 0.1841 Acc: 0.9600
Epoch 17/24
----------
Loss: 0.2250 Acc: 0.9000
Epoch 18/24
----------
Loss: 0.2015 Acc: 0.9400
Epoch 19/24
----------
Loss: 0.1792 Acc: 0.9200
Epoch 20/24
----------
Loss: 0.1708 Acc: 0.9800
Epoch 21/24
----------
Loss: 0.1901 Acc: 0.9200
Epoch 22/24
----------
Loss: 0.1701 Acc: 0.9800
Epoch 23/24
----------
Loss: 0.1362 Acc: 0.9800
Epoch 24/24
----------
Loss: 0.1623 Acc: 0.9400
Training complete in 1m 7s
<class 'torch.Tensor'>
torch.Size([1, 3, 224, 224])
loocv preds:  [tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([1], device='cuda:0'), tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0')]
loocv targets:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
acc score:  0.54
confusion matrix: 
 [[23  2]
 [21  4]]

Also, for

print(‘Class probabilities: {}’.format(F.softmax(output, 1)))

I get:
Class probabilities: tensor([[0.9514, 0.0486]], device=‘cuda:0’, grad_fn=)
Which only shows 2 numbers, not like 50 (images) * 2 (classes) numbers.

Questions:

  1. Should I pass the entire dataloader to train_model?
  2. How can I use F.softmax(output, 1)) to show 50*2 numbers for probabilities.

Thanks a lot.

  1. In this case, yes, you should pass the entire DataLoader to train_model, since the sampler should make sure you are not sampling the test image.
  2. You could create a list before the LOOCV loop and store all probabilities in it additionally to the predictions:
loocv_probs = []
for idx in range(nb_samples):
    ...
    loocv_preds.append(pred)
    loocv_targets.append(test_target.item())
    loocv_probs.append(F.softmax(outputs, 1))

Thanks a lot, I decided to run the code in Jupyter notebook. I get the following error:

----------
Loss: 0.1329 Acc: 0.9200
Training complete in 0m 36s
<class 'torch.Tensor'>
torch.Size([1, 3, 224, 224])
Using sample 33 as test data
Resetting model
Epoch 0/24
----------

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-4-1ab8f96ffc60> in <module>()
     34         model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, sample, target, num_epochs=10)'''
     35 
---> 36     model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, dataloader, num_epochs=25)
     37 
     38     # Test on LOO sample

<ipython-input-2-305560153b80> in train_model(model, criterion, optimizer, scheduler, dataloader, num_epochs)
     29             # track history if only in train
     30             with torch.set_grad_enabled(True):
---> 31                 outputs = model(inputs)
     32                 _, preds = torch.max(outputs, 1)
     33                 loss = criterion(outputs, labels)

/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    475             result = self._slow_forward(*input, **kwargs)
    476         else:
--> 477             result = self.forward(*input, **kwargs)
    478         for hook in self._forward_hooks.values():
    479             hook_result = hook(self, input, result)

/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torchvision-0.2.1-py3.6.egg/torchvision/models/resnet.py in forward(self, x)
    144         x = self.layer1(x)
    145         x = self.layer2(x)
--> 146         x = self.layer3(x)
    147         x = self.layer4(x)
    148 

/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    475             result = self._slow_forward(*input, **kwargs)
    476         else:
--> 477             result = self.forward(*input, **kwargs)
    478         for hook in self._forward_hooks.values():
    479             hook_result = hook(self, input, result)

/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torch/nn/modules/container.py in forward(self, input)
     89     def forward(self, input):
     90         for module in self._modules.values():
---> 91             input = module(input)
     92         return input
     93 

/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    475             result = self._slow_forward(*input, **kwargs)
    476         else:
--> 477             result = self.forward(*input, **kwargs)
    478         for hook in self._forward_hooks.values():
    479             hook_result = hook(self, input, result)

/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torchvision-0.2.1-py3.6.egg/torchvision/models/resnet.py in forward(self, x)
     78         out = self.relu(out)
     79 
---> 80         out = self.conv2(out)
     81         out = self.bn2(out)
     82         out = self.relu(out)

/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    475             result = self._slow_forward(*input, **kwargs)
    476         else:
--> 477             result = self.forward(*input, **kwargs)
    478         for hook in self._forward_hooks.values():
    479             hook_result = hook(self, input, result)

/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torch/nn/modules/conv.py in forward(self, input)
    299     def forward(self, input):
    300         return F.conv2d(input, self.weight, self.bias, self.stride,
--> 301                         self.padding, self.dilation, self.groups)
    302 
    303 

RuntimeError: CUDA error: out of memory

I have two 1080Ti GPUs. Does it have anything to do with running it in Jupyter?

actually ran the code with Python script and get the same CUDA OUT OF MEMORY error.

----------
Loss: 0.1821 Acc: 0.9200
Training complete in 0m 37s
<class 'torch.Tensor'>
torch.Size([1, 3, 224, 224])
Using sample 34 as test data
Resetting model
Epoch 0/24
----------
Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x7ff5fc18cf60>>
Traceback (most recent call last):
  File "/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 399, in __del__
    self._shutdown_workers()
  File "/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 378, in _shutdown_workers
    self.worker_result_queue.get()
  File "/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/multiprocessing/queues.py", line 337, in get
    return _ForkingPickler.loads(res)
  File "/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 151, in rebuild_storage_fd
    fd = df.detach()
  File "/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 58, in detach
    return reduction.recv_handle(conn)
  File "/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/multiprocessing/reduction.py", line 182, in recv_handle
    return recvfds(s, 1)[0]
  File "/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/multiprocessing/reduction.py", line 153, in recvfds
    msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_LEN(bytes_size))
ConnectionResetError: [Errno 104] Connection reset by peer
Traceback (most recent call last):
  File "new_loocv.py", line 202, in <module>
    model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, dataloader, num_epochs=25)
  File "new_loocv.py", line 99, in train_model
    optimizer.step()
  File "/scratch/sjn-p3/anaconda/anaconda3/lib/python3.6/site-packages/torch/optim/sgd.py", line 97, in step
    buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
RuntimeError: CUDA error: out of memory

I am not sure if this answer directly related to this situation RuntimeError: cuda runtime error (2) : out of memory @ptrblck but how can we send data in batches of say size 7 or 10 or constant K in this scenario? I mean if the problem is related to batch size.

A note that I ran the same code last night and it was working. So I am not exactly sure why it is running out of cuda memory now. As you see below, output of $nvtop, I am only using 11% of the memory when running the code:

Edit: I figured what caused the problem: appending to the loocv_probs. After commenting it, I was able to do the training.

    print("probs: ", F.softmax(output, 1))
    #loocv_probs.append(F.softmax(output, 1))

Edit2:
The following fixed it:

    gpu_tensor_probs = F.softmax(output, 1)
    cpu_numpy_probs = gpu_tensor_probs.data.cpu().numpy()
    loocv_probs.append(cpu_numpy_probs)
1 Like

Yeah, I forgot to detach output from the computation graph, so that it was stored in the list, which caused the OOM error.